NodeJS 使用puppeteer抓取nba.com页面

6yt4nkrj  于 2023-04-20  发布在  Node.js
关注(0)|答案(2)|浏览(116)

我试图从https://www.nba.com/stats/teams/isolation?PerMode=Totals&TypeGrouping=offensive中抓取一个表,但总是得到空数组。我想用类Crom_body__UYOcU抓取tbody的每个tr中的每个td
我使用nodejs和puppeteer,下面是我的代码

const puppeteer = require('puppeteer');

/**
 * Launches a browser with Puppeteer, opens the NBA isolation stats page and
 * prints the trimmed text of every <td> of every <tr> found under
 * `tbody.Crom_body__UYOcU`, as an array of arrays (one inner array per row).
 *
 * The rows are read as soon as `page.goto()` resolves; nothing here waits
 * for the table element to appear in the DOM.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.nba.com/stats/teams/isolation?PerMode=Totals&TypeGrouping=offensive');

    const data = await page.evaluate(() => {
      const rows = Array.from(document.querySelectorAll('tbody.Crom_body__UYOcU tr'));

      return rows.map((row) => {
        const tds = Array.from(row.querySelectorAll('td'));
        return tds.map(td => td.textContent.trim());
      });
    });

    console.log(data); // an array of arrays containing the text content of each cell in each row
  } finally {
    // Always release the browser, even when navigation or evaluation throws;
    // otherwise the launched browser process is left running.
    await browser.close();
  }
})();

谢谢你的帮助

c3frrgcw

c3frrgcw1#

使用散列类名抓取HTML可能比使用站点自己的API脆弱得多。
您不需要puppeteer来获取该表中的数据-您甚至不需要抓取它:它是由从以下地址(我在dev tools network panel中找到的)获取的未经身份验证的数据填充的:

https://stats.nba.com/stats/synergyplaytypes?LeagueID=00&PerMode=Totals&PlayType=Isolation&PlayerOrTeam=T&SeasonType=Regular%20Season&SeasonYear=2022-23&TypeGrouping=offensive

看起来服务器只需要适当的Referer header就可以获得带有JSON主体的响应,所以你可以简单地以这种方式请求数据:
example.mjs

// Request the JSON that backs the stats table directly from the stats API.
// A Referer header is sent because the endpoint appears to require one.
const response = await fetch(
  "https://stats.nba.com/stats/synergyplaytypes?LeagueID=00&PerMode=Totals&PlayType=Isolation&PlayerOrTeam=T&SeasonType=Regular%20Season&SeasonYear=2022-23&TypeGrouping=offensive",
  { headers: new Headers([["Referer", "https://www.nba.com/"]]) },
);

// Fail with the HTTP status instead of letting response.json() throw a
// confusing parse error on a non-JSON error page.
if (!response.ok) {
  throw new Error(`Request failed: ${response.status} ${response.statusText}`);
}

const data = await response.json();

// The actual data:
const resultSet = data.resultSets[0];

// Rows are positional arrays, so look up the column position by header name.
const teamNameIndex = resultSet.headers.indexOf("TEAM_NAME");
if (teamNameIndex === -1) {
  // Without this guard every lookup below would silently yield undefined.
  throw new Error('Column "TEAM_NAME" not found in result set headers');
}
const teamNames = resultSet.rowSet.map((array) => array[teamNameIndex]);

console.log("first five teams:", teamNames.slice(0, 5));

console.log("entire payload:", JSON.stringify(data, null, 2));

在终端:

% node --version  
v18.16.0

% node example.mjs
first five teams: [
  'Dallas Mavericks',
  'Philadelphia 76ers',
  'Brooklyn Nets',
  'New York Knicks',
  'Oklahoma City Thunder'
]
entire payload: {
  "resource": "synergyplaytype",
  "parameters": {
    "LeagueID": "00",
    "SeasonYear": "2022-23",
    "SeasonType": "Regular Season",
    "PerMode": "Totals",
    "PlayerOrTeam": "T",
    "PlayType": "Isolation",
    "TypeGrouping": "offensive"
  },
  "resultSets": [
    {
      "name": "SynergyPlayType",
      "headers": [
        "SEASON_ID",
        "TEAM_ID",
        "TEAM_ABBREVIATION",
        "TEAM_NAME",
        "PLAY_TYPE",
        "TYPE_GROUPING",
        "PERCENTILE",
        "GP",
        "POSS_PCT",
        "PPP",
        "FG_PCT",
        "FT_POSS_PCT",
        "TOV_POSS_PCT",
        "SF_POSS_PCT",
        "PLUSONE_POSS_PCT",
        "SCORE_POSS_PCT",
        "EFG_PCT",
        "POSS",
        "PTS",
        "FGM",
        "FGA",
        "FGMX"
      ],
      "rowSet": [
        [
          "22022",
          1610612742,
          "DAL",
          "Dallas Mavericks",
          "Isolation",
          "Offensive",
          0.828,
          82,
          0.122,
          1.018,
          0.423,
          0.169,
          0.07,
          0.15,
          0.031,
          0.466,
          0.485,
          1064,
          1083,
          356,
          842,
          486
        ],
        [
          "22022",
          1610612755,
          "PHI",
          "Philadelphia 76ers",
          "Isolation",
          "Offensive",
          ---snip---
        ],
        ---snip---
      ]
    }
  ]
}
ttygqcqt

ttygqcqt2#

我看到两个问题:
1.如果您在headless模式下使用默认的用户代理,该网站会把您当作机器人并加以屏蔽。
2.表数据是在页面加载之后动态添加的,因此您需要使用类似page.waitForSelector的调用来等待它出现。
代码如下:

const puppeteer = require("puppeteer"); // ^19.7.2

/**
 * Opens the NBA isolation stats page with a desktop Chrome user agent, waits
 * until rows matching the table selector exist, then logs the trimmed text
 * of every <td>, grouped per matched row. Errors are logged, and the browser
 * is closed afterwards if it was launched.
 */
const main = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();
    const url =
      "https://www.nba.com/stats/teams/isolation?PerMode=Totals&TypeGrouping=offensive";
    const userAgent =
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
    await page.setUserAgent(userAgent);
    await page.goto(url, {waitUntil: "domcontentloaded"});

    // The same selector is used both to wait for the rows and to read them.
    const rowSelector = ".nba-stats-content-block tr";
    await page.waitForSelector(rowSelector);
    const data = await page.$$eval(rowSelector, trs =>
      trs.map(tr =>
        Array.from(tr.querySelectorAll("td"), td => td.textContent.trim())
      )
    );
    console.log(data);
  } catch (err) {
    console.error(err);
  } finally {
    // `browser` is still undefined if launch() itself failed.
    await browser?.close();
  }
};

main();

请注意,我使用的选择器看起来更稳定,并且末尾没有随机字符。如果页面上只有一个表,直接使用普通的"tr"和"td"选择器可能更好。
如果你想在你的行上添加标题,试着在const data = ...之后添加这个:

// ...
  // Collect the trimmed text of every <th> inside the stats block.
  const headers = await page.$$eval(
    ".nba-stats-content-block th",
    ths => ths.map(th => th.textContent.trim())
  );
  // Pair each header with the cell at the same index, turning every row
  // array into an object keyed by header text.
  const result = data.map(cells => {
    const pairs = headers.map((title, col) => [title, cells[col]]);
    return Object.fromEntries(pairs);
  });
  console.log(result);
// ...

要阻止不必要的请求以提高速度,您可以使用:

// ...
  // With interception enabled, each request is explicitly continued or
  // aborted by the handler below.
  await page.setRequestInterception(true);
  // URL prefixes that are allowed to load; every other request is aborted.
  const allowed = [url, "https://www.nba.com/_next", "https://stats.nba.com/"];
  page.on("request", request => {
    const target = request.url();
    const isAllowed = allowed.some(prefix => target.startsWith(prefix));
    if (!isAllowed) {
      request.abort();
      return;
    }
    request.continue();
  });
  await page.goto(url, {waitUntil: "domcontentloaded"});
// ...

如果检查响应,可以看到包含表数据的API调用:
https://stats.nba.com/stats/synergyplaytypes?LeagueID=00&PerMode=Totals&PlayType=Isolation&PlayerOrTeam=T&SeasonType=Regular%20Season&SeasonYear=2022-23&TypeGrouping=offensive
您可以拦截该响应,而不是从DOM中抓取数据:

// ...
  // Matches any response whose originating request went to stats.nba.com.
  const isStatsApiResponse = res => {
    const requestUrl = res.request().url();
    return requestUrl.startsWith("https://stats.nba.com/");
  };
  // The wait is registered before navigating and awaited only afterwards.
  const pendingResponse = page.waitForResponse(isStatsApiResponse);
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const response = await pendingResponse;
  const data = await response.json();
  console.log(data.resultSets[0].rowSet);
// ...

您可以选择使用上面相同的"zip"逻辑将标题映射到各行。
不过,事实上以上所有内容都只是出于演示和教学目的。在这种情况下,API URL是不受保护的,我们只需发送一个普通的HTTP请求就可以访问它,而不需要Puppeteer,正如this answer很好地说明的那样。
请参阅this tutorial,了解如何找到这样的未受保护的端点。请注意,并不总是可以访问它们,因此您通常需要使用本文中的策略。

相关问题