Puppeteer 多次返回最近的对象

ckocjqey  于 2021-09-13  发布在  Java
关注(0)|答案(0)|浏览(248)

问题:当我使用 Puppeteer 进行网页抓取时,如果正在抓取的对象缺少图像,抓取器会把最近的对象再次输出到数据库,而不是忽略该对象并继续处理下一个带有图像的对象。
问题:我如何从网站上抓取这些对象,同时忽略那些没有 companyImage 的对象?

/**
 * Logs in to Glassdoor, opens one page of London internship search results
 * and extracts a summary of every job card found on it.
 *
 * @param {object} page - Object driving the browser (presumably a Puppeteer
 *   `Page` — confirm against the caller). Only `goto`, `click`, `type`,
 *   `waitForNavigation` and `content` are used here.
 * @returns {Promise<Array<{companyName: string, url: string, datePosted: string}>>}
 *   One entry per job card that carries a link. Resolves to an empty array
 *   (after logging the error) when login, navigation or parsing fails, so the
 *   result can always be iterated.
 */
async function scrapeListings(page) {
  const baseUrl = "https://www.glassdoor.co.uk";

  try {
    // Open the home page and the sign-in form.
    await page.goto(`${baseUrl}/index.htm`);
    await page.click(
      "#SiteNav > nav > div.d-lg-none.d-flex.align-items-center.justify-content-between.px-std.py-xsm.px-md-lg.py-md-std.LockedHomeHeaderStyles__bottomBorder.LockedHomeHeaderStyles__fullWidth > div.d-flex.justify-content-center.order-1.order-md-2.LockedHomeHeaderStyles__flexibleContainer > button",
      { delay: 200 }
    );

    // NOTE(review): credentials are hard-coded (redacted here); consider
    // reading them from configuration instead.
    await page.type("#userEmail", "*******", { delay: 200 });
    await page.type("#userPassword", "*******", { delay: 200 });

    // Start waiting for the navigation BEFORE the click that (presumably)
    // submits the form; waiting afterwards can miss a navigation that has
    // already finished and then hang until the timeout.
    await Promise.all([
      page.waitForNavigation(),
      page.click(".mt-std.d-flex.flex-column.align-items-center", {
        delay: 200,
      }),
    ]);

    await page.goto(
      "https://www.glassdoor.co.uk/Job/london-internship-jobs-SRCH_IL.0,6_IC2671300_KO7,17_IP4.htm?fromAge=30"
    );

    // Parse a static snapshot of the rendered results page.
    const html = await page.content();
    const $ = cheerio.load(html);

    return $("[data-test='jobListing']")
      .map((_index, element) => {
        const jobLink = $(element).find(".css-l2wjgv.e1n63ojh0.jobLink");
        const href = jobLink.attr("href");

        // A card without a link cannot be visited later; returning null makes
        // cheerio's map() leave it out instead of producing "...co.ukundefined".
        if (href === undefined) {
          return null;
        }

        return {
          // The key is kept as `companyName` for compatibility, although the
          // value is the text of the job link element.
          companyName: jobLink.text(),
          url: `${baseUrl}${href}`,
          datePosted: $(element).find("[data-test='job-age']").text(),
        };
      })
      .get();
  } catch (err) {
    console.error(err);
    return [];
  }
}

/**
 * Visits every listing's detail page, adds the scraped details to the listing
 * object and saves it through `GlassdoorDB`.
 *
 * Listings whose page shows no company image (no matching `<img>`, or one
 * with an empty `src`) are skipped: they are neither modified nor saved.
 * A failure while handling one listing is logged and does not stop the
 * remaining listings from being processed.
 *
 * @param {Array<object>} listings - Listing objects with a `url` property;
 *   each processed entry is mutated in place (`jobDescription`, `location`,
 *   `jobSalary`, `jobPosition`, `applyPart`, `companyImage`, and `applyLink`
 *   when the apply redirect could be followed).
 * @param {object} page - Object driving the browser (presumably a Puppeteer
 *   `Page` — confirm against the caller).
 * @returns {Promise<void>}
 */
async function scrapeJobDescriptions(listings, page) {
  const baseUrl = "https://www.glassdoor.co.uk";

  for (const listing of listings) {
    try {
      // Navigation is inside the try so that one unreachable page does not
      // abort the whole run.
      await page.goto(listing.url);

      // $$eval hands the callback an empty array when nothing matches, so a
      // missing image is an ordinary condition rather than an exception.
      const imageSources = await page.$$eval(
        ".css-13u5hxa.epu0oo22 img",
        (imgs) => imgs.map((img) => img.src)
      );
      const companyImage = imageSources[0];
      if (companyImage === undefined || companyImage === "") {
        continue;
      }

      const html = await page.content();
      const $ = cheerio.load(html);

      const applyPart = $(".css-0.e1h54cx80 a").attr("data-job-url");

      listing.jobDescription = $("#JobDescriptionContainer").html();
      listing.location = $(
        ".css-f4rs18.css-1e169oc.efy8art2 > div > div > div:nth-child(3)"
      ).text();
      listing.jobSalary = $(
        ".css-1v5elnn.e11nt52q2 .small.css-10zcshf.e1v3ed7e1"
      ).text();
      listing.jobPosition = $(".css-17x2pwl").text();
      listing.applyPart = applyPart;
      listing.companyImage = companyImage;

      console.log(listing.jobDescription);

      // Follow the apply redirect only when the page actually provides one;
      // otherwise the target would be the bogus ".../co.ukundefined".
      if (applyPart !== undefined) {
        try {
          await page.goto(`${baseUrl}${applyPart}`, {
            waitUntil: "networkidle0",
          });
          // The address the browser ends up on is the real application link.
          listing.applyLink = await page.url();
        } catch (err) {
          // Best effort: the listing is still saved without `applyLink`.
          console.error(err);
        }
      }

      const listingModel = new GlassdoorDB(listing);
      await listingModel.save();
      await sleep(1000); // pause 1 second between listings
    } catch (err) {
      console.error(err);
    }
  }
}

暂无答案!

目前还没有任何答案,快来回答吧!

相关问题