问题:当我使用 Puppeteer 进行网页抓取时,如果正在抓取的对象缺少图像,抓取器会把最近的一个对象写入数据库,而不是忽略该对象并继续处理下一个带有图像的对象。
问题:我如何从网站上抓取这些对象,同时忽略那些没有 companyImage 标签的对象?
/**
 * Logs in to Glassdoor, opens one page of London internship search results
 * and extracts a summary record for every job card found on it.
 *
 * @param {object} page - Puppeteer page used for all navigation and input.
 * @returns {Promise<Array<{companyName: string, url: string, datePosted: string}>>}
 *   One record per job card that has a link. `companyName` holds the text of
 *   the card's job-link element. Resolves to an empty array when any step
 *   fails, so callers can always iterate the result.
 */
async function scrapeListings(page) {
  const BASE_URL = "https://www.glassdoor.co.uk";

  try {
    // Open the home page and bring up the sign-in form.
    await page.goto("https://www.glassdoor.co.uk/index.htm");
    await page.click(
      "#SiteNav > nav > div.d-lg-none.d-flex.align-items-center.justify-content-between.px-std.py-xsm.px-md-lg.py-md-std.LockedHomeHeaderStyles__bottomBorder.LockedHomeHeaderStyles__fullWidth > div.d-flex.justify-content-center.order-1.order-md-2.LockedHomeHeaderStyles__flexibleContainer > button",
      { delay: 200 }
    );

    // NOTE(review): the credentials are masked placeholders in this file;
    // presumably real values should be injected from configuration.
    await page.type("#userEmail", "*******", { delay: 200 });
    await page.type("#userPassword", "*******", { delay: 200 });

    // Register the navigation wait BEFORE the click that triggers it;
    // awaiting the click first can miss the navigation and stall until timeout.
    await Promise.all([
      page.waitForNavigation(),
      page.click(".mt-std.d-flex.flex-column.align-items-center", {
        delay: 200,
      }),
    ]);

    // NOTE(review): the `IP4` segment looks like the results page number
    // (an earlier draft templated it as `IP${index}`) - confirm before paginating.
    await page.goto(
      "https://www.glassdoor.co.uk/Job/london-internship-jobs-SRCH_IL.0,6_IC2671300_KO7,17_IP4.htm?fromAge=30"
    );

    // Parse a static snapshot of the rendered results with cheerio.
    const html = await page.content();
    const $ = cheerio.load(html);

    return $("[data-test='jobListing']")
      .map((_index, element) => {
        const card = $(element);
        const titleElement = card.find(".css-l2wjgv.e1n63ojh0.jobLink");
        const href = titleElement.attr("href");

        // Without a link the detail page cannot be visited; returning null
        // makes cheerio's map() drop the card instead of emitting a
        // "...co.ukundefined" URL.
        if (href === undefined) {
          return null;
        }

        return {
          companyName: titleElement.text(),
          url: `${BASE_URL}${href}`,
          datePosted: card.find("[data-test='job-age']").text(),
        };
      })
      .get();
  } catch (erro) {
    console.error(erro);
    // Keep the return type stable so downstream loops become a no-op.
    return [];
  }
}
/**
 * Visits the detail page of every listing, enriches the listing object in
 * place with the scraped fields and saves it through `GlassdoorDB`.
 *
 * A listing is skipped (left unmodified and not saved) when its page has no
 * company image, or when the image has an empty `src`. A failure on one
 * listing is logged and does not stop the remaining listings.
 *
 * @param {Array<object>} listings - Records carrying at least a `url`;
 *   mutated in place. Anything that is not an array is treated as "nothing
 *   to do".
 * @param {object} page - Puppeteer page used for navigation.
 * @returns {Promise<void>}
 */
async function scrapeJobDescriptions(listings, page) {
  const BASE_URL = "https://www.glassdoor.co.uk";
  const COMPANY_IMAGE_SELECTOR = ".css-13u5hxa.epu0oo22 img";
  const APPLY_LINK_SELECTOR = ".css-0.e1h54cx80 a";

  if (!Array.isArray(listings)) {
    return;
  }

  for (const listing of listings) {
    try {
      // Inside the try so one unreachable page does not abort the whole run.
      await page.goto(listing.url);

      // $$eval resolves even when nothing matches (the callback receives an
      // empty array), so a missing image is an ordinary branch rather than
      // an exception.
      const companyImage = await page.$$eval(COMPANY_IMAGE_SELECTOR, (imgs) =>
        imgs.length > 0 ? imgs[0].src : ""
      );
      if (!companyImage) {
        // No usable image: ignore this listing and move on to the next one.
        continue;
      }

      const html = await page.content();
      const $ = cheerio.load(html);
      const applyPart = $(APPLY_LINK_SELECTOR).attr("data-job-url");

      listing.jobDescription = $("#JobDescriptionContainer").html();
      listing.location = $(
        ".css-f4rs18.css-1e169oc.efy8art2 > div > div > div:nth-child(3)"
      ).text();
      listing.jobSalary = $(
        ".css-1v5elnn.e11nt52q2 .small.css-10zcshf.e1v3ed7e1"
      ).text();
      listing.jobPosition = $(".css-17x2pwl").text();
      listing.applyPart = applyPart;
      listing.companyImage = companyImage;
      console.log(listing.jobDescription);

      // Follow the apply redirect only when the page exposes one; otherwise
      // the URL would be "...co.ukundefined".
      if (applyPart !== undefined) {
        try {
          await page.goto(`${BASE_URL}${applyPart}`, {
            waitUntil: "networkidle0",
          });
          // The URL the browser ended up on after the navigation settled.
          listing.applyLink = page.url();
        } catch (err) {
          // Best effort: the listing is still saved without an applyLink.
          console.error(err);
        }
      }

      const listingModel = new GlassdoorDB(listing);
      await listingModel.save();
      await sleep(1000); // 1 second pause between listings
    } catch (err) {
      console.error(err);
    }
  }
}
暂无答案!
目前还没有任何答案,快来回答吧!