Scrapy未跟随下一页

ymdaylpp  于 2023-05-17  发布在  其他
关注(0)|答案(1)|浏览(221)

我在这个问题上已经卡了很长时间,但我尝试的一切都不起作用。我的目标很简单:从一个招聘网站中提取数据。每个列表页显示20个职位。我通过 Scrapy 回调提取每条招聘信息的数据,这部分或多或少是可行的。问题是无论我怎么尝试,Scrapy 都不会跳转到下一页。我最初尝试了 Scrapy 加 Selenium,不起作用;现在我只用 Scrapy 并按照教程操作,但它仍然只从第1页的前20条职位信息中提取数据。
重要提示:“下一页”按钮在不同页面中的位置会发生变化,也就是说它的 XPath/CSS 选择器会变化。我尝试了 CSS 的 :nth-last-child 和 XPath 的 last()-1,但没有得到满意的结果。更麻烦的是,在这个位置可变的元素之后,才是真正带有链接的 a 标签。
代码如下:

import scrapy
from random import randint
from time import sleep

class WorkpoolJobsSpider(scrapy.Spider):
    """Crawl job advertisements from workpool-jobs.ch.

    Each listing page links to ~20 job ads.  ``parse`` queues every ad for
    ``parse_dir_contents`` and then follows the "next page" link until the
    pagination runs out.
    """

    name = "getdata"
    allowed_domains = ["workpool-jobs.ch"]
    start_urls = ["https://www.workpool-jobs.ch/recht-jobs"]

    # The original code called wait(randint(5, 10)) — a NameError (only
    # `sleep` was imported), and a blocking sleep inside a callback would
    # freeze the whole Twisted reactor anyway.  Let Scrapy throttle the
    # requests instead: with RANDOMIZE_DOWNLOAD_DELAY the effective delay
    # is a random 0.5x-1.5x of DOWNLOAD_DELAY, roughly the intended 5-10 s.
    custom_settings = {
        "DOWNLOAD_DELAY": 7.5,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def parse(self, response):
        """Queue every job-ad detail page, then follow the next listing page."""
        for joboffer in response.xpath(
            "//p[@class='inserattitel h2 mt-0']/a/@href"
        ):
            yield scrapy.Request(
                response.urljoin(joboffer.get()),
                callback=self.parse_dir_contents,
            )

        # Bug fix: the original ".//li[@class='page-item'][last()-1]/../@href"
        # stepped up to the parent <ul>, which carries no @href, so next_page
        # was always None and the spider never left page 1.  The "next"
        # button is the second-to-last page-item regardless of how many
        # numbered items precede it, so take that item's <a>/@href.
        next_page = response.xpath(
            "(//li[@class='page-item'])[last()-1]/a/@href"
        ).get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_dir_contents(self, response):
        """Extract the structured fields of a single job advertisement.

        Yields one dict per ad container.  Fields that may repeat (lists of
        text nodes) use ``getall``; single-valued fields use ``get``.
        """
        for info in response.xpath(".//*[@class='col-12 col-md mr-md-3 mr-xl-5']"):
            facts = (
                ".//article/div[@class='row bg-hellstblau']"
                "/div[@class='col-12 col-sm-6 col-lg-5']/dl"
            )
            yield {
                'Titel': info.xpath(
                    ".//article/h1[@class='inserattitel']/text()").get(),
                'Berufsfelder': info.xpath(
                    ".//article/div[@class='border-top-grau']/p/text()").getall(),
                'Arbeitspensum': info.xpath(facts + "/dd[1]/text()").get(),
                'Anstellungsverhältnis': info.xpath(facts + "/dd[2]/text()").get(),
                'Arbeitsort': info.xpath(facts + "/dd[4]/a/text()").getall(),
                'VerfügbarAb': info.xpath(facts + "/dd[5]/text()").getall(),
                'Kompetenzenqualifikation': info.xpath(
                    ".//article/div[@class='row bg-hellstblau']"
                    "/div[@class='col-12 col-sm-6 col-lg-7']/dl[2]/dd/text()"
                ).get(),
                'Aufgabengebiet': info.xpath(
                    ".//article/div[@class='border-bottom-grau'][1]"
                    "//*[self::p or self::li]").getall(),
                'Erwartungen': info.xpath(
                    ".//article/div[@class='border-bottom-grau'][2]"
                    "/ul/li[descendant-or-self::text()]").getall(),
                'WirBietenIhnen': info.xpath(
                    ".//article/div[@class='border-bottom-grau'][3]"
                    "/ul/li[descendant-or-self::text()]").getall(),
                'Publikationsdatum': info.xpath(
                    ".//article/footer[@class='inseratfooter']"
                    "/p[1]/strong/text()").get(),
            }

任何帮助是如此之多的赞赏!

o7jaxewo

o7jaxewo1#

在弗拉斯的一些提示下,我终于成功地让我的代码工作了。如果将来有人遇到同样的问题,也许我下面的代码也能帮助你:

import scrapy
from random import randint
from time import sleep

class WorkpoolJobsSpider(scrapy.Spider):
    """Crawl job ads from workpool-jobs.ch, paging via an explicit counter.

    The site's "next" button has no stable selector, so instead of scraping
    its href this spider builds "?seite=N" URLs with a class-level page
    counter (page 1 is covered by ``start_urls``).
    """

    name = "getdata"
    # Next listing page to request; incremented after each listing page.
    page_number = 2
    # Last listing page to fetch.  NOTE(review): hard-coded — presumably the
    # pagination depth observed at the time of writing; confirm periodically.
    MAX_PAGE = 27
    allowed_domains = ["workpool-jobs.ch"]
    start_urls = ["https://www.workpool-jobs.ch/recht-jobs"]

    # The original sleep(randint(5, 10)) inside the callback blocked the
    # whole Twisted reactor, stalling every in-flight request.  Scrapy's own
    # randomized delay (0.5x-1.5x of DOWNLOAD_DELAY) gives the intended
    # 5-10 s pacing without blocking.
    custom_settings = {
        "DOWNLOAD_DELAY": 7.5,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def parse(self, response):
        """Queue every job-ad detail page, then request the next listing page."""
        for joboffer in response.xpath(
            "//p[@class='inserattitel h2 mt-0']/a/@href"
        ):
            yield scrapy.Request(
                response.urljoin(joboffer.get()),
                callback=self.parse_dir_contents,
            )

        if WorkpoolJobsSpider.page_number < self.MAX_PAGE:
            next_page = (
                "https://www.workpool-jobs.ch/recht-jobs?seite="
                + str(WorkpoolJobsSpider.page_number)
            )
            WorkpoolJobsSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)

    def parse_dir_contents(self, response):
        """Extract the structured fields of a single job advertisement.

        Yields one dict per ad container; repeatable fields use ``getall``,
        single-valued fields use ``get``.
        """
        for info in response.xpath(".//*[@class='col-12 col-md mr-md-3 mr-xl-5']"):
            facts = (
                ".//article/div[@class='row bg-hellstblau']"
                "/div[@class='col-12 col-sm-6 col-lg-5']/dl"
            )
            yield {
                'Titel': info.xpath(
                    ".//article/h1[@class='inserattitel']/text()").get(),
                'Berufsfelder': info.xpath(
                    ".//article/div[@class='border-top-grau']/p/text()").getall(),
                'Arbeitspensum': info.xpath(facts + "/dd[1]/text()").get(),
                'Anstellungsverhältnis': info.xpath(facts + "/dd[2]/text()").get(),
                'Arbeitsort': info.xpath(facts + "/dd[4]/a/text()").getall(),
                'VerfügbarAb': info.xpath(facts + "/dd[5]/text()").getall(),
                'Kompetenzenqualifikation': info.xpath(
                    ".//article/div[@class='row bg-hellstblau']"
                    "/div[@class='col-12 col-sm-6 col-lg-7']/dl[2]/dd/text()"
                ).get(),
                'Aufgabengebiet': info.xpath(
                    ".//article/div[@class='border-bottom-grau'][1]"
                    "//*[self::p or self::li]").getall(),
                'Erwartungen': info.xpath(
                    ".//article/div[@class='border-bottom-grau'][2]"
                    "/ul/li[descendant-or-self::text()]").getall(),
                'WirBietenIhnen': info.xpath(
                    ".//article/div[@class='border-bottom-grau'][3]"
                    "/ul/li[descendant-or-self::text()]").getall(),
                'Publikationsdatum': info.xpath(
                    ".//article/footer[@class='inseratfooter']"
                    "/p[1]/strong/text()").get(),
            }

相关问题