selenium: Why doesn't pagination work with scrapy-selenium?

mqxuamgl · posted 2022-11-09 in Other

I am trying to scrape data with scrapy-selenium, but pagination is giving me trouble. I have tried my best with different selectors and approaches, but nothing changes: only the first page gets scraped. I have also looked at other solutions, but I still can't get it to work. Any expert advice would be appreciated.

Source: https://www.gumtree.com/property-for-sale/london

import time

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    def start_requests(self):
        # Drive a standalone Chrome instance to collect the detail-page links
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.set_window_size(1920, 1080)

        driver.get("https://www.gumtree.com/property-for-sale/london")
        time.sleep(2)

        property_xpath = driver.find_elements(By.XPATH, "(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")

        for detail in property_xpath:
            href = detail.get_attribute('href')
            time.sleep(2)
            # Hand each detail page to scrapy-selenium; parse() is the default callback
            yield SeleniumRequest(url=href)

        driver.quit()

    def parse(self, response):
        yield {
            'Title': response.xpath("//div[@class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
            'Price': response.xpath("//h3[@itemprop='price']/text()").get(),
            'Add Posted': response.xpath("//*[@id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
            'Links': response.url,
        }

        # The pagination bar holds the next page number as link text
        next_page = response.xpath("//li[@class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
        if next_page:
            abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
            yield SeleniumRequest(
                url=abs_url,
                wait_time=5,
                callback=self.parse,
            )
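For reference, the SeleniumRequests above assume scrapy-selenium is enabled in settings.py. A minimal configuration, following the setup from the scrapy-selenium README (the chromedriver path depends on the local install):

# settings.py -- minimal scrapy-selenium setup per the project README;
# the chromedriver location is an assumption about the local machine
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}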

rdrgkggo · answer 1#

Your code seems correct, but it is getting TCP/IP blocked. I also tried an alternative approach where the code is correct and pagination works. This style of pagination is twice as fast as the other, but it sometimes gives me strange results and sometimes gets IP-blocked.

import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest

class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    def start_requests(self):
        # Request the listing pages directly by page number instead of
        # following "next" links
        url = 'https://www.gumtree.com/property-for-sale/london/page{page}'
        for page in range(1, 6):
            yield SeleniumRequest(
                url=url.format(page=page),
                callback=self.parse,
                wait_time=5,
            )

    def parse(self, response):
        # Parse the rendered page source from the Selenium driver for the
        # current page only, so items are not yielded twice
        driver = response.meta['driver']
        page = Selector(text=driver.page_source)

        property_xpath = page.xpath("(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")

        for detail in property_xpath:
            yield {
                'Title': detail.xpath('.//*[@class="listing-title"]/text()').get().strip(),
                'Price': detail.xpath('.//*[@class="listing-price"]/strong/text()').get(),
                'Add Posted': detail.xpath('.//*[@class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
                'Links': response.url,
            }
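If the IP blocks keep happening, slowing the crawl down usually helps before reaching for proxies. A minimal sketch using Scrapy's built-in throttling settings (the values are assumptions to tune, not something I verified against Gumtree):

# settings.py -- throttle requests to reduce the chance of an IP block
DOWNLOAD_DELAY = 2               # seconds between requests to the same domain
CONCURRENT_REQUESTS_PER_DOMAIN = 1
AUTOTHROTTLE_ENABLED = True      # back off automatically when the server slows down
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10

Rotating user agents or adding a proxy pool goes further, but for five listing pages the delays alone are usually enough.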
