Scrapy Playwright点击并循环浏览虚拟Javascript页面

wkftcu5l  于 2022-11-09  发布在  Python
关注(0)|答案(1)|浏览(396)

我正在开发一个可以获取光学细节的scrapy机器人。我需要点击一个javascript按钮来显示一个虚拟页面,这样我的scrapy机器人就可以抓取光学细节。
这就是我需要 Playwright 点击的按钮,在截图中以红色矩形标出。Details tab highlighted in red
在某些页面上,已显示第一个项目详细信息页面。例如:
Virtual page details tab open
我可能需要为此写一个 if/else 判断?我本想处理这种情况,但目前还被前面的问题卡住。

import scrapy
from scrapy_playwright.page import PageCoroutine

class UpperSpider(scrapy.Spider):
    """Scrape optic product details from a Brownells product page.

    The "Details" tab on these pages is rendered by JavaScript, so the
    request is routed through scrapy-playwright and the tab is clicked
    inside the browser before the rendered HTML reaches ``parse``.
    """

    name = 'Optic'
    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        # Click every "#detailTab" node inside the Playwright page before
        # the response is returned.  querySelectorAll + click is a no-op on
        # pages whose virtual details page is already open (e.g. the ACO
        # red-dot page), so no if/else is needed for that case.
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                meta={
                    'playwright': True,
                    'playwright_page_coroutines': {
                        'clickallbtns': PageCoroutine(
                            'evaluate',
                            'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                    },
                },
            )

    def virtualpage(self, response, page):
        # NOTE(review): dead code kept only for reference — Scrapy never
        # calls this method, and ``response.css(...)`` yields Selector
        # objects which have no ``click`` method.  The tab click is now
        # performed via ``playwright_page_coroutines`` in start_requests.
        vpButton = response.css('div[id="wrap"]')
        for page in vpButton:
            page.click('#detailTab')

    def parse(self, response):
        """Yield one item per page-level "wrap" container.

        Product-scoped fields use the container selector; Brand/Name come
        from the page-level ``#listMain`` area and Reticle from the
        virtual details pane opened by the Playwright click.
        """
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }

好吧,我在尝试让这个爬虫跑起来。我很确定问题出在 start_requests 只遍历了 self.start_urls:也就是说它只告诉 Playwright 处理起始 URL。我如何让 Playwright 同样处理每个被爬取到的页面,使 "clickallbtns" 也能在那些页面上运行?

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine

class UpperSpider(CrawlSpider):
    """Crawl Brownells category listings and scrape each product page.

    Product links are followed via ``rule_product_detail``.  The defect in
    the original version was that the Playwright meta was attached only to
    the start URLs, so "clickallbtns" never ran on the crawled product
    pages.  ``Rule(process_request=...)`` is the hook that lets the same
    meta reach every request the rule generates.
    """

    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']

    le_item_details = LinkExtractor(restrict_css='.listing')

    # process_request (Scrapy >= 2.0 signature: (request, response)) is
    # called for every extracted link, so each product request is rendered
    # by Playwright too — not just the start URLs.
    rule_product_detail = Rule(le_item_details,
                               callback='parse_item',
                               follow=True,
                               process_request='use_playwright',
                               )
    rules = (
        rule_product_detail,
    )

    @staticmethod
    def _playwright_meta():
        # Shared Playwright meta: render in the browser and click every
        # "#detailTab" node.  Clicking is harmless on pages whose virtual
        # details page is already open.
        return {
            'playwright': True,
            'playwright_page_coroutines': {
                "clickallbtns": PageCoroutine(
                    "evaluate",
                    'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
            },
        }

    def start_requests(self):
        # Start URLs still go through Playwright so category pages that
        # need JS also render; CrawlSpider's default callback then applies
        # the rules to extract product links.
        for url in self.start_urls:
            yield scrapy.Request(url, meta=self._playwright_meta())

    def use_playwright(self, request, response):
        # Rule.process_request hook: attach the Playwright meta to every
        # crawled product link so "clickallbtns" runs on those pages too.
        request.meta.update(self._playwright_meta())
        return request

    def parse_item(self, response):
        """Yield one item per page-level "wrap" container."""
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
                }
alen0pnh

alen0pnh1#

您需要在请求 meta 的 playwright_page_coroutines 字典中包含单击逻辑,以便在返回响应之前单击按钮。
请看下面的示例代码。如果你已在 settings.py 中定义了 scrapy-playwright 的配置,就可以把 custom_settings 变量注释掉;否则,如果你从脚本运行它,下面的代码就足够了(使用 scrapy 2.6.1)。

import scrapy
from scrapy_playwright.page import PageCoroutine

class UpperSpider(scrapy.Spider):
    """Spider that clicks the JavaScript "Details" tab via Playwright
    before scraping the rendered product page."""

    name = 'Optic'

    # Comment this out if scrapy-playwright is already configured in
    # settings.py; it is only needed when running from a script.
    custom_settings = {
        'DOWNLOAD_HANDLERS': {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }

    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            # Ask Playwright to click every "#detailTab" node before
            # handing the rendered DOM back to Scrapy.
            page_coroutines = {
                "clickallbtns": PageCoroutine(
                    "evaluate",
                    'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
            }
            request_meta = {'playwright': True,
                            'playwright_page_coroutines': page_coroutines}
            yield scrapy.Request(url=url, meta=request_meta)

    def parse(self, response):
        # One item per page-level "wrap" container; Brand/Name are read
        # from the page-level #listMain area rather than the container.
        for block in response.css('div[id="wrap"]'):
            record = {
                'ProductName': block.css('span[itemprop="name"]::text').get(),
                'Stock': block.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': block.css('#priceContainer > span > p > span::text').get(),
                'Image': block.css('#lnkImgSku img::attr(src)').get(),
                'Battery': block.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': block.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': block.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': block.css('#detailWrap p:contains("Reticle")::text').get(),
            }
            yield record

相关问题