如何使用scrapy与爬虫模板和scrapy-splash来解析javascript

byqmnocz  于 2022-11-09  发布在  Java
关注(0)|答案(1)|浏览(146)

我尝试用Scrapy抓取Amazon的产品,但发现Amazon使用JavaScript来加载部分产品细节,所以我决定用Splash来渲染JavaScript。它在shell命令中工作得很好,但我不知道如何在我的代码中实现它。

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class AmazonCrawlerSpider(CrawlSpider):
    """Crawl an Amazon search listing and scrape each linked product page.

    Two rules drive the crawl: links matched by ``h2 > a`` are sent to
    :meth:`parse_item`, and the link matched by the pagination XPath is
    followed without a callback so the rules are applied to the next page.
    """

    name = 'amazon_Crawler'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A502394&ref=nav_em__nav_desktop_sa_intl_camera_and_photo_0_2_5_3']

    # Product links on the listing page -> parse_item, no further following.
    len_product_details = LinkExtractor(restrict_css='h2 > a')
    product_details = Rule(len_product_details,
                           callback='parse_item', follow=False)

    # Pagination link on the listing page -> followed, no callback.
    len_products_pagination = LinkExtractor(
        restrict_xpaths='//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[37]/div/div/span/a[3]')
    products_pagination = Rule(len_products_pagination, follow=True)
    rules = (
        product_details, products_pagination
    )

    def parse_item(self, response):
        """Extract one product record from a product page response.

        Every scalar field is read with ``.get()``, so a selector that
        matches nothing yields ``None`` for that field rather than raising.
        List-valued fields are filled by iterating the matching row/element
        selectors and querying each one with a *relative* path.

        Yields:
            dict: the scraped product data, including ``response.url``.
        """
        data = {
            "categorie_0": response.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul/li[1]/span/a/text()').get(),
            "categorie_1": response.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul/li[3]/span/a/text()').get(),
            "title": response.css('h1 > span ::text').get(),
            "price": response.xpath('//div[@id="corePrice_feature_div"]/div/span/span[1]//text()').get(),
            "amazon_globale_shiping": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[2]/td[3]/span/text()').get(),
            "estimated_import_fees_deposit": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[3]/td[3]/span/text()').get(),
            "total": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[5]/td[3]/span/text()').get(),
            "delevery_period": response.xpath('//*[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()').get(),
            "delevery_destination": response.xpath('//*[@id="contextualIngressPtLabel_deliveryShortLine"]/span[2]/text()').get(),
            "in_stock": response.xpath('//*[@id="availability"]/span/text()').get(),
            "quantity": "not_exist",
            "ship_from": response.xpath('//*[@id="tabular-buybox"]/div[1]/div[2]/div/span/text()').get(),
            "sold_by": {
                "name": response.xpath('//*[@id="sellerProfileTriggerId"]/text()').get(),
                'store_url': response.xpath('//*[@id="sellerProfileTriggerId"]/@href').get(),
                'packaging': response.xpath('//*[@id="tabular-buybox"]/div[1]/div[6]/div/span/text()').get()
            },
            "description": response.xpath('//*[@id="productDescription"]/p/text()').get(),
            # "brand": response.xpath('//*[@id="productOverview_feature_div"]/div/table/tbody/tr[1]/td[2]/span/text()').get(),
            "is_returned": response.xpath('//*[@id="productSupportAndReturnPolicy-return-policy-popover-celWidget"]/div/div[1]/text()').get(),
            "extra_info": [],
            "details": [],
            "about_this_item": [],
            "note": response.xpath('//*[@id="universal-product-alert"]/div/span[2]/text()').get(),
            "Q_AW": [],
            "Customer_reviews": {
                "customer_rate": response.xpath('//*[@id="reviewsMedley"]/div/div[1]/div[2]/div[1]/div/div[2]/div/span/span/text()').get(),
                "total_rate": response.xpath('//*[@id="reviewsMedley"]/div/div[1]/div[2]/div[2]/span/text()').get(),
                "global_rate": {
                    # Histogram rows are listed from 5 stars (tr[1]) down to 1 star (tr[5]).
                    "1_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[5]/td[3]/span[2]/a/text()').get(),
                    "2_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[4]/td[3]/span[2]/a/text()').get(),
                    "3_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[3]/td[3]/span[2]/a/text()').get(),
                    "4_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[2]/td[3]/span[2]/a/text()').get(),
                    "5_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[1]/td[3]/span[2]/a/text()').get(),
                },
                "rate_by_feature": [],
                "product_reviews": []
            },
            "url": response.url
        }

        for review in response.xpath('//*[@id="cm-cr-dp-review-list"]/div'):
            data["Customer_reviews"]["product_reviews"].append(
                {
                    # Relative path (no leading '/'): a leading slash would be
                    # resolved from the document root and never match.
                    "rate": review.xpath('div/div/div[2]/a/i/span/text()').get(),
                    "feature": review.xpath('div/div/div[2]/a[2]/span/text()').get(),
                    "date_from": review.xpath('div/div/span/text()').get(),
                    "verified": review.xpath('div/div/div[3]/span[2]/text()').get(),
                    "review": review.xpath('div/div/div[4]/span/div/div[1]/span/text()').get(),
                    'view_reaction': review.xpath('div/div/div[5]/span[1]/div[1]/span/text()').get()
                }
            )

        for feature_row in response.xpath('//*[@id="cr-summarization-attributes-list"]/div'):
            data["Customer_reviews"]["rate_by_feature"].append(
                {
                    "key": feature_row.xpath('div/div/div/div/span/text()').get(),
                    # Query the current row, not the whole response, so the
                    # value belongs to the same feature as the key.
                    "value": feature_row.xpath('div/div/div[2]/span[2]/text()').get()
                }
            )

        for qa_row in response.xpath('//*[@id="ask-btf-container"]/div/div/div[2]/span/div/div'):
            data["Q_AW"].append(
                {
                    "Question": qa_row.xpath('div/div[2]/div/div/div[2]/a/span/text()').get(),
                    "Answer": qa_row.xpath('div/div[2]/div[2]/div/div[2]/span/span[2]/text()').get(),
                    "vote": qa_row.xpath('div/div/ul/li[2]/span[1]/text()').get(),
                    "date_answer": qa_row.xpath('div/div[2]/div[2]/div/div[2]/span[3]/text()').get()
                }
            )

        for info_row in response.xpath('//*[@id="productDetails_detailBullets_sections1"]/tbody/tr'):
            data["extra_info"].append(
                {
                    "1": info_row.css('th::text').get(),
                    "2": info_row.css('td::text').get()
                }
            )

        # Bullets are keyed by their 1-based position in the list.
        for position, bullet in enumerate(response.xpath('//*[@id="feature-bullets"]/ul/li'), start=1):
            data["about_this_item"].append(
                {
                    position: bullet.xpath('span/text()').get(),
                }
            )

        for overview_row in response.xpath('//*[@id="productOverview_feature_div"]/div/table/tbody/tr'):
            data['details'].append(
                {
                    # Both cells use XPath: 'td[2]/span/text()' is XPath
                    # syntax and is not a valid CSS selector.
                    overview_row.xpath('td[1]/span/text()').get(): overview_row.xpath('td[2]/span/text()').get()
                }
            )

        yield data
lf5gs5x2

lf5gs5x21#

我认为你的代码在第20行有问题:你忘记定义正确的函数了,而且前面的循环也是未定义的,brojola!!

相关问题