Scrapy - only crawls the first URL in the URL list

Asked by sczxawaw on 2023-10-20

I'm collecting restaurant reviews for Rome, Milan and Bergamo. Each city has its own listing URL with 30 or more restaurants. The spider starts crawling the Rome restaurants but never switches to the other cities: it correctly scrapes all of Rome's restaurants and reviews, and then the spider closes.
The Rome restaurants are crawled concurrently, and I expected the start URLs to behave the same way, yet only the first one is ever requested.

class ReviewSpider2(scrapy.Spider):

    name= 'reviews2'

    def start_requests(self):
        urls = [
        'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
        'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
        'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        for url in urls:
            yield scrapy.Request(url, callback = self.parse_restaurants)
        
    def parse_restaurants(self, response):    
        all_restaurants = list(set(response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").extract()))
        for restaurant in all_restaurants:
            url = 'https://www.tripadvisor.it' + restaurant
            yield response.follow(url, callback = self.parse_restaurant)

    def parse_restaurant(self, response):
        all_reviews_containers = response.xpath('//div[@class="rev_wrap ui_columns is-multiline"]/div[2]')
        if all_reviews_containers is not None:
            for review_container in all_reviews_containers:
                items = ReviewscraperItem()
                items['restaurant_name'] = response.css('.HjBfq::text').extract_first()
                items['rating'] = 0
                rating_classes = {
                    'ui_bubble_rating bubble_50': 5,
                    'ui_bubble_rating bubble_40': 4,
                    'ui_bubble_rating bubble_30': 3,
                    'ui_bubble_rating bubble_20': 2,
                    'ui_bubble_rating bubble_10': 1
                }
                rating_class = review_container.css('span::attr(class)').extract_first()
                items['rating'] = rating_classes.get(rating_class)
                items['quote'] = review_container.css('.noQuotes::text').extract_first()
                items['address'] = response.xpath("//span/span/a[@class='AYHFM']/text()").extract_first()
                items['review'] = review_container.css('.partial_entry::text').extract_first()
                yield items
            #check if the next page button is disabled (there are no pages left)
            if response.xpath('//a[@class = "nav next ui_button primary disabled"]').extract_first() is None:
                next_page = 'https://www.tripadvisor.it' + response.xpath('//a[@class = "nav next ui_button primary"]/@href').extract_first()
                yield response.follow(url=next_page, callback = self.parse_restaurant)

btxsgosb1#

You're missing some commas; see the comments in the code below:

import scrapy

class ReviewSpider2(scrapy.Spider):
    name = 'reviews2'
    allowed_domains = ['tripadvisor.it']
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "DNT": "1",
        "Host": "www.tripadvisor.it",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }

    def start_requests(self):
        # missing commas:
        # urls = [
        #     'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
        #     'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
        #     'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        # ]
        urls = [
            'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html',
            'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html',
            'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        # example: ['1' '2' '3'] evaluates to ['123'] (adjacent string literals are concatenated)
        for url in urls:
            # use headers
            yield scrapy.Request(url, callback=self.parse_restaurants, headers=self.headers)

    def parse_restaurants(self, response):
        # unnecessary because Scrapy has a built in duplicate filter:
        # all_restaurants = list(set(response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").extract()))
        all_restaurants = response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").getall()

        for restaurant in all_restaurants:
            url = 'https://www.tripadvisor.it' + restaurant
            print(url)
            # yield response.follow(url, callback = self.parse_restaurant)
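
To see why the missing commas make the spider stop after Rome, here is a minimal standalone sketch (plain Python, independent of Scrapy) of implicit string-literal concatenation: adjacent string literals are joined at compile time, so the list ends up holding a single malformed URL instead of three.

# Minimal demonstration of implicit string-literal concatenation,
# the reason the original start URL list contained only one element.
urls_missing_commas = [
    'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
    'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
    'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
]

print(len(urls_missing_commas))     # 1 -- the three literals were fused into one string
print(urls_missing_commas[0][:70])  # begins with the Rome URL, with the other URLs glued onto the end

With the commas added, as in the corrected start_requests above, the list has three elements and Scrapy schedules one request per city; you can verify this from the print(url) output in parse_restaurants when running scrapy crawl reviews2.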
