我在搜集罗马,米兰和贝加莫的餐厅评论。每一个城市都有一个包含30家或更多餐厅的专用网址。刮刀开始爬行罗马的餐馆,但从来没有切换到其他城市。它正确地刮所有的餐馆和评论从罗马,但随后蜘蛛关闭。
罗马的餐馆是同时被抓取的,我希望起始网址也有同样的行为,但只考虑第一个
class ReviewSpider2(scrapy.Spider):
name= 'reviews2'
def start_requests(self):
urls = [
'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
]
for url in urls:
yield scrapy.Request(url, callback = self.parse_restaurants)
def parse_restaurants(self, response):
all_restaurants = list(set(response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").extract()))
for restaurant in all_restaurants:
url = 'https://www.tripadvisor.it' + restaurant
yield response.follow(url, callback = self.parse_restaurant)
def parse_restaurant(self, response):
all_reviews_containers = response.xpath('//div[@class="rev_wrap ui_columns is-multiline"]/div[2]')
if all_reviews_containers is not None:
for review_container in all_reviews_containers:
items = ReviewscraperItem()
items['restaurant_name'] = response.css('.HjBfq::text').extract_first()
items['rating'] = 0
rating_classes = {
'ui_bubble_rating bubble_50': 5,
'ui_bubble_rating bubble_40': 4,
'ui_bubble_rating bubble_30': 3,
'ui_bubble_rating bubble_20': 2,
'ui_bubble_rating bubble_10': 1
}
rating_class = review_container.css('span::attr(class)').extract_first()
items['rating'] = rating_classes.get(rating_class)
items['quote'] = review_container.css('.noQuotes::text').extract_first()
items['address'] = response.xpath("//span/span/a[@class='AYHFM']/text()").extract_first()
items['review'] = review_container.css('.partial_entry::text').extract_first()
yield items
#check if the next page button is disabled (there are no pages left)
if response.xpath('//a[@class = "nav next ui_button primary disabled"]').extract_first() is None:
next_page = 'https://www.tripadvisor.it' + response.xpath('//a[@class = "nav next ui_button primary"]/@href').extract_first()
yield response.follow(url=next_page, callback = self.parse_restaurant)
1条答案
按热度按时间btxsgosb1#
你少了一些逗号,请看评论: