使用Scrapy从TripAdvisor进行网页抓取时无法提取

8gsdolmq  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(180)

我试着从那些评论巴厘岛酒店的人那里提取所有的评论,这应该是我正在遵循的途径
1.首先定义星星url https://www.tripadvisor.com.pe/Hotels-g294226-Bali-Hotels.html
1.从第一个链接https://www.tripadvisor.com.pe/Hotels-g294226-oa30-Bali-Hotels.html相关的所有酒店的页面刮(每一个页面更改-oaXX-)
1.从酒店https://www.tripadvisor.com.pe/Hotel_Review-g3404940-d1532277-Reviews-Munduk_Moding_Plantation_Nature_Resort_Spa-Gobleg_Banjar_Buleleng_Regency_Bali.html的每个页面中获取每个酒店的评论
1.从每个酒店www.example.com的意见页面上抓取https://www.tripadvisor.com.pe/Hotel_Review-g3404940-d1532277-Reviews-or5-Munduk_Moding_Plantation_Nature_Resort_Spa-Gobleg_Banjar_Buleleng_Regency_Bali.html#REVIEWS(每个页面更改-或X-)5.访问每个用户的个人资料,该用户查看并提取了该用户的所有意见https://www.tripadvisor.com.pe/Profile/AntonioPozo89?fid=6b049a76-cdf0-4534-a276-3b124769df03

from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.loader.processors import MapCompose
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader

class Opinion(Item):
    titulo = Field()
    calificacion = Field()
    contenido = Field()
    autor = Field()

class TripAdvisor(CrawlSpider):
    name = "OpinionesTripAdvisor"
    custom_settings = {
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        'CLOSESPIDER_PAGECOUNT':100
    }
    download_delay = 1
    allowed_domains = ['tripadvisor.com']
    start_urls = ["https://www.tripadvisor.com.pe/Hotels-g294226-Bali-Hotels.html"]

    rules = (
        #Page by hotels Horixontal
        Rule(
            LinkExtractor(
                allow=r'-oa\d+-'
            ), follow=True
        ),

        #Hotel Details Vertical, restric only the links that are in the names
        Rule(
            LinkExtractor(
                allow=r'/Hotel_Review-',
                restrict_xpaths=['//div[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_ad_density_control_0"]//a[data-clicksource="HotelName"]']
            ), follow=True
        ),

        #Page by opinions in the below part
        Rule(
            LinkExtractor(
                allow=r'/-or\d+-/'
            ), follow=True

        ),

        #Detail by profile
        Rule(
            LinkExtractor(
                allow=r'/profile/',
                #restrict_xpaths=['//div[@data-test-target="HR_CC_CARD"]//a[contains(@class, "ui_header")'] #a[@class="ui_header_link uyyBf"]
            ), follow=True, callback='parse_opinion'
        )
    )

    #obtain review from tag ui_bubble_rating bubble_50
    def obtenerCalif(self, texto):
        calificacion = texto.split("_")[-1]
        return calificacion

    #parsing the callback, iteration the review
    def parse_opinion(self, response):
        sel = Selector(response)
        opiniones = sel.xpath('//div[@id="content"]/div/div')
        autor = sel.xpath('//h1/text()').get() #get the user by the last page

        for opinion in opiniones:
            item = ItemLoader(Opinion(), opinion)
            item.add_value('autor',autor)
            item.add_xpath('titulo','//div[class="AzIrY b _a VrCoN"]/text()')
            item.add_xpath('contenido', './/q/text()')
            item.add_xpath('calificacion', './/div[@class="muQub VrCoN"]/span/@class',
                           MapCompose(self.obtenerCalif))
            yield item.load_item()

脚本运行,但文件为空,我不确定是什么问题

13z8s7eq

13z8s7eq1#

我在允许的域中做了一个错误,我还添加了一个新的项目来获取配置文件的网址,并检查是否刮取的信息是正确的

from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.loader.processors import MapCompose
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess

class Opinion(Item):
    titulo = Field()
    calificacion = Field()
    contenido = Field()
    autor = Field()
    profilel = Field()

class TripAdvisor(CrawlSpider):
    name = "OpinionesTripAdvisor"
    custom_settings = {
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        'CLOSESPIDER_PAGECOUNT':35
    }
    download_delay = 1
    allowed_domains = ['tripadvisor.com.pe'] #its necesary set .pe
    start_urls = ["https://www.tripadvisor.com.pe/Hotels-g294226-Bali-Hotels.html"]
    rules = (
        #Page by hotels Horizontal
        Rule(
            LinkExtractor(
                allow=r'-oa\d+'
            ), follow=True
        ),

        #Hotel Details Vertical, restric only the links that are in the names
        Rule(
            LinkExtractor(
                allow=r'/Hotel_Review-',
                # restrict_xpaths=['//div[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_ad_density_control_0"]//a[data-clicksource="HotelName"]']
            ), follow=True
        ),
        #Page by opinions in the below part
        Rule(
            LinkExtractor(
                allow=r'/-or\d+-/'
            ), follow=True

        ),

        #Detail by profile
        Rule(
            LinkExtractor(
                allow=r'/Profile/',
                #restrict_xpaths=['//div[@data-test-target="HR_CC_CARD"]//a[contains(@class, "ui_header")'] #a[@class="ui_header_link uyyBf"]
            ), follow=True, callback='parse_item'
        )
    )

    #obtain review from tag ui_bubble_rating bubble_50
    def obtenerCalif(self, texto):
        calificacion = texto.split("_")[-1]
        return calificacion

    #parsing the callback, iteration the review
    def parse_item(self, response):
        sel = Selector(response)
        opiniones = sel.xpath('//div[@id="content"]/div/div')
        autor = sel.xpath('//h1/span/text()').get() #get the user by the last page

        for opinion in opiniones:
            item = ItemLoader(Opinion(), opinion)
            item.add_value('autor',autor)
            item.add_xpath('titulo','//div[@class="AzIrY b _a VrCoN"]/text()')
            item.add_xpath('contenido', './/q/text()')
            item.add_xpath('calificacion', './/div[@class="muQub VrCoN"]/span/@class',
                           MapCompose(self.obtenerCalif))
            item.add_xpath('profilel','//link[@rel="canonical"]/@href')

            yield item.load_item()

相关问题