我无法使用Scrapy移至下一页

sy5wg1nm  于 2022-11-23  发布在  其他
关注(0)|答案(1)|浏览(206)

我正在尝试在reclameaqui站点(www.reclameaqui.com.br)上进行网页抓取。代码已经完成,但我无法在lista-reclamacoes/?pagina=1、lista-reclamacoes/?pagina=2、lista-reclamacoes/?pagina=3上进行迭代并获取其内容:

'''
This spider file contains the spider logic and scraping code. 
In order to determine what needs to go in this file, we have to inspect the website!
'''

import scrapy
from scrapy.spiders import CrawlSpider

from complaintscraper.items.ComplaintItem import ComplaintItem
from complaintscraper.utils.DataCleaning import DataCleaning

import json
import pydash

class ComplaintScraper(CrawlSpider):
  """Spider that scrapes Santander complaints from reclameaqui.com.br.

  Each listing URL in ``start_urls`` is fetched, every complaint link found
  on it is followed, and one ``ComplaintItem`` is yielded per complaint page.

  NOTE(review): the reported symptom is that all three listing URLs yield the
  same complaints. Presumably the ``pagina`` query parameter is applied
  client-side, so the server returns identical HTML for each URL - verify,
  and consider querying a JSON endpoint of the site instead.
  """

  name = "ComplaintScraper"
  custom_settings = {
    'CONCURRENT_REQUESTS': 10,
    # This key used to appear twice in the literal (1, then 10). Python keeps
    # the last value of a duplicated key, so 10 was always the effective
    # setting; the dead entry has been removed.
    'CONCURRENT_REQUESTS_PER_DOMAIN': 10,
    'DOWNLOAD_DELAY': 1
  }

  start_urls = [
    "https://www.reclameaqui.com.br/empresa/santander/lista-reclamacoes/?pagina=1",
    "https://www.reclameaqui.com.br/empresa/santander/lista-reclamacoes/?pagina=2",
    "https://www.reclameaqui.com.br/empresa/santander/lista-reclamacoes/?pagina=3"
  ]

  def start_requests(self):
    """Yield one request per listing URL, handled by ``parse_complaint``."""
    for url in self.start_urls:
      yield scrapy.Request(url, self.parse_complaint, dont_filter=True)

  def parse_complaint(self, response):
    """Follow every complaint link found on a listing page.

    Rows without an ``<a href>`` child are skipped.
    """
    for row in response.xpath('//div[contains(@class,"bJdtis")]'):
      link = row.xpath("./a/@href").get()
      if not link:
        # urljoin() with an empty/None URL returns the page's own URL, which
        # would re-fetch the listing and parse it as a complaint page.
        continue
      yield scrapy.Request(response.urljoin(link), callback=self.parse_model_complaint, dont_filter=True)

  def get_data(self, data, query):
    """Return the value at dotted path ``query`` in ``data``, or None if absent."""
    return pydash.get(data, query, None)

  def _extract_tags(self, text_nodes):
    """Return the text nodes between the "ID:" value and the status label.

    Returns an empty list when either marker is missing, instead of letting
    ``list.index`` raise ``ValueError`` and lose the whole item.
    """
    try:
      # "+ 2" skips the "ID:" label and the node that follows it.
      start = text_nodes.index("ID:") + 2
      end = text_nodes.index("Status da reclamação:")
    except ValueError:
      return []
    return text_nodes[start:end]

  def parse_model_complaint(self, response):
    """Build a ``ComplaintItem`` from a single complaint page.

    Most fields come from the JSON embedded in the ``__NEXT_DATA__`` element;
    tags and status are read from the rendered HTML. Pages without a
    ``__NEXT_DATA__`` payload are logged and skipped.
    """
    raw_payload = response.xpath('//*[@id="__NEXT_DATA__"]//text()').get()
    if raw_payload is None:
      self.logger.warning("No __NEXT_DATA__ payload on %s; skipping", response.url)
      return

    data = json.loads(raw_payload)
    complaintItem = ComplaintItem()

    complaintItem['id'] = self.get_data(data, "props.pageProps.complaint.legacyId")
    complaintItem['title'] = self.get_data(data, "props.pageProps.complaint.title")
    complaintItem['solved'] = self.get_data(data, "props.pageProps.complaint.solved")
    complaintItem['description'] = self.get_data(data, "props.pageProps.complaint.description")
    complaintItem['url'] = response.url

    content_container = response.xpath('//*[contains(@data-testid, "complaint-content-container")]//text()').extract()
    complaintItem['tags'] = self._extract_tags(content_container)
    complaintItem['status'] = response.xpath('//*[contains(@data-testid, "complaint-status")]//text()').extract()

    complaintItem['userCity'] = self.get_data(data, "props.pageProps.complaint.userCity")
    complaintItem['userState'] = self.get_data(data, "props.pageProps.complaint.userState")
    complaintItem['creation_date'] = self.get_data(data, "props.pageProps.complaint.created")

    # Original note: interactions contain customer and company replies
    # (no interactions field is extracted here).
    complaintItem['deal_again'] = self.get_data(data, "props.pageProps.complaint.dealAgain")
    complaintItem['score'] = self.get_data(data, "props.pageProps.complaint.score")

    yield complaintItem

我得到的是三份重复(或相似)的 lista-reclamacoes/?pagina=1 的内容,而预期结果应该是 lista-reclamacoes/?pagina=1、lista-reclamacoes/?pagina=2、lista-reclamacoes/?pagina=3 各自的全部内容。

我哪里错了?

zf2sa74q

zf2sa74q1#

您可以尝试下面的示例。它不再抓取 HTML 列表页,而是直接请求该站点的 JSON 搜索接口,并通过 URL 路径中以 10 为步长递增的数值(0、10、……、90)来翻页:

import scrapy
from ..items import ComplaintItem
import json

class ComplaintScraper(scrapy.Spider):
    """Spider that reads complaints from the reclameaqui JSON search endpoint.

    Ten URLs are requested, differing only in the path segment that runs from
    0 to 90 in steps of 10. Presumably the preceding ``10`` is the page size
    and the varying segment is the result offset - confirm against the API.
    """

    name = "complaintScraper"

    start_urls = [
        f"https://iosearch.reclameaqui.com.br/raichu-io-site-search-v1/query/companyComplains/10/{item}?company=98"
        for item in range(0, 100, 10)
    ]

    def start_requests(self):
        """Yield one request per API URL, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, dont_filter=True)

    def parse(self, response):
        """Yield one ``ComplaintItem`` per entry in the JSON response.

        Entries are read from ``complainResult.complains.data``; a missing key
        raises ``KeyError``.
        """
        data = json.loads(response.text)

        # A fresh item is built for every card. The extra instance that used
        # to be created before the loop was never used and has been removed.
        for card in data['complainResult']['complains']['data']:
            complaintItem = ComplaintItem()
            complaintItem['id'] = card['id']
            complaintItem['title'] = card['title']
            complaintItem['description'] = card['description']
            # This is the URL of the API request, not of the complaint page.
            complaintItem['url'] = response.url
            yield complaintItem

相关问题