python Scrapy Splash使用CrawlSpider进行动态刮除

lx0bsm1f  于 2023-02-15  发布在  Python
关注(0)|答案(1)|浏览(141)

我试着从一个基于react的网站获取一些数据,但是当我使用CrawlSpider时,我无法解析其他页面。例如,我可以解析我的第一个带有splash的URL,其他URL将定期解析,不包含动态内容。
这是我的代码:

class PageSpider(CrawlSpider):
  host = 'hooshmandsazeh.com'
  protocol = 'https'
  root_domain = 'hooshmandsazeh.com'
  name = 'page'
  allowed_domains = [host]
  #start_urls = [f'{protocol}://{host}',]

  def start_requests(self):
        url = f'{self.protocol}://{self.host}'
        yield SplashRequest(url, dont_process_response=True, args={'wait':  1}, meta={'real_url': url})

  custom_settings = {
    #'DEPTH_LIMIT': 9,
    }

  rules = (
        # Rule(LinkExtractor(allow=('node_\d+\.htm',)), follow=True),
        Rule(LinkExtractor(allow=(host),deny=('\.webp', '\.js', '\.css', '\.jpg', '\.png'),unique=True),
                              callback='parse',
                              follow=True,
                              process_request='splash_request'
                              ),
    )

  def splash_request(self, request):
      request.meta['real_url'] = request.url
      print("Aliii",request.meta['real_url'])
      return request

  def _requests_to_follow(self, response):
      
      if not isinstance(response, HtmlResponse):
          return
      seen = set()
      newresponse = response.replace(url=response.meta.get('real_url'))
      for n, rule in enumerate(self._rules):
          links = [lnk for lnk in rule.link_extractor.extract_links(newresponse)
                  if lnk not in seen]
          
          if links and rule.process_links:
              links = rule.process_links(links)
          for link in links:
              seen.add(link)
              r = self._build_request(n, link)
              yield rule.process_request(r)
      

  def parse(self,response):
    if len(LinkExtractor(deny = self.host).extract_links(response)) > 0:
      loader = ItemLoader(item=PageLevelItem(), response=response)
      loader.add_value('page_source_url', response.url)
      yield loader.load_item()
r6l8ljro

r6l8ljro1#

检查以下代码对我有效:

def splash_request(self, request):
          # request = request.replace(url=RENDER_HTML_URL + request.url)
          request.meta['real_url'] = request.url
          return SplashRequest(request.meta['real_url'], dont_process_response=True, args={'wait':  0}, meta={'real_url': request.meta['real_url']})

相关问题