我试着从一个基于react的网站获取一些数据,但是当我使用CrawlSpider时,我无法解析其他页面。例如,我可以解析我的第一个带有splash的URL,其他URL将定期解析,不包含动态内容。
这是我的代码:
class PageSpider(CrawlSpider):
host = 'hooshmandsazeh.com'
protocol = 'https'
root_domain = 'hooshmandsazeh.com'
name = 'page'
allowed_domains = [host]
#start_urls = [f'{protocol}://{host}',]
def start_requests(self):
url = f'{self.protocol}://{self.host}'
yield SplashRequest(url, dont_process_response=True, args={'wait': 1}, meta={'real_url': url})
custom_settings = {
#'DEPTH_LIMIT': 9,
}
rules = (
# Rule(LinkExtractor(allow=('node_\d+\.htm',)), follow=True),
Rule(LinkExtractor(allow=(host),deny=('\.webp', '\.js', '\.css', '\.jpg', '\.png'),unique=True),
callback='parse',
follow=True,
process_request='splash_request'
),
)
def splash_request(self, request):
request.meta['real_url'] = request.url
print("Aliii",request.meta['real_url'])
return request
def _requests_to_follow(self, response):
if not isinstance(response, HtmlResponse):
return
seen = set()
newresponse = response.replace(url=response.meta.get('real_url'))
for n, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(newresponse)
if lnk not in seen]
if links and rule.process_links:
links = rule.process_links(links)
for link in links:
seen.add(link)
r = self._build_request(n, link)
yield rule.process_request(r)
def parse(self,response):
if len(LinkExtractor(deny = self.host).extract_links(response)) > 0:
loader = ItemLoader(item=PageLevelItem(), response=response)
loader.add_value('page_source_url', response.url)
yield loader.load_item()
1条答案
按热度按时间r6l8ljro1#
检查以下代码对我有效: