Scrapy Spider:无法执行回调

yvfmudvl  于 12个月前  发布在  其他
关注(0)|答案(1)|浏览(140)

我在试着刮github repo
我想在每个repo的level1中提取所有XML文件URL,并且在最好的情况下也从XML文件中提取信息。

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

repo_rule = Rule(
    LinkExtractor(
        restrict_xpaths="//a[@itemprop='name codeRepository']",
        restrict_text=r"ELTeC-.+"
    )
)

pagination_rule = Rule(
    LinkExtractor(restrict_xpaths="//a[@class='next_page']")
)

level_rule = Rule(
    LinkExtractor(allow=r"/level1"),
    follow=True,
    callback="parse_level"
)

class ELTecSpider(CrawlSpider):
    """Scrapy CrawlSpider for crawling the ELTec repo."""

    name = "eltec"
    start_urls = ["https://github.com/orgs/COST-ELTeC/repositories"]

    rules = [
        repo_rule,
        pagination_rule,
        level_rule,
    ]

    def parse_level(self, response):
        print("INFO: ", response.url)


process = CrawlerProcess(
    settings={
        "FEEDS": {
            "items.json": {
                "format": "json",
                "overwrite": True
            },
        },
    }
)

process.crawl(ELTecSpider)
process.start()

字符串
上面提取了所有level1文件夹的响应,但不知何故,我在这一点上卡住了。我的计划是使用回调函数来访问每个level1 URL,如下所示:

def parse_level(self, response):
    yield scrapy.Request(response.url, callback=self.parse_docs)

def parse_docs(self, response):
    docs_urls = response.xpath("//a[@class='Link--primary']")

    for url in docs_urls:
        print("INFO: ", url)


但很明显回调函数从来没有触发过。
我做错了什么?

rwqw0loc

rwqw0loc1#

scrapy记住访问过的页面,并跳过再次抓取相同的url
这样就不会浪费时间再次获取相同的页面,而且还可以防止爬行循环。
当你运行scrapy.Request(response.url, ...)时,你试图再次抓取相同的urlscrapy跳过了它。
如果你真的需要刮同一页再次那么你可能需要

Request(..., dont_filter=True)

字符串
(Doc:scrapy.http.Request)
我宁愿直接跑

yield self.parse_docs(response)


parse_doc()内部还有一个问题。xpath找不到任何元素-所以for-loop不会运行任何print()。您应该在parse_doc()的开头添加额外的print()以查看它是何时执行的。
xpath可能找不到class='Link--primary',因为此页面使用JavaScript添加元素。这可能需要使用Selenium和模块scrapy-selenium来控制可以运行JavaScript的真实的Web浏览器。scrapy也有Splashscrapy-splashJavaScript一起工作。
(Doc:Selecting dynamically-loaded content
也许GitHub有一些API可以在不刮取的情况下获取信息。

编辑:

完整的工作代码,使用scrapy-seleniumSelenium 3
它不适用于Selenium 4,因为scrapy-selenium自2020年以来没有更新,并且它不适用于最新的Selenium 4

pip install scrapy-selenium
pip install 'selenium<4'
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

#from shutil import which   # to run `which('firefox')` or `which('chrome')` in settings

repo_rule = Rule(
    LinkExtractor(
        restrict_xpaths="//a[@itemprop='name codeRepository']",
        restrict_text=r"ELTeC-.+"
    )
)

pagination_rule = Rule(
    LinkExtractor(restrict_xpaths="//a[@class='next_page']")
)

level_rule = Rule(
    LinkExtractor(allow=r"/level1"),
    follow=True,
    callback="parse_level"
)

class ELTecSpider(CrawlSpider):
    """Scrapy CrawlSpider for crawling the ELTec repo."""

    name = "eltec"
    start_urls = [
        "https://github.com/orgs/COST-ELTeC/repositories", 
        "https://github.com/COST-ELTeC/ELTeC-lit/tree/master/level1"
    ]

    rules = [
        repo_rule,
        pagination_rule,
        level_rule,
    ]

    def parse_level(self, response):
        print("\n>>> PARSE LEVEL:", response.url)
        #yield scrapy.Request(response.url, callback=self.parse_docs, dont_filter=True)
        yield SeleniumRequest(url=response.url, callback=self.parse_docs, dont_filter=True, 
                wait_time=10,
                #wait_until=EC.element_to_be_clickable((By.CLASS_NAME, 'Link--primary'))
                wait_until=EC.presence_of_element_located((By.CLASS_NAME, 'Link--primary'))
                )
        
    def parse_docs(self, response):
        print("\n>>> PARSE DOC:", response.url)
        
        docs_urls = response.selector.xpath("//a[@class='Link--primary']")
        #print("\n>>> LEN:", len(docs_urls))
        
        for url in docs_urls:
            text = url.xpath('.//text()').get()
            href = url.xpath('.//@href').get()
            #print("\n>>> INFO:", href, text)
            yield {"text": text, "url": href}
            
            
process = CrawlerProcess(
    settings={
        "FEEDS": {
            "items.json": {
                "format": "json",
                "overwrite": True
            },
        },
        
        'SELENIUM_DRIVER_NAME': 'firefox',  # or 'chrome'
        'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/geckodriver',  # or which('geckodriver'), which('chromedrive')
        'SELENIUM_DRIVER_ARGUMENTS': ['-headless'],  # '--headless' if using `chrome` instead of `firefox`
        #'SELENIUM_DRIVER_ARGUMENTS': [],  # needs at least empty list

        'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800},    
    }   
)

process.crawl(ELTecSpider)
process.start()

的字符串
来源:items.json

[
{"text": "LIT00001_seinius_kuprelis.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00001_seinius_kuprelis.xml"},
{"text": "LIT00001_seinius_kuprelis.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00001_seinius_kuprelis.xml"},
{"text": "LIT00002_pietaris_algimantas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00002_pietaris_algimantas.xml"},
{"text": "LIT00002_pietaris_algimantas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00002_pietaris_algimantas.xml"},
{"text": "LIT00004_dobilas_bludas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00004_dobilas_bludas.xml"},
{"text": "LIT00004_dobilas_bludas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00004_dobilas_bludas.xml"},
{"text": "LIT00005_daukantas_zemaiciu_pasakos.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00005_daukantas_zemaiciu_pasakos.xml"},
{"text": "LIT00005_daukantas_zemaiciu_pasakos.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00005_daukantas_zemaiciu_pasakos.xml"},
{"text": "LIT00006_kudirka_virsininkai.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00006_kudirka_virsininkai.xml"},

相关问题