Scrapy 扩展中用 Python 连接(connect)的信号未被调用

x6492ojm  于 2022-11-09  发布在  Python
关注(0)|答案(1)|浏览(148)

我有下面的文件和代码

import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging:
    """Scrapy extension that logs spider open/close events and scrape progress.

    One message is logged when a spider opens, one when it closes, and one
    each time the number of scraped items reaches a multiple of
    ``item_count``.
    """

    def __init__(self, item_count):
        """Store the logging interval and reset the scraped-item counter.

        Args:
            item_count: Number of scraped items between two progress messages.
        """
        self.item_count = item_count
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension for *crawler* and subscribe it to its signals.

        Args:
            crawler: Object exposing ``settings.getint`` and
                ``signals.connect``.

        Returns:
            The extension instance, already connected to the
            ``spider_opened``, ``spider_closed`` and ``item_scraped`` signals.
        """
        print('Hey I am called')

        # NOTE: the MYEXT_ENABLED setting is not enforced here; raise
        # NotConfigured when it is false if the extension should be opt-in.

        # Logging interval, overridable from the project settings.
        item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # __init__ accepts only the interval. Passing extra positional
        # arguments raises TypeError before any handler below is connected.
        ext = cls(item_count)

        # Connect the extension object to the signals it reports on.
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        return ext

    def spider_opened(self, spider):
        """Log that *spider* has been opened."""
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        """Log that *spider* has been closed."""
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        """Count one scraped item and log progress every ``item_count`` items."""
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

我已经更改了设置

# Flag intended to switch the custom extension on.
MYEXT_ENABLED = True

# Extension classes to load, keyed by import path; the value is the
# integer assigned to that extension.
EXTENSIONS = {
    "project.custom_extension.SpiderOpenCloseLogging": 300,
}

但是没有任何信号被调用。我已经检查过设置中给出的路径,蜘蛛本身也能正常运行,
但我添加的 print 输出并没有出现在日志中。
有人能告诉我我漏掉了什么吗?
谢谢!

cl25kdpy

cl25kdpy1#

在我根据你的脚本改写的版本中,所有信号都能被正常调用。你的代码里有几处错误,其中一些写法我看不出用意,因为你没有说明具体想实现什么。这就是你没有收到信号、而是得到报错的原因:
存在以下几处错误:
1.

def __init__(self, item_count, stats):
        # Takes two arguments, matching the two-argument call
        # cls(crawler.settings, crawler.stats) made in from_crawler.
        self.item_count = item_count
        # Was `self.items_scraped = 0`; the attribute now holds the `stats`
        # argument instead of starting as a plain counter.
        self.items_scraped = stats
def item_scraped(self, item, spider):
        # The manual increment (`self.items_scraped += 1`) is left commented
        # out: counting by hand would make `crawler.stats` unnecessary.
        # The interval check (`if self.items_scraped % self.item_count == 0:`)
        # is commented out as well, so this snippet logs on every call; the
        # author suggests swapping its two operands.
        # NOTE(review): `items_scraped % item_count == 0` is true at every
        # multiple of item_count, whereas the swapped form is true only when
        # the running count divides item_count -- confirm which is intended.
            logger.info("scraped %d items", self.items_scraped)

# Additional note:
# --- self.item_count never receives the number you intended. Assigning
# item_count inside from_crawler has no effect, because the object you
# build and return is cls(crawler.settings, crawler.stats); so
# self.item_count takes crawler.settings rather than item_count, which
# leads to an error.

一. 更新之后,我们做了如下更正:

def __init__(self, item_count, stats):
        # The second parameter is only needed when crawler.stats should be
        # available to the object.
        # Both arguments are stored unchanged under these attribute names.
        self.item_count = item_count
        self.items_scraped = stats

二.

def spider_opened(self, spider):
        """Turn the stored stats/settings objects into plain numbers and log.

        On entry ``self.items_scraped`` must offer ``get_value`` and
        ``self.item_count`` must offer ``getint``; on exit both attributes
        hold the values read from them.
        """
        # Current 'item_scraped_count' stat, falling back to 0 when unset.
        already_scraped = self.items_scraped.get_value('item_scraped_count')
        self.items_scraped = 0 if already_scraped is None else already_scraped

        # Interval taken from the MYEXT_ITEMCOUNT setting (default 1000).
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)

        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

三.

def item_scraped(self, item, spider):
        """Count one scraped item and log progress.

        Logs the count seen before this item on every call, then logs an
        extra message each time the updated count is a multiple of
        ``self.item_count``.

        Args:
            item: The scraped item (unused).
            spider: The spider that produced the item (unused).
        """
        logger.info("scraped few %s items", self.items_scraped)
        self.items_scraped += 1
        # The running count is the left operand: count % interval is zero at
        # every multiple of the interval. With the operands swapped the
        # message fires only when the count divides the interval, and never
        # again once the count exceeds it. A zero interval disables the
        # message instead of dividing by zero.
        if self.item_count and self.items_scraped % self.item_count == 0:
            logger.info("scraped increments %s items", self.items_scraped)

以下是一个完整的例子:

import logging
from scrapy import signals
import scrapy

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging(scrapy.Spider):
    """Spider that subscribes to its own crawler's signals and logs progress.

    It crawls ten pages of quotes.toscrape.com, yields one item per matched
    ``div``, and logs when the spider opens, when it closes, on every scraped
    item, and additionally every ``MYEXT_ITEMCOUNT`` items.
    """

    name = 'log_signals'

    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]

    def __init__(self, item_count, stats, **kwargs):
        """Keep the crawler's settings and stats until the spider opens.

        Args:
            item_count: Despite the name, the settings object passed by
                ``from_crawler``; ``spider_opened`` replaces it with the
                integer interval read from it.
            stats: Stats collector passed by ``from_crawler``;
                ``spider_opened`` replaces it with the integer item count.
            **kwargs: Spider arguments, forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so the spider is set up like any other.
        super().__init__(**kwargs)
        self.item_count = item_count
        self.items_scraped = stats

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider for *crawler* and connect its signal handlers.

        Args:
            crawler: The crawler that will run this spider.
            *args: Extra positional spider arguments.
            **kwargs: Extra keyword spider arguments.

        Returns:
            The spider instance, connected to the ``spider_opened``,
            ``spider_closed`` and ``item_scraped`` signals.
        """
        # Delegate construction to the base class so the crawler is bound to
        # the spider, while still handing settings and stats to __init__.
        ext = super().from_crawler(
            crawler, crawler.settings, crawler.stats, *args, **kwargs
        )

        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        return ext

    def spider_opened(self, spider):
        """Resolve the counters from stats/settings and log the opening."""
        # Start from the collector's current count, or 0 when it has none.
        self.items_scraped = self.items_scraped.get_value('item_scraped_count')
        if self.items_scraped is None:
            self.items_scraped = 0
        # Interval between "increments" messages, from the project settings.
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        """Log that *spider* has been closed."""
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        """Count one scraped item and log progress."""
        logger.info("scraped few %s items", self.items_scraped)
        self.items_scraped += 1
        # count % interval is zero at every multiple of the interval; with
        # the operands swapped the message fires only when the count divides
        # the interval. A zero interval disables the message.
        if self.item_count and self.items_scraped % self.item_count == 0:
            logger.info("scraped increments %s items", self.items_scraped)

    def start_requests(self):
        """Yield one request per entry of ``start_urls``, parsed by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        """Yield one item per matched ``div`` holding its first link target."""
        content = response.xpath('//div[@class = "row"]//div')
        for node in content:
            yield {
                'some_items_links': node.xpath(".//a//@href").get()
            }

输出:

.
.
.
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/author/C-S-Lewis'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/christianity/page/1/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/love/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/tag/truth/page/1/'}
...

相关问题