Python:Scrapy 中出现 500 内部服务器错误,并引发 twisted.internet.error.ReactorNotRestartable

s4n0splo  于 2023-02-15  发布在  Python
关注(0)|答案(1)|浏览(3819)

我是 Scrapy 新手,运行下面这段代码时出现了如下错误,并且没有获取到任何数据:
500 内部服务器错误(500 Internal Server Error);Scrapy 中执行到 raise error.ReactorNotRestartable() 时抛出 twisted.internet.error.ReactorNotRestartable
我也使用了CrawlerRunner而不是CrawlerProcess,但再次出现错误
这个代码从一些网址获取数据...每个数据都有一个特定的id,对于每个网址,代码应该停止在特定的id,并开始从下一个网址获取数据...

import logging
from builtins import Exception
from typing import Generator, Optional
import json
import scrapy
from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
import redis
from scrapy.exceptions import CloseSpider
import sys

class TestSpider(Spider):
    """Scrape the messages of one channel, paging backwards through "prev" links.

    ``start_urls``, ``channel_username`` and ``msg_id`` are class attributes
    that the launching code assigns before the crawl is scheduled.
    """

    name = "test"
    # Channel header data; filled in by parse() from the first response.
    channel_info = None
    # Expected to equal the page's username once an "@" prefix is added.
    channel_username = ""
    start_urls = []
    # Id of the message at which the crawl stops (converted with int()).
    msg_id = ""

    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        """Yield article dicts for one page, then request the previous page.

        Args:
            response: Page containing ``.ee_widget_message_wrap`` elements.
            current_page: Only used to build the Playwright context name.
                NOTE(review): the follow-up request below does not pass it,
                so it appears to stay None -- confirm.

        Yields:
            Article dicts from parse_article(), followed by at most one
            request for the page referenced by the "prev" link.

        Raises:
            CloseSpider: propagated from parse_article() at the stop id.
        """
        if not self.channel_info:
            self.channel_info = self.parse_channel_data(response)
            expected_username = f"@{self.channel_username}"
            if self.channel_info['username'] != expected_username:
                # Reported once per crawl; the stop id is applied regardless.
                self.logger.warning(
                    "Page username %r differs from the expected %r",
                    self.channel_info['username'],
                    expected_username,
                )

        for article in response.css('.ee_widget_message_wrap'):
            try:
                result = self.parse_article(article)
            except CloseSpider:
                # A stop request must not be mistaken for a broken article.
                raise
            except Exception:
                # Best effort: skip a malformed article, but leave a trace
                # in the log instead of swallowing the error silently.
                self.logger.exception("Skipping an article that could not be parsed")
                continue
            if result is None:
                # parse_article() signals "past the stop id": end this page
                # without following the "prev" link.
                return
            yield result

        # .get() returns None when the page has no "prev" link (or the link
        # has no href), where indexing with [0] raised IndexError.
        if prev := response.xpath("//link[contains(@rel, 'prev')]/@href").get():
            yield response.follow(
                prev,
                callback=self.parse,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": f"page-{current_page}",
                },
            )

    @staticmethod
    def parse_channel_data(response: Response) -> dict:
        """Extract the channel header fields from a page.

        Returns:
            Dict with the keys ``title``, ``username``, ``participants_count``,
            ``pictures_count``, ``videos_count``, ``files_count``,
            ``print_name`` and ``url``.

        Raises:
            IndexError: if one of the indexed elements is missing.
            KeyError: if the download link has no ``href`` attribute.
        """
        channel_info = {}
        channel_data_selector = response.xpath(".//div[contains(@class, 'ee_channel_info')]")
        channel_info['title'] = channel_data_selector.xpath(
            ".//div[contains(@class, 'ee_channel_info_header_title')]//text()"
        ).getall()[1]
        channel_info['username'] = channel_data_selector.xpath(
            ".//div[contains(@class, 'ee_channel_info_header_username')]//text()"
        ).getall()[0]
        # The first four counters are read positionally.
        channel_count = channel_data_selector.xpath(
            ".//span[contains(@class, 'counter_value')]//text()"
        ).getall()[:4]
        channel_info['participants_count'] = channel_count[0]
        channel_info['pictures_count'] = channel_count[1]
        channel_info['videos_count'] = channel_count[2]
        channel_info['files_count'] = channel_count[3]
        print_name = channel_data_selector.xpath(
            ".//div[contains(@class, 'ee_channel_info_description')]//text()"
        ).getall()
        channel_info['print_name'] = ''.join(print_name)
        channel_info['url'] = channel_data_selector.xpath(
            ".//a[contains(@class, 'ee_channel_download_aa')]"
        )[0].attrib['href']
        return channel_info

    def parse_article(self, response: Response):
        """Convert one message element into an article dict.

        Args:
            response: Selector of a single ``.ee_widget_message_wrap``
                element (parse() passes the elements it iterates over).

        Returns:
            The article dict when the message id is lower than ``msg_id``;
            None when it is greater, which makes parse() stop the page.

        Raises:
            CloseSpider: when the message id equals ``msg_id``.
            KeyError, IndexError, ValueError: on unexpected markup.
        """
        article_id = response.attrib['id']
        current_id = int(article_id)
        # Always bound: the original assigned stop_id only when the scraped
        # username matched, and otherwise failed with UnboundLocalError.
        stop_id = int(self.msg_id)

        if current_id == stop_id:
            # The original built this exception without raising it.
            raise CloseSpider("cancelled")
        if current_id > stop_id:
            return None

        # NOTE(review): normalize-space() over a node-set presumably uses
        # only the first text node -- confirm this is the intended text.
        texts = response.xpath(
            "normalize-space(.//div[contains(@class, 'ee_widget_message_text')]//text())"
        ).getall()
        view = response.xpath(".//span[contains(@class, 'ee_widget_message_views')]//text()")[0].get()
        publish_datetime = response.xpath(".//time[contains(@class, 'time')]")[0].attrib['datetime']
        return {
            'id': article_id,
            'text': ''.join(texts),
            'view': view,
            'date': publish_datetime,
            'from': self.channel_info['username'],
        }


# Crawler process whose settings are shared by every crawl scheduled below.
process = CrawlerProcess(
    settings={

        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 1,
        "AUTOTHROTTLE_MAX_DELAY": 10,  # config Download Delay based on the network
        "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter",
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "CONCURRENT_REQUESTS": 1,
        "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 10,
        "FEEDS": {
            "articles.json": {"format": "json", "encoding": "utf-8", "indent": 4},
        },
        "RETRY_TIMES": 10,
        "RETRY_HTTP_CODES": [503, 504, 400, 403, 404, 408, 429],
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,

        },
        "PROXY_LIST": 'proxies.txt',
        "PROXY_MODE": 0,
        "ITEM_PIPELINES": {
            'scrapy_redis.pipelines.RedisPipeline': 300,
            'scrapy.pipelines.files.FilesPipeline': 1,
        },
        'FILES_STORE': '/tmp/images/',
    }
)

# Each entry is [channel username, id of the message at which to stop].
channels = [
    ['username1', "219"],
    ['username2', "133"],
    ['username3', "106"],
    ['username4',"45"],

]

domain_ = "url"
# Re-point the single TestSpider class at each channel in turn and crawl it.
for channel in range(len(channels)):
    TestSpider.start_urls = [f"{domain_}/{channels[channel][0]}"]
    TestSpider.channel_username = channels[channel][0]
    TestSpider.msg_id = channels[channel][1]
    process.crawl(TestSpider)
    logging.getLogger("scrapy.core.engine").setLevel(logging.WARNING)
    logging.getLogger("scrapy.core.scraper").setLevel(logging.WARNING)
    # NOTE(review): start() runs on every iteration; per the answer further
    # down this page, the second call is what fails with ReactorNotRestartable.
    process.start()
y1aodyip

y1aodyip1#

问题是您试图使用不同的值重新运行同一个spider。CrawlerProcess的工作方式是初始化进程,然后需要调度所有计划在进程中运行的spider,然后运行process.start()
在您的代码中,for 循环每次迭代结束时都会调用 process.start();但当循环进行到第二次调用时,reactor 已经停止,无法重新启动。
因此,您面临的问题是:您希望对所有不同的频道使用同一个 spider 类。一个可行的解决方案是把 TestSpider 定义在一个函数的作用域内,并让该函数返回这个类对象;这样每次调用该函数,得到的都是定义完全相同的 TestSpider 类,但每个类对象彼此独立。于是您就可以为每个类对象分别设置各自的 start_urls 等属性。
然后,您可以在for循环中使用CrawlerProcess调度所有不同的TestSpider类对象,然后在for循环完成后运行process.start
例如,它可能看起来像这样:

import logging
from builtins import Exception
from typing import Generator, Optional
import json
import scrapy
from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
import redis
from scrapy.exceptions import CloseSpider
import sys

def gen_spider_class():
    """Build and return a fresh ``TestSpider`` class.

    The class is defined inside this function so that every call returns a
    distinct class object. Callers can then set ``start_urls``,
    ``channel_username`` and ``msg_id`` on each one without affecting the
    classes returned by other calls.
    """

    class TestSpider(Spider):
        """Scrape the messages of one channel, paging back through "prev" links."""

        name = "test"
        # Channel header data; filled in by parse() from the first response.
        channel_info = None
        # Expected to equal the page's username once an "@" prefix is added.
        channel_username = ""
        start_urls = []
        # Id of the message at which the crawl stops (converted with int()).
        msg_id = ""

        def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
            """Yield article dicts for one page, then request the previous page.

            Args:
                response: Page containing ``.ee_widget_message_wrap`` elements.
                current_page: Only used to build the Playwright context name.
                    NOTE(review): the follow-up request below does not pass
                    it, so it appears to stay None -- confirm.

            Yields:
                Article dicts from parse_article(), followed by at most one
                request for the page referenced by the "prev" link.
            """
            if not self.channel_info:
                self.channel_info = self.parse_channel_data(response)
                expected_username = f"@{self.channel_username}"
                if self.channel_info['username'] != expected_username:
                    # Reported once per crawl; the stop id applies regardless.
                    self.logger.warning(
                        "Page username %r differs from the expected %r",
                        self.channel_info['username'],
                        expected_username,
                    )

            for article in response.css('.ee_widget_message_wrap'):
                try:
                    result = self.parse_article(article)
                except Exception:
                    # Best effort: skip a malformed article, but leave a trace
                    # in the log instead of swallowing the error silently.
                    self.logger.exception("Skipping an article that could not be parsed")
                    continue
                if result is None:
                    # Stop id reached or passed: end this page without
                    # following the "prev" link.
                    return
                yield result

            # .get() returns None when the page has no "prev" link (or the
            # link has no href), where indexing with [0] raised IndexError.
            if prev := response.xpath("//link[contains(@rel, 'prev')]/@href").get():
                yield response.follow(
                    prev,
                    callback=self.parse,
                    meta={
                        "playwright": True,
                        "playwright_include_page": True,
                        "playwright_context": f"page-{current_page}",
                    },
                )

        @staticmethod
        def parse_channel_data(response: Response) -> dict:
            """Extract the channel header fields from a page.

            Returns:
                Dict with the keys ``title``, ``username``,
                ``participants_count``, ``pictures_count``, ``videos_count``,
                ``files_count``, ``print_name`` and ``url``.

            Raises:
                IndexError: if one of the indexed elements is missing.
                KeyError: if the download link has no ``href`` attribute.
            """
            channel_info = {}
            channel_data_selector = response.xpath(".//div[contains(@class, 'ee_channel_info')]")
            channel_info['title'] = channel_data_selector.xpath(
                ".//div[contains(@class, 'ee_channel_info_header_title')]//text()"
            ).getall()[1]
            channel_info['username'] = channel_data_selector.xpath(
                ".//div[contains(@class, 'ee_channel_info_header_username')]//text()"
            ).getall()[0]
            # The first four counters are read positionally.
            channel_count = channel_data_selector.xpath(
                ".//span[contains(@class, 'counter_value')]//text()"
            ).getall()[:4]
            channel_info['participants_count'] = channel_count[0]
            channel_info['pictures_count'] = channel_count[1]
            channel_info['videos_count'] = channel_count[2]
            channel_info['files_count'] = channel_count[3]
            print_name = channel_data_selector.xpath(
                ".//div[contains(@class, 'ee_channel_info_description')]//text()"
            ).getall()
            channel_info['print_name'] = ''.join(print_name)
            channel_info['url'] = channel_data_selector.xpath(
                ".//a[contains(@class, 'ee_channel_download_aa')]"
            )[0].attrib['href']
            return channel_info

        def parse_article(self, response: Response):
            """Convert one message element into an article dict.

            Args:
                response: Selector of a single ``.ee_widget_message_wrap``
                    element (parse() passes the elements it iterates over).

            Returns:
                The article dict when the message id is lower than
                ``msg_id``; None when it is equal or greater, which makes
                parse() stop processing the page.

            Raises:
                KeyError, IndexError, ValueError: on unexpected markup.
            """
            article_id = response.attrib['id']
            current_id = int(article_id)
            # Always bound: the original assigned stop_id only when the
            # scraped username matched, else failed with UnboundLocalError.
            stop_id = int(self.msg_id)

            if current_id >= stop_id:
                return None

            # NOTE(review): normalize-space() over a node-set presumably uses
            # only the first text node -- confirm this is the intended text.
            texts = response.xpath(
                "normalize-space(.//div[contains(@class, 'ee_widget_message_text')]//text())"
            ).getall()
            view = response.xpath(".//span[contains(@class, 'ee_widget_message_views')]//text()")[0].get()
            publish_datetime = response.xpath(".//time[contains(@class, 'time')]")[0].attrib['datetime']
            return {
                'id': article_id,
                'text': ''.join(texts),
                'view': view,
                'date': publish_datetime,
                'from': self.channel_info['username'],
            }

    return TestSpider



def main():
    """Schedule one spider class per channel, then start the process once.

    Every channel gets its own class from gen_spider_class(), so the
    per-channel attributes set here do not overwrite one another.
    """
    process = CrawlerProcess(
        settings={
            "AUTOTHROTTLE_ENABLED": True,
            "AUTOTHROTTLE_START_DELAY": 1,
            "AUTOTHROTTLE_MAX_DELAY": 10,  # config Download Delay based on the network
            "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter",
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "DOWNLOAD_HANDLERS": {
                "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            },
            "CONCURRENT_REQUESTS": 1,
            "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 10,
            # NOTE(review): every scheduled spider exports to this same file;
            # confirm that a single shared output is intended.
            "FEEDS": {
                "articles.json": {
                    "format": "json",
                    "encoding": "utf-8",
                    "indent": 4,
                },
            },
            "RETRY_TIMES": 10,
            "RETRY_HTTP_CODES": [503, 504, 400, 403, 404, 408, 429],
            "DOWNLOADER_MIDDLEWARES": {
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            },
            "PROXY_LIST": 'proxies.txt',
            "PROXY_MODE": 0,
            "ITEM_PIPELINES": {
                'scrapy_redis.pipelines.RedisPipeline': 300,
                'scrapy.pipelines.files.FilesPipeline': 1,
            },
            'FILES_STORE': '/tmp/images/',
        }
    )

    # Each entry is [channel username, id of the message at which to stop].
    channels = [
        ['username1', "219"],
        ['username2', "133"],
        ['username3', "106"],
        ['username4', "45"],
    ]
    domain_ = "url"
    logging.getLogger("scrapy.core.engine").setLevel(logging.WARNING)
    logging.getLogger("scrapy.core.scraper").setLevel(logging.WARNING)

    for username, stop_id in channels:
        test_spider_class = gen_spider_class()  # get a unique Spider class
        test_spider_class.start_urls = [f"{domain_}/{username}"]
        test_spider_class.channel_username = username
        test_spider_class.msg_id = stop_id
        process.crawl(test_spider_class)  # schedule the spider

    # Called exactly once, after all spiders have been scheduled. The original
    # line sat at an indentation matching no enclosing block, which made the
    # listing fail with IndentationError.
    process.start()


if __name__ == "__main__":
    main()

相关问题