Scrapy spider在广泛爬网中未生成所有start_requests url

oipij1gg  于 2023-01-09  发布在  其他
关注(0)|答案(1)|浏览(116)

我正在尝试创建一个scraper,从超过300,000个start_urls抓取主页和一些更深层的页面。代码运行时没有重大错误,但只抓取了约31,000个url后就停止了。Scrapy的日志显示'finish_reason': 'finished'。

    • 我不明白为什么Scrapy在start_requests中的url尚未全部抓取完的情况下就结束了。

下面的代码显示了我运行的spider。

import scrapy
from scrapy import Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from datetime import datetime
from twisted.internet.error import DNSLookupError, TCPTimedOutError, ConnectionLost
from scrapy.spidermiddlewares.httperror import HttpError
import pandas as pd

# To run: scrapy crawl gptspider -o output.json --logfile logfile.txt

class TextSpider(scrapy.Spider):
    """Broad-crawl spider that fetches company home pages and emits their visible text.

    Start URLs are read from a CSV file (columns ``website`` and
    ``Some_propoerty``) when the spider is constructed. Every response and
    every failed request produces exactly one item with the same keys, so
    successes and failures can be analysed together in the feed output.
    """

    name = "gptspider"

    # Settings to optimize for our broad crawl, as recommended by https://docs.scrapy.org/en/latest/topics/broad-crawls.html
    custom_settings = {
        "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue",
        "CONCURRENT_REQUESTS": 100,
        "REACTOR_THREADPOOL_MAXSIZE": 20,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    }

    def __init__(self, *a, **kw):
        """Load the CSV of company URLs and normalise each one to a base URL.

        Rows whose ``website`` value is not a string (for example missing
        values) are skipped.
        """
        super().__init__(*a, **kw)
        self.progress_counter = 0
        self.MAX_DEPTH = 0
        self.companies_with_valid_url = []
        self.allowed_domains = []
        urlfilepath = "PATH_TO_CSV_WITH_OVER_100k_ULRS"
        companyurls = pd.read_csv(urlfilepath)
        # Iterating the two columns directly avoids building a Series per row,
        # which is what makes iterrows() slow on hundreds of thousands of rows.
        for some_property, url in zip(
            companyurls["Some_propoerty"], companyurls["website"]
        ):
            if not isinstance(url, str):
                continue
            base_url = TextSpider.convert_url_to_base(url)
            self.companies_with_valid_url.append(
                {"some_property": some_property, "base_url": base_url}
            )
            self.allowed_domains.append(urlparse(base_url).netloc)

    # Put URLS into the right format
    @staticmethod
    def convert_url_to_base(url):
        """Reduce *url* to ``http://<host>/``.

        Any path, query, fragment and the original scheme are discarded.
        Non-string input is returned unchanged.
        """
        if not isinstance(url, str):
            return url
        # Without a leading "//" (or a scheme) urlparse treats the host as a path.
        # The prefix check is case-insensitive so "HTTP://host" is recognised too.
        if not url.lower().startswith(("//", "http://", "https://")):
            url = "//" + url
        # Always add the scheme explicitly. Deciding by `"http" in netloc` left
        # hosts that merely contain "http" (e.g. "httpbin.org") without a scheme.
        return "http://" + urlparse(url).netloc + "/"

    def start_requests(self):
        """Yield one request per company URL loaded in ``__init__``."""
        print(
            f"Starting requests for {len(self.companies_with_valid_url)} URLS"
        )  # Output Starting requests for 320833 companies
        for companyurl in self.companies_with_valid_url:
            # Build the request before yielding: an exception escaping this
            # generator ends the iteration, so a single malformed URL would
            # otherwise prevent all remaining URLs from being requested.
            try:
                request = scrapy.Request(
                    url=companyurl["base_url"],
                    callback=self.parse,
                    errback=self.handle_error,
                    dont_filter=True,
                    meta={"some_property": companyurl["some_property"]},
                )
            except ValueError as exc:
                self.logger.warning(
                    "Skipping invalid start URL %r: %s", companyurl["base_url"], exc
                )
                continue
            yield request
        print("THIS NEVER PRINTS")

    # Parse the response, extract the visible text and scrape the subpages
    def parse(self, response):
        """Emit follow-up requests (when depth allows) and the item for *response*."""
        # Print a progress line every 100 parsed responses.
        if self.progress_counter % 100 == 0:
            print(self.progress_counter)
            print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        self.progress_counter += 1

        # Link following is effectively disabled while MAX_DEPTH is 0.
        if response.meta["depth"] < self.MAX_DEPTH:
            for sub_page in response.xpath("//a/@href").getall():
                yield response.follow(
                    sub_page,
                    callback=self.parse,
                    errback=self.handle_error,
                    dont_filter=True,
                    meta={"some_property": response.request.meta["some_property"]},
                )

        # This method is a generator, so the item must be yielded; a value
        # passed to `return` is not delivered to whoever iterates the generator.
        yield self.response_to_data(response)

    def handle_error(self, failure):
        """Turn a failed request into an item shaped like a successful one.

        ``response_code`` holds the HTTP status for ``HttpError`` failures and
        a short label for network-level failures; ``text`` holds the error
        message and ``failure`` is set to 1.
        """
        # Imported locally under an alias: the bare name ``TimeoutError`` in
        # this module is the Python builtin, which is a different class from
        # the one Twisted uses for "User timeout caused connection failure".
        from twisted.internet.error import TimeoutError as TwistedTimeoutError

        if failure.check(HttpError):
            error_status = failure.value.response.status
        elif failure.check(DNSLookupError):
            error_status = "DNSLookupError"
        elif failure.check(TimeoutError, TwistedTimeoutError, TCPTimedOutError):
            error_status = "TCPTimedOutError"
        elif failure.check(ConnectionLost):
            error_status = "ConnectionLost"
        else:
            error_status = "Other error"

        # add depth variable if not included (can happen under some error circumstances)
        depth = failure.request.meta.setdefault("depth", 0)

        return {
            "some_property": failure.request.meta["some_property"],
            "url": failure.request.url,
            "depth": depth,
            "response_code": error_status,
            "scraped_at": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "text": failure.getErrorMessage(),
            "failure": 1,
        }

    def response_to_data(self, response):
        """Build the output item for a successfully downloaded *response*."""
        return {
            "some_property": response.request.meta["some_property"],
            "url": response.url,
            "depth": response.request.meta["depth"],
            "response_code": response.status,
            "scraped_at": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "text": self.extract_visible_text(response),
            "failure": 0,
        }

    # Extract the visible text from a scrapy HttpResponse
    def extract_visible_text(self, response):
        """Return the visible text of *response* as a single-line string.

        Returns ``"NODATA_BAD_RESPONSE_CODE"`` when the status is neither 200
        nor 304, and ``"NODATA_NO_BODY_TAG"`` when the document has no
        ``<body>`` element. The result is always a string.
        """
        if response.status not in (200, 304):
            return "NODATA_BAD_RESPONSE_CODE"

        soup = BeautifulSoup(response.text, "html.parser")
        # Removes all script and style tags
        for tag in soup(["style", "script"]):
            tag.decompose()

        # If no "body" tag in the html text, the firms will be classified as unlabeled
        if soup.find("body") is None:
            return "NODATA_NO_BODY_TAG"

        # Collapse every run of whitespace into a single space.
        text = " ".join(soup.get_text().split())
        # Insert a space before a capital letter that is not at the start of
        # the text and not preceded by another capital letter.
        visible_text = re.sub(r"(?<![A-Z])(?<!^)([A-Z])", r" \1", text)
        # Handle surrogates: "surrogatepass" lets any lone surrogate through the
        # encode step ("surrogateescape" raises for surrogates outside
        # U+DC80-U+DCFF); decoding with "replace" then turns them into U+FFFD.
        return visible_text.encode("utf-8", "surrogatepass").decode(
            "utf-8", "replace"
        )

我尝试调整设置(如Spider代码中所示)以符合Scrapy关于广泛爬行的建议,但没有成功。
我还查看了日志的末尾,想弄清楚程序结束之前发生了什么。我确实注意到在结束之前出现了"用户超时导致连接失败"(User timeout caused connection failure)错误(请参见下面的日志摘录),但这类错误在整个日志中都有出现,据我所知不应该导致程序结束。

...

Getting http://www.okaidi.be/ took longer than 180.0 seconds..', 'failure': 1}
2023-01-04 12:09:48 [scrapy.core.engine] INFO: Closing spider (finished)
2023-01-04 12:09:48 [scrapy.extensions.feedexport] INFO: Stored json feed (3040 items) in: output11.json
2023-01-04 12:09:48 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
...

编辑:
日志文件的摘要(结束)

2023-01-04 17:52:34 [scrapy.core.engine] INFO: Closing spider (finished)
2023-01-04 17:52:34 [scrapy.extensions.feedexport] INFO: Stored json feed (34754 items) in: output33.json
2023-01-04 17:52:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 9531,
 'downloader/exception_type_count/idna.core.InvalidCodepoint': 1,
 'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 355,
 'downloader/exception_type_count/twisted.internet.error.ConnectionRefusedError': 49,
 'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 6360,
 'downloader/exception_type_count/twisted.internet.error.TCPTimedOutError': 1139,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 20,
 'downloader/exception_type_count/twisted.web._newclient.ResponseFailed': 24,
 'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 1583,
 'downloader/request_bytes': 33338047,
 'downloader/request_count': 138042,
 'downloader/request_method_count/GET': 138042,
 'downloader/response_bytes': 995846155,
 'downloader/response_count': 128866,
 'downloader/response_status_count/200': 59652,
 'downloader/response_status_count/202': 1,
 'downloader/response_status_count/301': 48480,
 'downloader/response_status_count/302': 7107,
 'downloader/response_status_count/303': 210,
 'downloader/response_status_count/307': 176,
 'downloader/response_status_count/308': 464,
 'downloader/response_status_count/400': 29,
 'downloader/response_status_count/401': 11,
 'downloader/response_status_count/402': 5,
 'downloader/response_status_count/403': 1260,
 'downloader/response_status_count/404': 8688,
 'downloader/response_status_count/405': 1,
 'downloader/response_status_count/406': 2,
 'downloader/response_status_count/409': 2,
 'downloader/response_status_count/410': 56,
 'downloader/response_status_count/418': 2,
 'downloader/response_status_count/429': 156,
 'downloader/response_status_count/456': 6,
 'downloader/response_status_count/500': 1709,
 'downloader/response_status_count/502': 47,
 'downloader/response_status_count/503': 654,
 'downloader/response_status_count/504': 27,
 'downloader/response_status_count/510': 2,
 'downloader/response_status_count/520': 4,
 'downloader/response_status_count/521': 7,
 'downloader/response_status_count/522': 87,
 'downloader/response_status_count/523': 2,
 'downloader/response_status_count/524': 3,
 'downloader/response_status_count/526': 2,
 'downloader/response_status_count/999': 14,
 'elapsed_time_seconds': 2787.423471,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 1, 4, 16, 52, 34, 847868),
 'httpcompression/response_bytes': 3578050995,
 'httpcompression/response_count': 53590,
 'item_scraped_count': 34754,
 'log_count/DEBUG': 684335,
 'log_count/ERROR': 5348,
 'log_count/INFO': 57,
 'log_count/WARNING': 426,
 'request_depth_max': 1,
 'response_received_count': 70239,
 'retry/count': 7823,
 'retry/max_reached': 3814,
 'retry/reason_count/429 Unknown Status': 118,
 'retry/reason_count/500 Internal Server Error': 994,
 'retry/reason_count/502 Bad Gateway': 32,
 'retry/reason_count/503 Service Unavailable': 436,
 'retry/reason_count/504 Gateway Time-out': 19,
 'retry/reason_count/522 Unknown Status': 58,
 'retry/reason_count/524 Unknown Status': 2,
 'retry/reason_count/twisted.internet.error.ConnectionRefusedError': 31,
 'retry/reason_count/twisted.internet.error.DNSLookupError': 4240,
 'retry/reason_count/twisted.internet.error.TCPTimedOutError': 804,
 'retry/reason_count/twisted.internet.error.TimeoutError': 14,
 'retry/reason_count/twisted.web._newclient.ResponseFailed': 16,
 'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 1059,
 "robotstxt/exception_count/<class 'idna.core.InvalidCodepoint'>": 1,
 "robotstxt/exception_count/<class 'twisted.internet.error.ConnectionRefusedError'>": 9,
 "robotstxt/exception_count/<class 'twisted.internet.error.DNSLookupError'>": 1094,
 "robotstxt/exception_count/<class 'twisted.internet.error.TCPTimedOutError'>": 163,
 "robotstxt/exception_count/<class 'twisted.internet.error.TimeoutError'>": 3,
 "robotstxt/exception_count/<class 'twisted.web._newclient.ResponseFailed'>": 2,
 "robotstxt/exception_count/<class 'twisted.web._newclient.ResponseNeverReceived'>": 261,
 'robotstxt/forbidden': 355,
 'robotstxt/request_count': 38926,
 'robotstxt/response_count': 37332,
 'robotstxt/response_status_count/200': 28193,
 'robotstxt/response_status_count/400': 16,
 'robotstxt/response_status_count/401': 6,
 'robotstxt/response_status_count/402': 3,
 'robotstxt/response_status_count/403': 618,
 'robotstxt/response_status_count/404': 8084,
 'robotstxt/response_status_count/405': 1,
 'robotstxt/response_status_count/406': 1,
 'robotstxt/response_status_count/409': 1,
 'robotstxt/response_status_count/410': 28,
 'robotstxt/response_status_count/418': 1,
 'robotstxt/response_status_count/429': 19,
 'robotstxt/response_status_count/500': 242,
 'robotstxt/response_status_count/502': 6,
 'robotstxt/response_status_count/503': 86,
 'robotstxt/response_status_count/504': 2,
 'robotstxt/response_status_count/510': 1,
 'robotstxt/response_status_count/520': 2,
 'robotstxt/response_status_count/521': 3,
 'robotstxt/response_status_count/522': 13,
 'robotstxt/response_status_count/523': 1,
 'robotstxt/response_status_count/526': 1,
 'robotstxt/response_status_count/999': 4,
 'scheduler/dequeued': 72075,
 'scheduler/dequeued/memory': 72075,
 'scheduler/enqueued': 72075,
 'scheduler/enqueued/memory': 72075,
 'spider_exceptions/ValueError': 1,
 'start_time': datetime.datetime(2023, 1, 4, 16, 6, 7, 424397)}
2023-01-04 17:52:34 [scrapy.core.engine] INFO: Spider closed (finished)
yb3bgrhw

yb3bgrhw1#

我听从了@granitosaurus在"Scrapy, limit on start_url"这个问题下的建议,并按照建议修改了代码,改为分批抓取。这似乎奏效了,程序现在会抓取所有的url。

相关问题