Scrapy ItemLoader 导致 start_requests() 类型错误:“NoneType”对象不可迭代

lf5gs5x2  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(191)

由于某些原因,当代码中涉及ItemLoader时,它会导致此错误

start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable

下面分别是GetTikTokMetricsSpider.py和items.py的代码。在GetTikTokMetricsSpider.py中,如果去掉ItemLoader这部分处理,中位数的计算可以正常工作;但加入ItemLoader之后,程序根本没有进入def get_medians。我尝试将包括ItemLoader在内的整个处理过程放进start_requests中,结果返回了相同的错误。ItemLoader是如何导致这个错误的?下面是代码。

GetTikTokMetricsSpider.py:

import scrapy
import json
import csv
import os
import pandas as pd
import numexpr as ne
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path

class GettiktokmetricsSpider(scrapy.Spider):
    """Spider that computes per-user median counts from data/counts.csv.

    No request is created anywhere in this class; all values come from the CSV
    whose location is built by get_csv_counts_url().

    NOTE(review): this is the version that fails with
    "TypeError: 'NoneType' object is not iterable" -- see start_requests().
    """
    name = 'GetTikTokMetricsSpider'
    # Feed export: write the yielded items to data/metrics.csv as CSV
    # ("overwrite": True), with the columns listed in FEED_EXPORT_FIELDS.
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }

    def start_requests(self):
        """Print the user list and call get_medians() once for every user.

        NOTE(review): the method contains neither ``yield`` nor ``return``, so
        calling it returns None; ``iter(self.spider.start_requests())`` then
        raises "TypeError: 'NoneType' object is not iterable".
        """
        print("START REQUEST")
        users = self.get_users()
        print(users)
        for user in users:
            # get_medians() contains ``yield``, so this call only creates a
            # generator object. It is never iterated, which is why nothing
            # inside get_medians() (not even its first print) is executed.
            get_medians = self.get_medians(user)

    def get_medians(self, user):
        """Yield one MedsItem holding the median counts of the rows of *user*.

        This is a generator function: its body runs only while the returned
        generator is being iterated.
        """
        print("GET MEDIANS")
        df_counts = self.get_df_counts()
        # Keep only the rows whose "user" column equals *user* (in place).
        df_counts.query(f"user == '{user}'", inplace=True)

        # Each median is a single scalar that is stored in every remaining row.
        df_counts["view_med"] = df_counts["view_count"].median(axis=0)
        df_counts["like_med"] = df_counts["like_count"].median(axis=0)
        df_counts["comment_med"] = df_counts["comment_count"].median(axis=0)
        df_counts["share_med"] = df_counts["share_count"].median(axis=0)

        # Read the scalar back from the first row.
        view_med = df_counts["view_med"].iloc[0]
        like_med = df_counts["like_med"].iloc[0]
        comment_med = df_counts["comment_med"].iloc[0]
        share_med = df_counts["share_med"].iloc[0]

        # Debug output: the values ...
        print(user)
        print(view_med)
        print(like_med)
        print(comment_med)
        print(share_med)

        # ... and their types.
        print(type(view_med))
        print(type(like_med))
        print(type(comment_med))
        print(type(share_med)) # Works up to here when the ItemLoader code below is removed

        # Collect the values in a MedsItem through its field processors.
        il = ItemLoader(item=MedsItem())
        il.add_value("user", user)
        il.add_value("view_med", view_med)
        il.add_value("like_med", like_med)
        il.add_value("comment_med", comment_med)
        il.add_value("share_med", share_med)
        yield il.load_item()
        # Prints a newly created, empty MedsItem (not the loaded one), and only
        # once the generator is resumed after the yield above.
        print(MedsItem())

    def get_users(self):
        """Return the unique values of the "user" column of the counts CSV."""
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        users = df_counts["user"].unique()
        return users

    def get_df_counts(self):
        """Read the counts CSV and return it as a new DataFrame."""
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts

    def get_csv_counts_url(self):
        """Return the location of data/counts.csv below get_project_path()."""
        url = f"{get_project_path()}/data/counts.csv"
        return url

items.py:

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import Join, MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags

def get_count(view):
    """Convert a scraped counter such as ``"1.5K"`` into a plain digit string.

    Args:
        view: The scraped counter text. Any object is accepted; it is
            converted with ``str()`` and surrounding whitespace is removed.

    Returns:
        ``"0"`` when the text contains one of the words share/comment/like in
        any letter case (presumably the label shown instead of a number --
        TODO confirm against the scraped pages). Otherwise the number as a
        string: a trailing ``K``/``M`` is expanded to thousands/millions
        (``"1.5K"`` -> ``"1500"``), and without such a suffix every ``.`` is
        removed (``"1.234"`` -> ``"1234"``). An empty input gives ``""``.
    """
    text = str(view).strip()

    # Test every label. The earlier ``("Share" or "share" ...) in text`` chain
    # evaluated to ``"Share" in text`` and ignored all other spellings.
    lowered = text.lower()
    if any(label in lowered for label in ("share", "comment", "like")):
        return "0"

    # text[-1:] is "" for an empty string, so no IndexError is possible.
    zeros = {"K": 3, "M": 6}.get(text[-1:])
    if zeros is None:
        return text.replace(".", "")

    # Shift the decimal point instead of deleting it: pad (or cut) the
    # fraction to exactly as many digits as the suffix stands for.
    whole, _, fraction = text[:-1].partition(".")
    digits = whole + fraction.ljust(zeros, "0")[:zeros]
    # int() drops leading zeros ("0.5K" -> "500"); text that is not purely
    # numeric is handed back as it is rather than raising.
    return str(int(digits)) if digits.isdecimal() else digits

def get_med(value):
    """Return *value* itself when its type is exactly ``str``, else ``str(value)``."""
    return value if type(value) == str else str(value)

class CountsItem(scrapy.Item):
    """Item with a user name and the view/like/comment/share counters."""

    # Input: strip HTML tags. Output: first collected value.
    user = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )

    # The four counters share one pipeline: strip HTML tags, normalise the
    # text with get_count, then keep the first collected value.
    view_count = scrapy.Field(
        input_processor=MapCompose(remove_tags, get_count),
        output_processor=TakeFirst(),
    )
    like_count = scrapy.Field(
        input_processor=MapCompose(remove_tags, get_count),
        output_processor=TakeFirst(),
    )
    comment_count = scrapy.Field(
        input_processor=MapCompose(remove_tags, get_count),
        output_processor=TakeFirst(),
    )
    share_count = scrapy.Field(
        input_processor=MapCompose(remove_tags, get_count),
        output_processor=TakeFirst(),
    )

class MedsItem(scrapy.Item):
    """Item with a user name and the four ``*_med`` values of that user."""

    # Every field uses the same pipeline: get_med on each added value, then
    # the first collected value becomes the field's output.
    user = scrapy.Field(
        input_processor=MapCompose(get_med),
        output_processor=TakeFirst(),
    )
    view_med = scrapy.Field(
        input_processor=MapCompose(get_med),
        output_processor=TakeFirst(),
    )
    like_med = scrapy.Field(
        input_processor=MapCompose(get_med),
        output_processor=TakeFirst(),
    )
    comment_med = scrapy.Field(
        input_processor=MapCompose(get_med),
        output_processor=TakeFirst(),
    )
    share_med = scrapy.Field(
        input_processor=MapCompose(get_med),
        output_processor=TakeFirst(),
    )

更新(已解决):看起来这个错误是由Scrapy的基本结构引起的——它要求start_requests产出(yield)Request对象。一个简单的解决办法是请求任意一个URL或本地文件,并指定callback=self.parse,然后在parse中完成原来的处理。这里我创建了一个空的HTML文件,以尽量减少传递的数据。不过无法保证Scrapy将来不会改变行为,例如一旦检测到空响应就拒绝并自动终止;如果出现这种情况,我认为往该文件里写入几个字符就可以了。就目前而言,这样做似乎解决了问题:

import scrapy
import os
import pandas as pd
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path, get_project_file_path

class GettiktokmetricsSpider(scrapy.Spider):
    """Compute per-user median counts from data/counts.csv and export them.

    Nothing is scraped: the only request targets an empty local placeholder
    page, and it exists solely so that Scrapy calls parse(), where the items
    are produced.
    """
    name = 'GetTikTokMetricsSpider'
    # Feed export: write the yielded items to data/metrics.csv as CSV
    # ("overwrite": True), with the columns listed in FEED_EXPORT_FIELDS.
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }

    def start_requests(self):
        """Yield the single request for the placeholder page.

        Yields:
            scrapy.Request: request whose response is handed to parse().
        """
        self.create_empty_html()
        # NOTE(review): get_project_file_path() presumably returns the project
        # root as a file:// URL -- confirm in TikTokLocalPaths.
        empty_html = f"{get_project_file_path()}/data/empty_html.html"
        yield scrapy.Request(empty_html, callback=self.parse)

    def create_empty_html(self):
        """Create data/empty_html.html below the project path if it is missing.

        The existence check and the write use the same path. Previously the
        check formatted the function object itself (``get_project_path``
        without parentheses), so it never matched, and the file was written
        relative to the current working directory instead.
        """
        # NOTE(review): assumes get_project_path() returns a filesystem path
        # whose data/ directory exists (counts.csv is read from it) -- confirm.
        empty_html = f"{get_project_path()}/data/empty_html.html"
        if os.path.isfile(empty_html):
            return
        with open(empty_html, "w", encoding="utf-8") as file:
            file.write("")

    def parse(self, response):
        """Yield one MedsItem per user holding that user's median counts.

        Args:
            response: Response of the placeholder request; it is not used.

        Yields:
            MedsItem: user, view_med, like_med, comment_med and share_med.
        """
        # Read the CSV once instead of once per user. sort=False keeps the
        # users in order of first appearance, and rows without a user value
        # are left out by groupby. Grouping compares the values directly, so
        # user names containing quotes no longer break a query string.
        df_counts = self.get_df_counts()
        for user, user_rows in df_counts.groupby("user", sort=False):
            il = ItemLoader(item=MedsItem())
            il.add_value("user", user)
            for metric in ("view", "like", "comment", "share"):
                median = user_rows[f"{metric}_count"].median()
                il.add_value(f"{metric}_med", median)
            yield il.load_item()

    def get_users(self):
        """Return the unique values of the "user" column of the counts CSV."""
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts["user"].unique()

    def get_df_counts(self):
        """Read the counts CSV and return it as a new DataFrame."""
        return pd.read_csv(self.get_csv_counts_url())

    def get_csv_counts_url(self):
        """Return the location of data/counts.csv below get_project_path()."""
        return f"{get_project_path()}/data/counts.csv"
tpxzln5u

tpxzln5u1#

你的start_requests根本没有return或yield任何内容,所以它的返回值始终是None。
在这一行中,您将该过程移交给get_medians方法:

# Excerpt from start_requests(): the result of each call is only assigned to a
# local name; nothing is yielded or returned from the loop.
for user in users:
    get_medians = self.get_medians(user)

然后在get_medians方法中生成加载项:

# Excerpt from get_medians(): the ``yield`` makes get_medians a generator
# function, so calling it returns a generator object.
yield il.load_item()
print(MedsItem())

因此,该项目将返回给start_requests方法,并存储在变量get_medians中。
此时,您应该生成表示该项的get_medians变量,而不是开始循环的下一次迭代,并用下一项覆盖get_medians变量。
只需在start_requests中添加一条yield语句就可以解决问题。
例如:

for user in users:
    get_medians = self.get_medians(user)
    yield get_medians

不幸的是,即使这样也可能导致错误,因为scrapy获取start_requests的输出(预期为scrapy.Request对象),并立即将其发送到调度程序,最终转换为scrapy.Response对象。
由于scrapy需要start_requests中的请求对象,并且需要从parse方法中产生项,因此您可以使用任何您能想到的请求来访问parse方法,然后从那里执行parse方法中的代码。
例如:

def start_requests(self):
        """Yield one request so that Scrapy has a response to hand to parse()."""
        # Any URL that successfully produces a response object will do; this
        # one should work. No callback is given, so Scrapy uses the spider's
        # parse() method.
        yield scrapy.Request(url="https://quotes.toscrape.com")

    def parse(self, response):
        """Yield the items produced by get_medians() for every user.

        Args:
            response: Response of the start request; it is not used.
        """
        print("START REQUEST")
        users = self.get_users()
        print(users)
        for user in users:
            # get_medians() is a generator function; ``yield from`` forwards
            # each item it produces. A plain ``yield`` would hand Scrapy the
            # generator object instead of the items.
            yield from self.get_medians(user)

    ...
    ...

相关问题