For some reason, whenever ItemLoader is involved in the code, it causes this error:

```
start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable
```

Below is the code for GetTikTokMetricsSpider.py and items.py. The medians work in GetTikTokMetricsSpider.py without the ItemLoader step that follows, but the run does not get through `def get_medians` once the ItemLoader is added. I tried putting the whole process, including the ItemLoader, inside `start_requests`, and it returned the same error. How is ItemLoader causing the error here? The code is below.

GetTikTokMetricsSpider.py:
```python
import scrapy
import json
import csv
import os
import pandas as pd
import numexpr as ne
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path


class GettiktokmetricsSpider(scrapy.Spider):
    name = 'GetTikTokMetricsSpider'
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }

    def start_requests(self):
        print("START REQUEST")
        users = self.get_users()
        print(users)
        for user in users:
            get_medians = self.get_medians(user)

    def get_medians(self, user):
        print("GET MEDIANS")
        df_counts = self.get_df_counts()
        df_counts.query(f"user == '{user}'", inplace=True)
        df_counts["view_med"] = df_counts["view_count"].median(axis=0)
        df_counts["like_med"] = df_counts["like_count"].median(axis=0)
        df_counts["comment_med"] = df_counts["comment_count"].median(axis=0)
        df_counts["share_med"] = df_counts["share_count"].median(axis=0)
        view_med = df_counts["view_med"].iloc[0]
        like_med = df_counts["like_med"].iloc[0]
        comment_med = df_counts["comment_med"].iloc[0]
        share_med = df_counts["share_med"].iloc[0]
        print(user)
        print(view_med)
        print(like_med)
        print(comment_med)
        print(share_med)
        print(type(view_med))
        print(type(like_med))
        print(type(comment_med))
        print(type(share_med))  # Works til here without the il below
        il = ItemLoader(item=MedsItem())
        il.add_value("user", user)
        il.add_value("view_med", view_med)
        il.add_value("like_med", like_med)
        il.add_value("comment_med", comment_med)
        il.add_value("share_med", share_med)
        yield il.load_item()
        print(MedsItem())

    def get_users(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        users = df_counts["user"].unique()
        return users

    def get_df_counts(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts

    def get_csv_counts_url(self):
        url = f"{get_project_path()}/data/counts.csv"
        return url
```
items.py:
```python
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import Join, MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags


def get_count(view):
    view_count = str(view)
    # Placeholder labels like "Share"/"Comment"/"Like" mean no numeric count
    # was scraped, so treat them as zero.
    if any(word in view_count for word in
           ("Share", "share", "Comment", "comment", "Like", "like")):
        view_count = "0"
        return view_count
    if "." in view:
        view_count = view_count.replace(".", "")
    if "K" == view[-1]:
        view_count = view_count.replace("K", "000")
    if "M" == view[-1]:
        view_count = view_count.replace("M", "000000")
    return view_count


def get_med(value):
    # Coerce non-string median values to strings for the item field.
    if type(value) != str:
        return str(value)
    return value


class CountsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    view_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())
    like_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())
    comment_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())
    share_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())


class MedsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    view_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    like_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    comment_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    share_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
```
UPDATE, SOLVED: It looks like the error is caused by Scrapy's general structure, which requires its Requests to be yielded inside `start_requests`. A simple solution is to request an arbitrary URL or file and hand off with `callback=self.parse`. Here I create an empty HTML file to minimize the data passed around. There is no guarantee Scrapy won't change in the future to reject an empty response and terminate automatically as soon as it detects one; in that case I assume writing a bit of text into the file would do. For now, though, this seems to have solved the problem:
```python
import scrapy
import os
import pandas as pd
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path, get_project_file_path


class GettiktokmetricsSpider(scrapy.Spider):
    name = 'GetTikTokMetricsSpider'
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }

    def start_requests(self):
        # Yield one dummy request just to satisfy Scrapy and reach parse().
        self.create_empty_html()
        empty_html = f"{get_project_file_path()}/data/empty_html.html"
        yield scrapy.Request(empty_html, callback=self.parse)

    def create_empty_html(self):
        empty_html = f"{get_project_path()}/data/empty_html.html"
        if not os.path.isfile(empty_html):
            with open("data/empty_html.html", "w", encoding="utf-8") as file:
                file.write("")

    def parse(self, response):
        users = self.get_users()
        for user in users:
            df_counts = self.get_df_counts()
            df_counts.query(f"user == '{user}'", inplace=True)
            df_counts["view_med"] = df_counts["view_count"].median(axis=0)
            df_counts["like_med"] = df_counts["like_count"].median(axis=0)
            df_counts["comment_med"] = df_counts["comment_count"].median(axis=0)
            df_counts["share_med"] = df_counts["share_count"].median(axis=0)
            view_med = df_counts["view_med"].iloc[0]
            like_med = df_counts["like_med"].iloc[0]
            comment_med = df_counts["comment_med"].iloc[0]
            share_med = df_counts["share_med"].iloc[0]
            il = ItemLoader(item=MedsItem())
            il.add_value("user", user)
            il.add_value("view_med", view_med)
            il.add_value("like_med", like_med)
            il.add_value("comment_med", comment_med)
            il.add_value("share_med", share_med)
            yield il.load_item()

    def get_users(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        users = df_counts["user"].unique()
        return users

    def get_df_counts(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts

    def get_csv_counts_url(self):
        url = f"{get_project_path()}/data/counts.csv"
        return url
```
1 Answer
Your `start_requests` never returns or yields anything at all, so its return value is always `NoneType`. On this line you hand the process over to the `get_medians` method:
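```python
get_medians = self.get_medians(user)
```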
Then, inside the `get_medians` method, you yield the loaded item:
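```python
yield il.load_item()
```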
So the item is handed back to the `start_requests` method and stored in the variable `get_medians`. At that point you should yield the `get_medians` variable that represents the item, instead of starting the next iteration of the loop and overwriting `get_medians` with the next item. Simply adding a `yield` statement to `start_requests` should solve the problem. For example, a minimal sketch of that change, showing only `start_requests`:
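```python
def start_requests(self):
    users = self.get_users()
    for user in users:
        # get_medians() returns a generator; yield each item it produces
        # so that start_requests no longer returns None.
        yield from self.get_medians(user)
```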
Unfortunately, even that is likely to raise an error, because Scrapy takes the output of `start_requests` (expected to be `scrapy.Request` objects) and immediately sends it to the scheduler, eventually to be converted into `scrapy.Response` objects. Since Scrapy requires Request objects from `start_requests` and expects items to be yielded from the `parse` method, you can use any request you can think of simply to reach the `parse` method, and then run your code from there. For example, a sketch along the lines of the update above, reusing the question's helpers and an arbitrary local file as the dummy request:
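```python
def start_requests(self):
    # Any request will do; its only purpose is to get Scrapy into parse().
    # Assumes get_project_path() returns an absolute filesystem path, so a
    # file:// URL is built from it.
    dummy_url = f"file://{get_project_path()}/data/empty_html.html"
    yield scrapy.Request(dummy_url, callback=self.parse)

def parse(self, response):
    # The response itself is ignored; just build and yield the items here.
    users = self.get_users()
    for user in users:
        yield from self.get_medians(user)
```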