I have a list in my spider class and I need to initialize it. The code looks like this:
from datetime import datetime
import pickle

from scrapy.crawler import CrawlerProcess
from scrapy.spiders import SitemapSpider

class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = [
        'https://www.arabam.com/sitemap/otomobil_13.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
    ]
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': "arabam_" + str(datetime.today().strftime('%d%m%y')) + '.csv',
    }
    crawled = []
    new_links = 0

    def parse(self, response):
        # every few new links, persist the crawled list to disk
        if self.new_links > 3:
            with open("URLs", "wb") as f:
                pickle.dump(self.crawled, f)
            self.new_links = 0
        for td in response.xpath("/html/body/div[3]/div[6]/div[4]/div/div[2]/table/tbody/tr/td[4]/div/a"):
            link = td.xpath("@href").extract()  # link was undefined in the original; presumably the href is extracted here
            if link[0] not in self.crawled:
                self.crawled.append(link[0])
################################ some code
process = CrawlerProcess({
})

Myspider.crawled = []
Myspider.crawled.append("hi")

try:
    with open("URLs", "rb") as openfile:
        while True:
            try:
                Myspider.crawled = pickle.load(openfile)
            except EOFError:
                break
except:
    # first run: the pickle file does not exist yet, so create it
    with open("URLs", "wb") as f:
        pickle.dump("", f)

print(Myspider.crawled)

process.crawl(Myspider, Myspider.crawled)
process.start()  # the script will block here until the crawling is finished
It keeps throwing the following exception:
Traceback (most recent call last):
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred
    result = f(*args,**kw)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
    return receiver(*arguments,**named)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 262, in item_scraped
    slot = self.slot
AttributeError: 'FeedExporter' object has no attribute 'slot'
According to some resources, this is caused by an earlier error:
Traceback (most recent call last):
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred
    result = f(*args,**kw)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
    return receiver(*arguments,**named)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 232, in open_spider
    uri = self.urifmt % self._get_uri_params(spider)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 313, in _get_uri_params
    params[k] = getattr(spider, k)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\spiders\__init__.py", line 36, in logger
    logger = logging.getLogger(self.name)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1845, in getLogger
    return Logger.manager.getLogger(name)
  File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1174, in getLogger
    raise TypeError('A logger name must be a string')
TypeError: A logger name must be a string
How do I pass the list to it? Or is there some way to have this list initialized only once by the scrapy spider? The list contains all the urls that have already been crawled, and it is pickled: when the script starts, it loads the list and only crawls a link if it is not already in the list.
1 Answer
In your case, you need to pass the url list under the spider attribute name (i.e. crawled). According to the docs, if you don't override the spider's __init__ method, all keyword arguments passed to the spider class are mapped to spider attributes. Your call passes the list positionally, so Scrapy forwards it to the default __init__ as the name argument; that is why logging.getLogger(self.name) later complains that a logger name must be a string. So to override the crawled attribute, you need to send the exact parameter name as a keyword. Something like this:
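A minimal sketch of the corrected call, reusing the Myspider class and the pickle-loading code from the question (crawled is the spider attribute being overridden):

process = CrawlerProcess({})

# load the pickled url list into Myspider.crawled exactly as before,
# then pass it by its attribute name rather than positionally:
process.crawl(Myspider, crawled=Myspider.crawled)
process.start()

Keyword arguments to crawl() are forwarded to the spider's constructor, and the default Spider.__init__ copies them onto the instance, so self.crawled inside parse() will be the list that was loaded from the pickle file.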