I want to run "_SPIDER_RUNNER.py" by pressing F5 in Visual Studio Code. Everything appears to work, and the logging shows that items are being retrieved, but the output JSON file is never saved to the folder C:\scrapy\JSON_output. The folder exists, and I have write permissions.
I'm completely stuck, because no errors are logged.
I tried different paths in the file _singlepage_nonAJAX.py:
'FEED_URI': 'C:/scrapy/JSON_output/test.json'
'FEED_URI': r'C:\scrapy\JSON_output\test.json'
'FEED_URI': f'C:\\scrapy\\JSON_output\\{self.name}.json'
I also tried removing the ITEM_PIPELINES and FEED_EXPORT_FIELDS settings from settings.py.
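Side note: FEED_URI was deprecated in Scrapy 2.1 in favour of the FEEDS setting (and dropped entirely in later releases), so depending on the installed Scrapy version those FEED_URI values may only trigger a deprecation warning or be ignored altogether. A minimal FEEDS equivalent of the first path above, as a sketch (forward slashes are fine on Windows):

# Sketch: FEEDS equivalent of FEED_URI, inside the spider class definition (Scrapy >= 2.1)
custom_settings = {
    'FEEDS': {
        'C:/scrapy/JSON_output/test.json': {
            'format': 'json',
            'encoding': 'utf8',
        },
    },
}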
My folder structure looks like this:
- C:\scrapy\my_spiders\_SPIDER_RUNNER.py
- C:\scrapy\my_spiders\__init__.py
- C:\scrapy\my_spiders\spiders\__init__.py
- C:\scrapy\my_spiders\spiders\_singlepage_nonAJAX.py
None of the __init__.py files contain any code.
_SPIDER_RUNNER.py
import sys
sys.path.append('C:\\scrapy')

from scrapy.crawler import CrawlerProcess
from my_spiders.spiders._singlepage_nonAJAX import SinglePageNonAJAXSpider
import logging

logging.basicConfig(level=logging.DEBUG)

def run_spider(myname, start_urls, SERP_item, url, itemstatus, okstatus, title):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(SinglePageNonAJAXSpider,
                  myname=myname,
                  start_urls=start_urls,
                  SERP_item=SERP_item,
                  url=url,
                  itemstatus=itemstatus,
                  okstatus=okstatus,
                  title=title)
    process.start()

run_spider("toscrape",                                                      # myname
           "https://quotes.toscrape.com",                                   # start_urls
           "//div[@class='quote']/span/a[starts-with(@href, '/author/')]",  # SERP_item
           "./@href",                                                       # url
           "",                                                              # itemstatus
           "",                                                              # okstatus
           '//span[contains(@class, "author-born-date")]/text()')           # title
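A note on this runner: when CrawlerProcess is constructed with a plain dict like the one above, it does not load the project's settings.py at all, so ITEM_PIPELINES, FEED_EXPORT_FIELDS and any FEEDS entry defined there are not in effect for this run. Below is a sketch of the same runner built on the project settings instead; it assumes scrapy.cfg is discoverable from the working directory, or that the SCRAPY_SETTINGS_MODULE environment variable points at my_spiders.settings:

import sys
sys.path.append('C:\\scrapy')

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from my_spiders.spiders._singlepage_nonAJAX import SinglePageNonAJAXSpider

# Loads settings.py via scrapy.cfg / SCRAPY_SETTINGS_MODULE
settings = get_project_settings()
settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')

process = CrawlerProcess(settings)
process.crawl(SinglePageNonAJAXSpider,
              myname='toscrape',
              start_urls='https://quotes.toscrape.com')
process.start()

This would also explain why the FEEDS entry added to settings.py in Update 2 below seemed to have no effect when launching via the runner.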
_singlepage_nonAJAX.py
import json
import re
import os
import scrapy
import time
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from lxml import html

class RentalItem(scrapy.Item):
    city = scrapy.Field()
    url = scrapy.Field()

class SinglePageNonAJAXSpider(scrapy.Spider):
    name = 'whatever'

    def __init__(self, myname=None, start_urls=None, SERP_item=None, url=None,
                 itemstatus=None, okstatus=None, title=None, *args, **kwargs):
        super(SinglePageNonAJAXSpider, self).__init__(*args, **kwargs)
        if myname:
            self.name = myname
        if start_urls:
            self.start_urls = [start_urls]  # Assuming only one URL
        self.SERP_item = SERP_item
        self.url = url
        self.itemstatus = itemstatus
        self.okstatus = okstatus
        self.title = title
        # Update 2: update the FEEDS value with the modified 'name'
        self.custom_settings['FEEDS'] = {
            f'\\scrapy\\JSON_output\\{self.name}.json': {
                'format': 'json',
                'encoding': 'utf8',
                'fields': None,
                'indent': 4,
                'item_export_kwargs': {
                    'export_empty_fields': True,
                },
            },
        }

    def parse(self, response):
        for listing in response.xpath(self.SERP_item):
            listing_url = listing.xpath(self.url).get()
            yield scrapy.Request(
                url=response.urljoin(listing_url),
                callback=self.parse_object,
            )

    def parse_object(self, response):
        item = RentalItem()
        item['url'] = response.url  # get url
        item['city'] = 'mycity'
        yield item
pipelines.py
import json

class MyCustomPipeline(object):
    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        # Accumulates items in memory; note that nothing here writes them to disk
        self.items.append(dict(item))
        return item
middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter

class MySpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class MyDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
settings.py
BOT_NAME = 'my_spiders'

SPIDER_MODULES = ['my_spiders.spiders']
NEWSPIDER_MODULE = 'my_spiders.spiders'

ROBOTSTXT_OBEY = False

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
    'scrapy_selenium.SeleniumMiddleware': 800
}

from shutil import which
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

# Configure item pipelines. See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'my_spiders.pipelines.MyCustomPipeline': 300,
}

FEED_EXPORT_FIELDS = [
    'id', 'url', 'city', 'title'
]

SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
Update 2
I tried setting this in both settings.py and custom_settings, but in neither case does the output file get the name I want.
When I set this in settings.py and run _SPIDER_RUNNER.py:
FEEDS = {
    'items.json': {
        'format': 'json',
        'encoding': 'utf8',
        'fields': None,
        'indent': 4,
        'item_export_kwargs': {
            'export_empty_fields': True,
        },
    },
}
no output file is stored at all.
When I remove FEEDS from the settings and add this to the SinglePageNonAJAXSpider class instead:
custom_settings = {
    'FEEDS': {
        f'{name}.json': {  # using spider name; 'name' is the class attribute, evaluated at class-definition time
            'format': 'json',
            'encoding': 'utf8',
            'fields': None,
            'indent': 4,
            'item_export_kwargs': {
                'export_empty_fields': True,
            },
        },
    },
}
it always stores the file as whatever.json, even when I pass the desired file name "my_filename_variable" from my _SPIDER_RUNNER.py:
run_spider("my_filename_variable",                                          # myname
           "https://quotes.toscrape.com",                                   # start_urls
           "//div[@class='quote']/span/a[starts-with(@href, '/author/')]",  # SERP_item
           "./@href",                                                       # url
           "",                                                              # itemstatus
           "",                                                              # okstatus
           '//span[contains(@class, "author-born-date")]/text()')           # title
I checked the feed exports reference page, but I can't work out what I need to change.
1 Answer
1. Remove all previous FEEDS-related settings.
2. Set FEEDS either in settings.py or via the spider's custom_settings; minimal versions of both are sketched just after this list.
3. Run the spider so that it returns some items.
4. Let the run finish, or wait for it to finish.
5. Check that your data has been written to the items.json file in the root scrapy folder.
6. If that works, try the full path ('C:\scrapy\JSON_output\test.json') instead of 'items.json'; that should do the trick. If it does not, inspect the settings the spider actually runs with via self.crawler.settings (note: the crawler is only attached after __init__ returns, so log the settings from start_requests rather than inside __init__; see the note after the documentation link below).
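The two variants referred to in step 2, reconstructed here from the blocks the question quotes in Update 2. In settings.py:

FEEDS = {
    'items.json': {
        'format': 'json',
        'encoding': 'utf8',
        'fields': None,
        'indent': 4,
        'item_export_kwargs': {
            'export_empty_fields': True,
        },
    },
}

Or, equivalently, as custom_settings on the spider class:

import scrapy

class SinglePageNonAJAXSpider(scrapy.Spider):
    name = 'whatever'
    # Class-level custom_settings are picked up by Scrapy when the Crawler
    # is created, i.e. before the spider's __init__ ever runs.
    custom_settings = {
        'FEEDS': {
            'items.json': {
                'format': 'json',
                'encoding': 'utf8',
                'fields': None,
                'indent': 4,
                'item_export_kwargs': {
                    'export_empty_fields': True,
                },
            },
        },
    }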
More information on the FEED settings: https://docs.scrapy.org/en/latest/topics/feed-exports.html?highlight=FEED#settings
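Regarding Update 2 (the file name never matching): as far as I can tell, Scrapy reads custom_settings from the spider class when the Crawler is created, before __init__ runs. That is why the class-level f'{name}.json' is evaluated once at class-definition time (always giving whatever.json), and why mutating self.custom_settings inside __init__ has no effect. Feed URIs do, however, support printf-style placeholders that are resolved against the spider instance at run time, so a sketch that picks up whatever name the runner passes in:

import scrapy

class SinglePageNonAJAXSpider(scrapy.Spider):
    name = 'whatever'
    custom_settings = {
        'FEEDS': {
            # %(name)s is substituted with spider.name when the feed is opened,
            # i.e. after __init__ has already set self.name = myname.
            'C:/scrapy/JSON_output/%(name)s.json': {
                'format': 'json',
                'encoding': 'utf8',
                'indent': 4,
            },
        },
    }

To double-check what the spider actually runs with, log self.crawler.settings.getdict('FEEDS') from start_requests (not from __init__, where self.crawler is not yet attached).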