I have this problem: I have tried the pipeline approach, but I'm not sure I set it up correctly from the tutorials, since most of them pick out specific parts of response.body with selectors.
However, I can parse it in a separate script that gets me all the data I need, because the data is cluttered with other variables.
I can do this when it is a single URL, but when I introduce different URLs it overwrites the final parse. I believe there may be a simpler solution that skips pipelines/items.py, since all I need is response.body.
Please excuse the indentation in the code, as it is hard to copy over.
linkarr = df['URLOUT'].tolist()
today = datetime.today().strftime('%Y%m%d')

class MpvticketSpider(scrapy.Spider):
    name = 'mpvticket'
    start_urls = url
    handle_httpstatus_list = [403, 502, 503, 404]

    def start_requests(self):
        for url in linkarr:
            eventid = str(url).strip().split("pid=")[1].split("&")[0]
            filename_xml = str(eventid) + "_" + str(today) + ".xml"
            filename_txt = str(eventid) + "_" + str(today) + ".txt"
            print("\n FIRST URL BEING RUN: ", url)
            pid = str(url).split("pid=")[1].split('&')[0]
            username = 'XXXX'
            password = 'XXXX'
            port = 22225
            session_id = random.random()
            super_proxy_url = ('http://%s-country-us-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                               (username, session_id, password, port))
            headers = {
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'accept-language': 'en-US,en;q=0.9',
                'cache-control': 'max-age=0',
                'referer': 'https://www.mlb.com/',
                'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            }
            yield scrapy.Request(url, callback=self.parse_api, meta={'proxy': super_proxy_url}, headers=headers)

    def parse_api(self, response):
        item = TicketsItem()
        raw_data = response.body
        soup = BeautifulSoup(raw_data, 'lxml')
        item['data'] = soup
        yield item
        # Commented portion was the original method. But overwrote my Output.xml
        # try:
        #     with open(filename_xml, "w") as f:
        #         f.write(str(soup))
        # except:
        #     with open(filename_txt, 'w') as f:
        #         f.write(str(soup))

if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(MpvticketSpider)
    process.start()
UPDATE:
Imports
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from sys import path
from scrapy.loader import itemloaders
path.append(r'D:\Projects\tickets')
from tickets.items import TicketsItem
class MpvticketSpider(scrapy.Spider):
    name = 'mpvticket'
    handle_httpstatus_list = [403, 502, 503, 404]

    def start_requests(self):
        # for url in linkarr:
        url = 'https://mpv.tickets.com/api/pvodc/v1/events/navmap/availability/?pid=9016692&agency=MLB_MPV&orgId=10&supportsVoucherRedemption=true'
        print("\n FIRST URL BEING RUN: ", url)
        username = 'XXXX'
        password = 'XXXX'
        port = 22225
        session_id = random.random()
        super_proxy_url = ('http://%s-country-us-session-%s:%s@zproxy.lum-superproxy.io:%d' %
                           (username, session_id, password, port))
        headers = {
            # headers
        }
        yield scrapy.Request(url, callback=self.parse_api, meta={'proxy': super_proxy_url})

    def parse_api(self, response):
        url = response.url
        eventid = str(url).strip().split("pid=")[1].split("&")[0]
        filename_xml = str(eventid) + "_" + str(today) + ".xml"
        data = response.xpath("//body")
        item = TicketsItem()
        item['data'] = data
        item['filename_xml'] = filename_xml
        yield item
Pipelines.py
from re import I
from itemadapter import ItemAdapter
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline

class TicketsPipeline:
    def process_item(self, item, spider):
        for filename in item['filename_xml']:
            with open(filename, "w") as fd:
                fd.write(item['data'])
        raise DropItem
items.py
import scrapy
from scrapy.loader import itemloaders
from itemloaders.processors import MapCompose

class TicketsItem(scrapy.Item):
    filename_xml = scrapy.Field()
    data = scrapy.Field()
Not sure what is wrong, but I am now getting the following error:
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 206, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 210, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1905, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1815, in _cancellableInlineCallbacks
    _inlineCallbacks(None, gen, status)
--- <exception caught here> ---
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\twisted\internet\defer.py", line 1660, in _inlineCallbacks
    result = current_context.run(gen.send, result)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 102, in crawl
    self.engine = self._create_engine()
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\crawler.py", line 116, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\engine.py", line 84, in __init__
    self.scraper = Scraper(crawler)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\scraper.py", line 75, in __init__
    self.itemproc = itemproc_cls.from_crawler(crawler)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\middleware.py", line 59, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\middleware.py", line 40, in from_settings
    mwcls = load_object(clspath)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\misc.py", line 61, in load_object
    mod = import_module(module)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "D:\Projects\tickets\tickets\pipelines.py", line 15, in <module>
    class TicketsPipeline:
  File "D:\Projects\tickets\tickets\pipelines.py", line 22, in TicketsPipeline
    raise DropItem
scrapy.exceptions.DropItem:
1 Answer
You should move the logic that builds the filename/output path into the parse method and add it as a field on the yielded item. Then, in the item pipeline, you can save the body to that output path and drop the item, since no further processing is needed at that point.
So change your parse method to something like this:
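A sketch along those lines (it assumes `today` and `TicketsItem` stay defined as in the question, and it stores `response.text`, the full body as a string, rather than an XPath selector):

def parse_api(self, response):
    # Build the per-event output filename from the pid in the URL, as before
    eventid = response.url.split("pid=")[1].split("&")[0]
    filename_xml = str(eventid) + "_" + str(today) + ".xml"
    item = TicketsItem()
    item['data'] = response.text          # raw body as plain text
    item['filename_xml'] = filename_xml   # output path travels with the item
    yield item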
You will need to change your item to something like the following:
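Something like this is enough; the item only needs the two fields the pipeline reads (essentially what you already have, minus the unused loader imports):

import scrapy

class TicketsItem(scrapy.Item):
    filename_xml = scrapy.Field()   # where the pipeline should write the body
    data = scrapy.Field()           # the response body itself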
Then your item pipeline could look like this:
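A minimal sketch of the pipeline: it uses `item['filename_xml']` directly (not iterated over, which would loop character by character), writes the body to that file, and raises DropItem inside process_item because nothing else needs to handle the item:

from scrapy.exceptions import DropItem

class TicketsPipeline:
    def process_item(self, item, spider):
        # Save the body to the filename carried by the item
        with open(item['filename_xml'], "w", encoding="utf-8") as fd:
            fd.write(item['data'])
        # No further processing is needed, so drop the item here
        raise DropItem("saved to " + item['filename_xml'])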