我想抓取多个URL的数据并提取所有信息。但当urls列表中有超过1个URL时就会报错（list index out of range），目前只能成功抓取1个URL。另外有人建议把yield的内容拆分成几个不同的变量——这种写法的语法应该是什么样的？
import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider that scrapes metadata from legal-document detail pages
    (jdih.kaltimprov.go.id produk_hukum pages)."""

    name = "quotes"

    def start_requests(self):
        # List every detail-page URL to crawl here; each one is fetched
        # independently and handed to parse().
        urls = [
            # 'https://jdih.kaltimprov.go.id/produk_hukum/detail/9ef7f994-9db4'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract one item dict per detail page.

        Runs each XPath ONCE with .getall() and indexes the resulting
        Python lists defensively, so pages that expose fewer matching
        nodes yield '' for the missing fields instead of raising
        IndexError (the 'list index out of range' crash that killed
        the multi-URL crawl).
        """
        cells = response.xpath('//*[@class="text-left"]/text()').getall()
        paras = response.xpath('//*[@class="text-left"]/p/text()').getall()

        def at(seq, i, default=''):
            # Safe positional lookup: missing node -> default, never IndexError.
            return seq[i] if i < len(seq) else default

        yield {
            'Kategori': at(cells, 0),
            'Nomor': at(cells, 1),
            'Judul': at(cells, 2).strip(),
            'Tanggal Diterapkan': at(cells, 3),
            'Tanggal Diundangkan': at(cells, 4),
            'Keterangan Status': at(paras, 0),
            'Statistik View': at(cells, 5),
            'Statistik Download': at(cells, 6),
            # BUG FIX: original xpath ended in '/text' (no parentheses),
            # which selects <text> elements and therefore never matched.
            'Katalog': response.xpath(
                '//*[@class="text-left"]/p/span/text()'
            ).getall(),
            'Abstraksi': at(paras, 1),
            'Lampiran': response.css(
                'body > section > div > div > div > div.row '
                '> div.col-3 > a::attr(href)'
            ).getall(),
        }
报错信息如下：
File "C:\Users\Prihantoro Tri N\OneDrive\Documents\file toro\MSIB\Magang\Hukumonline\Project\list_url\test1.py", line 28, in parse
'kategori' : response.css('body > section > div > div > div > div.row > div.col-9 > table > tr > td::text')[0].extract(),
File "C:\Users\Prihantoro Tri N\AppData\Local\Programs\Python\Python310\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
2022-03-24 15:11:01 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-24 15:11:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 380,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 1761,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.888989,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 3, 24, 8, 11, 1, 373605),
'httpcompression/response_bytes': 1606,
'httpcompression/response_count': 1,
'log_count/DEBUG': 10,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/IndexError': 1,
'start_time': datetime.datetime(2022, 3, 24, 8, 11, 0, 484616)}
2022-03-24 15:11:01 [scrapy.core.engine] INFO: Spider closed (finished)
1条答案
按热度按时间0h4hbjxa1#
您是否尝试过为该爬虫使用分页？如果是，请查看下面的代码片段，它来自Scrapy官方文档。