The data gets overwritten, and I am only left with the last page of results. How can I solve this? If there is any solution, please recommend it to me. I have seen several examples of scraping multiple pages from one website, but I could not make them work with my code. This is the page link: https://www.benrishi-navi.com/english/english1_2.php
import scrapy
from scrapy import FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    name = 'test'
    url = 'https://www.benrishi-navi.com/english/english1_2.php'

    k = 1
    u = 10
    for n in range(5):  # runs once, at class-definition time, not per request
        k += 10
        u += 10
        # payload is reassigned on every pass, so only the last page's
        # start_count/search_default values survive
        payload = 'tuusan_year=&tuusan_month=&tuusan_chk=&methodAndOr1=&methodAndOr2=&methodAndOr3=&text_sen=&text_skill=&text_business=&tokkyo_data=&fuki_day_chk=&shuju=&kensyuu_bunya=&text_kensyuu=&methodAndOr_kensyuu=&keitai_kikan=&keitai_hisu=&display_flag=1&search=2&text=&method=&methodAndOr=&area=&pref=&name=&kana=&id=&year=&month=&day=&day_chk=&exp01=&exp02=&exp03=&trip=&venture_support=&venture_flag=&university_support=&university_flag=&university1=&university2=&university=&college=&high_pref=&junior_pref=&elementary_pref=&tyosaku=&hp=&jukoureki=&experience1=&experience2=&experience3=&experience4=&sort=&fuki_year=&fuki_month=&fuki_day=&fuki_day_chk=&id_chk=&shugyou=&fuki=&address1=&address2=&trip_pref=&expref=&office=&max_count=1438&search_count=10&start_count=' + str(k) + '&search_default=' + str(u)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'CAKEPHP=u6u40lefkqnm45j49a5i0h6bs3; __utma=42336182.871903078.1657200864.1657200864.1657200864.1; __utmz=42336182.1657200864.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        'Origin': 'https://www.benrishi-navi.com',
        'Referer': 'https://www.benrishi-navi.com/english/english1_2.php',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    def start_requests(self):
        yield scrapy.FormRequest(
            url=self.url,
            method='POST',
            body=self.payload,
            headers=self.headers,
            callback=self.parse_item,
        )
    def parse_item(self, response):
        base_url = "https://www.benrishi-navi.com/english/"
        links = response.xpath("//table[4]//tr")
        for link in links[1:]:
            t = link.xpath("//form//@action").get()
            u = link.xpath(".//input[@name='serial']//@value").get()
            product = base_url + t + "?serial=" + u + "&office_serial=&submit2=Details"
            yield Request(product, callback=self.parse_book)
    def parse_book(self, response):
        name = response.xpath("normalize-space(//td[text()[contains(.,'Name')]]/following-sibling::td//text())").get()
        telephone = response.xpath("normalize-space(//td[text()[contains(.,'TEL')]]/following-sibling::td//text())").get()
        fax = response.xpath("normalize-space(//td[text()[contains(.,'FAX')]]/following-sibling::td//text())").get()
        email = response.xpath("normalize-space(//td[text()[contains(.,'Email')]]/following-sibling::td//text())").get()
        website = response.xpath("//td[text()[contains(.,'Website')]]/following-sibling::td//a[starts-with(@href, 'http')]/@href").get()
        registration_date = response.xpath("normalize-space(//td[text()[contains(.,'Registration date')]]/following-sibling::td//text())").get()
        firm = response.xpath("normalize-space(//td[text()[contains(.,'Firm Name')]]/following-sibling::td//text())").get()
        address = response.xpath("normalize-space(//td[text()[contains(.,'Address (Prefecture)')]]/following-sibling::td//text())").get()
        spec = response.xpath("normalize-space(//td[text()[contains(.,'Specialization')]]/following-sibling::td//text())").get().replace(" |", "|")
        tech = response.xpath("normalize-space(//td[text()[contains(.,'Technical field')]]/following-sibling::td//text())").get().replace(" |", "|")
        yield {
            "name": name,
            "Telephone": telephone,
            "Fax": fax,
            "Email": email,
            "website": website,
            "Registration_date": registration_date,
            "Firm_name": firm,
            "Address": address,
            "Specialization": spec,
            "Technical_field": tech,
        }
1 Answer

cwdobuhd #1
To get all of the results in a single POST request, do the following:
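A minimal sketch of that idea, assuming the server honors a search_count equal to max_count; the parameter names come from the question's payload, but the chosen values (1438 rows in one page, starting at row 1) are an assumption rather than verified site behavior:

import scrapy

class AllResultsSpider(scrapy.Spider):
    name = 'benrishi_all'
    url = 'https://www.benrishi-navi.com/english/english1_2.php'

    def start_requests(self):
        # Assumption: requesting max_count rows as one "page" returns every
        # result in a single response, so no pagination loop is needed.
        formdata = {
            'display_flag': '1',
            'search': '2',
            'max_count': '1438',
            'search_count': '1438',    # page size; the site's default is 10
            'start_count': '1',        # first row of the single page
            'search_default': '1438',  # assumed to mirror search_count
        }
        yield scrapy.FormRequest(self.url, formdata=formdata,
                                 callback=self.parse_item)

    def parse_item(self, response):
        # The row and detail-page parsing from the question can be reused
        # here unchanged; this just confirms how many rows came back.
        self.logger.info('rows found: %d',
                         len(response.xpath('//table[4]//tr')) - 1)

scrapy.FormRequest url-encodes formdata and sets the Content-Type header itself, so the hand-built body string from the question is not needed.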
Then you only need to clean up the data output and you are done.
The script prints the results (output truncated for SO; it contains everything of interest).
You may also be able to tweak the POST request a little to get the hidden output as well. Good luck.
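For reference, the overwriting described in the question comes from building payload in a class-level loop: that loop runs once, when the class is defined, and reassigns payload on every pass, so the spider only ever sends the last page's request. If the single-request trick turns out to be capped server-side, the usual paginated alternative is to move the loop into start_requests and yield one FormRequest per page. The sketch below would replace start_requests in the question's TestSpider; payload_prefix is a hypothetical class attribute holding the long field string up through '&start_count=', and holding search_default at the page size is an assumption about what that field means:

    def start_requests(self):
        # One POST per page: start_count advances by the page size of 10,
        # and each request is yielded instead of overwriting an attribute.
        for page in range(5):
            start = 1 + page * 10
            body = self.payload_prefix + str(start) + '&search_default=10'
            yield scrapy.FormRequest(
                url=self.url,
                method='POST',
                body=body,
                headers=self.headers,
                callback=self.parse_item,
            )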