Scrapy 抓取多个页面,但数据被覆盖

vnjpjtjt  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(133)

数据被覆盖了,最终只得到最后一页的数据。该如何解决这个问题?我看过几种从同一个网站抓取多个页面的解决方案,但都无法在我的代码中奏效。页面链接:https://www.benrishi-navi.com/english/english1_2.php

import scrapy
from scrapy import FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    """Scrape patent-attorney detail pages from the benrishi-navi search.

    The search results are paginated through POST form fields
    (``start_count`` / ``search_default``).  One POST is issued per result
    page; every result row is followed to its detail page, from which a
    single item dict is yielded.

    The previous version built the payload in a class-body ``for`` loop, so
    ``payload`` was rebound on each pass and only the last value survived;
    the methods were also (re)defined inside that loop.  Requests are now
    generated per page inside ``start_requests``.
    """

    name = 'test'
    url = 'https://www.benrishi-navi.com/english/english1_2.php'

    # Paging counters.  The old loop started from 1 / 10 but added 10 to both
    # *before* the first use, so the first page was never built and the last
    # one used 51 / 60.  The defaults below cover start_count 1, 11, ..., 51.
    first_start_count = 1
    first_search_default = 10
    page_step = 10
    page_count = 6

    # Form body shared by every page; only the two trailing fields vary.
    # The literal contains no braces, so ``str.format`` is safe here.
    payload_template = 'tuusan_year=&tuusan_month=&tuusan_chk=&methodAndOr1=&methodAndOr2=&methodAndOr3=&text_sen=&text_skill=&text_business=&tokkyo_data=&fuki_day_chk=&shuju=&kensyuu_bunya=&text_kensyuu=&methodAndOr_kensyuu=&keitai_kikan=&keitai_hisu=&display_flag=1&search=2&text=&method=&methodAndOr=&area=&pref=&name=&kana=&id=&year=&month=&day=&day_chk=&exp01=&exp02=&exp03=&trip=&venture_support=&venture_flag=&university_support=&university_flag=&university1=&university2=&university=&college=&high_pref=&junior_pref=&elementary_pref=&tyosaku=&hp=&jukoureki=&experience1=&experience2=&experience3=&experience4=&sort=&fuki_year=&fuki_month=&fuki_day=&fuki_day_chk=&id_chk=&shugyou=&fuki=&address1=&address2=&trip_pref=&expref=&office=&max_count=1438&search_count=10&start_count={start_count}&search_default={search_default}'

    # Browser-like headers copied from a captured request.
    # NOTE(review): the Cookie value is a captured session id and has
    # presumably expired -- confirm whether the site needs it at all.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'CAKEPHP=u6u40lefkqnm45j49a5i0h6bs3; __utma=42336182.871903078.1657200864.1657200864.1657200864.1; __utmz=42336182.1657200864.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        'Origin': 'https://www.benrishi-navi.com',
        'Referer': 'https://www.benrishi-navi.com/english/english1_2.php',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self):
        """Yield one POST request per result page.

        Each request carries its own body, so no page overwrites another.
        """
        for page in range(self.page_count):
            offset = page * self.page_step
            body = self.payload_template.format(
                start_count=self.first_start_count + offset,
                search_default=self.first_search_default + offset,
            )
            yield scrapy.FormRequest(
                url=self.url,
                method='POST',
                body=body,
                headers=self.headers,
                callback=self.parse_item,
            )

    def parse_item(self, response):
        """Follow every result row of a listing page to its detail page.

        Rows without a ``serial`` input are skipped instead of raising
        ``TypeError`` while the URL is being concatenated.
        """
        base_url = "https://www.benrishi-navi.com/english/"

        # The XPath is absolute, so it yields the same value for every row;
        # evaluate it once per page instead of once per row.
        action = response.xpath("//form//@action").get()
        if action is None:
            self.logger.warning("No form action found on %s", response.url)
            return

        # The first row of the table is skipped, as in the original code.
        for row in response.xpath("//table[4]//tr")[1:]:
            serial = row.xpath(".//input[@name='serial']//@value").get()
            if serial is None:
                continue
            detail_url = (
                base_url + action + "?serial=" + serial
                + "&office_serial=&submit2=Details"
            )
            yield Request(detail_url, callback=self.parse_book)

    @staticmethod
    def _cell_text(response, label):
        """Return the whitespace-normalised text of the cell after *label*.

        Looks for a ``<td>`` whose text contains *label* and reads the text
        of its following sibling ``<td>``; returns ``""`` when nothing
        matches.
        """
        query = (
            "normalize-space(//td[text()[contains(.,'%s')]]"
            "/following-sibling::td//text())" % label
        )
        return response.xpath(query).get(default="")

    def parse_book(self, response):
        """Extract one attorney record from a detail page and yield it."""
        cell = self._cell_text

        website = response.xpath(
            "//td[text()[contains(.,'Website')]]/following-sibling::td"
            "//a[starts-with(@href, 'http')]/@href"
        ).get()

        yield {
            "name": cell(response, 'Name'),
            "Telephone": cell(response, 'TEL'),
            "Fax": cell(response, 'FAX'),
            "Email": cell(response, 'Email'),
            "website": website,
            "Registration_date": cell(response, 'Registration date'),
            "Firm_name": cell(response, 'Firm Name'),
            "Address": cell(response, 'Address (Prefecture)'),
            # Drop the space that precedes each "|" separator.
            "Specialization": cell(response, 'Specialization').replace(" |", "|"),
            "Technical_field": cell(response, 'Technical field').replace(" |", "|"),
        }
cwdobuhd

cwdobuhd1#

要在单个发布请求中获取所有结果,请执行以下操作:

import requests

# Fetch the search results with a single POST using `requests`.
# No cookies or extra headers are sent: both mappings are left empty.
cookies = {}
headers = {}
# Form fields submitted to the search page.  Most filters are empty strings.
data = {
    'submit2': 'Display',
    # Set above the 1438 matches reported in 'max_count'; the accompanying
    # answer uses this to get every result back in one response (its sample
    # output reads "Displaying results 1 -1438 of 1438").
    # NOTE(review): exact server-side meaning of this field is not shown
    # here -- confirm.
    'search_default': '1500',
    'sort': '',
    'tuusan_year': '',
    'tuusan_month': '',
    'tuusan_chk': '',
    'methodAndOr1': '',
    'methodAndOr2': '',
    'methodAndOr3': '',
    'text_sen': '',
    'text_skill': '',
    'text_business': '',
    'tokkyo_data': '',
    'fuki_day_chk': '',
    'shuju': '',
    'kensyuu_bunya': '',
    'text_kensyuu': '',
    'methodAndOr_kensyuu': '',
    'keitai_kikan': '',
    'keitai_hisu': '',
    'search': '2',
    'area': '',
    'pref': '',
    'name': '',
    'kana': '',
    'id': '',
    'year': '',
    'month': '',
    'day': '',
    'address1': '',
    'address2': '',
    'trip_pref': '',
    'day_chk': '',
    'exp01': '',
    'exp02': '',
    'exp03': '',
    'trip': '',
    'venture_support': '',
    'venture_flag': '',
    'university_support': '',
    'university_flag': '',
    'university': '',
    'college': '',
    'high_pref': '',
    'junior_pref': '',
    'elementary_pref': '',
    'tyosaku': '',
    'hp': '',
    'jukoureki': '',
    'experience1': '',
    'experience2': '',
    'experience3': '',
    'experience4': '',
    'fuki_year': '',
    'fuki_month': '',
    'fuki_day': '',
    'display_flag': '1',
    'id_chk': '',
    'shugyou': '',
    'fuki': '',
    'expref': '',
    'office': '',
    # Paging-related fields as captured from the site's form.
    'start_count': '1',
    'end_count': '10',
    'max_count': '1438',
    'search_count': '10',
    's': '1',
    'shozaichi': '',
}

# Passing a dict as `data=` makes requests send it form-encoded.
response = requests.post('https://www.benrishi-navi.com/english/english1_2.php', cookies=cookies, headers=headers, data=data)
# Bare expression: it only displays the HTML in an interactive session or
# notebook.  In a script, print or parse `response.text` instead.
response.text

之后只需要清理输出的数据,就完成了。
打印结果(为适应 SO 的篇幅已截断,完整的输出会包含所有需要的内容):

'\ufeff<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n<html lang="ja">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<title>Japan Patent Attorney Search</title>\n\n<link rel="stylesheet" href="../01.css" type="text/css">\n<script language="JavaScript" src="../js/win.js"></script>\n<script language="JavaScript" type="text/JavaScript">\n<!--\nfunction kensakuClick(){//検索\n  document.form.action=\'./english1_2.php\';\n  document.form.submit();\n}\nfunction siborikomiClick(){//更に絞り込む\n  document.form.action=\'../english/english1.php\';\n  document.form.submit();\n}\n//-->\n</script>\n<style type="text/css">\n<!--\n.style3 {color: #FFFFFF}\n-->\n</style>\n</head>\n<body>\n<table width="750" border="0" cellspacing="0" cellpadding="0">\n  <tr>\n    <td width="750"><div align="center"><img src="../images/head2.gif" alt="日本弁理士会" width="750" height="12"><br>\n\n     <!--パンくずリスト-->\n     <table width="690" border="0" align="center" cellpadding="0" cellspacing="0">\n        <tr height="25">\n          <td width="670" height="25"><a href="/" class="fontsizeset" >Home(Japanese)</a>\u3000&gt;\n      <a href="./english1.php?search=2" class="fontsizeset" >Japan Patent Attorney Search</a>\n    \u3000</td>\n        </tr>\n        <tr>\n          <td width="670"></td>\n        </tr>\n      </table>\n\n        <!--「マルチ検索(弁理士)」タブ-->\n        <!--<table width="690" border="0" cellspacing="0" cellpadding="0">\n          <tr>\n            <td><img src="images/tab.gif" alt="マルチ検索(弁理士)" width="230" height="48" border="0"></td>\n          </tr>\n        </table>-->\n        <table width="690" border="0" align="center" cellpadding="20" cellspacing="4" bgcolor="#ffa48c">\n          <tr>\n            <td valign="top" width="642" height="487" bgcolor="#ffede3"><div align="center">\n                                        <table width="608" border="0" cellspacing="2" cellpadding="0">\n                  <tr height="29">\n                    <td 
width="596" height="29"><span class="f14"><strong>■Search results</strong></span></td>\n                  </tr>\n                </table>\n                <br>\n                <table width="608" border="0" cellspacing="1" cellpadding="10" bgcolor="#a8a8a8">\n                  <tr height="12">\n                    <td bgcolor="white" height="12"><table width="100%">\n                        <tr>\n                          <td>\n                            1438                            matches found using these search parameters</td>\n                        </tr>\n                    </table></td>\n                  </tr>\n                </table>\n                 <form method="POST" name="form">\n                  <table width="608" border="0" cellspacing="2" cellpadding="0">\n                    <!--1行目-->\n                    <tr height="20">\n                      <td height="20">\n                        <table width="100%">\n                          <tr>\n                            <!--「~件中~件表示」-->\n                                                         <td width="320">Displaying results 1 -1438 of 1438                            </td>\n                            <td width="60"></td>\n                            <td width="200"></td>\n                          </tr>\n                        </table>\n                      </td>\n                    </tr>\n\n                    <!--2行目-->\n                    <tr height="20">\n                      <td height="20">\n                        <table width="100%">\n

你也许还可以稍微调整一下 POST 请求,从而获取隐藏的输出。祝你好运。

相关问题