Scrapy 报错:HTTP 状态码未处理或不被允许(HTTP status code is not handled or not allowed)

q5iwbnjs  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(149)

我试图从一个 JSON 接口获取数据,但请求返回了错误:HTTP status code is not handled or not allowed。在 Scrapy 中应该如何处理这类错误?出现这类错误的原因是什么——是因为发出的请求太多吗?为什么会显示这些错误?页面链接如下:https://www.nationalhardwareshow.com/en-us/attend/exhibitor-list.html

import scrapy
from scrapy import FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    """Spider that sends a single form-encoded POST to the Algolia query URL.

    Every entry of ``params`` (the ``x-algolia-*`` values as well as the
    search ``params`` string) is passed as ``formdata`` and therefore ends up
    in the request body.
    """

    name = 'test'
    url = "https://xd0u5m6y4r-dsn.algolia.net/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query"

    # Browser-like headers attached to the request.
    headers = {
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
        'Connection': 'keep-alive',
        'Origin': 'https://www.nationalhardwareshow.com',
        'Referer': 'https://www.nationalhardwareshow.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # Form fields submitted with the POST.
    params = {
        'x-algolia-agent': 'Algolia for vanilla JavaScript 3.27.1',
        'x-algolia-application-id': 'XD0U5M6Y4R',
        'x-algolia-api-key': 'd5cd7d4ec26134ff4a34d736a7f9ad47',
        'params': 'query=&page=0&facetFilters=&optionalFilters=%5B%5D',
    }

    def start_requests(self):
        """Yield the one form POST whose response is handed to ``parse``."""
        form_post = scrapy.FormRequest(
            url=self.url,
            method='POST',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )
        yield form_post

    def parse(self, response):
        """Print the JSON-decoded body of ``response``."""
        decoded = response.json()
        print(decoded)
    # NOTE(review): everything from here down to the end of this listing is an
    # indented repeat of the spider shown just before it -- presumably a
    # copy/paste or page-scrape artifact; confirm before relying on it.
    import scrapy
    from scrapy import FormRequest
    from scrapy.crawler import CrawlerProcess
    from scrapy.http import Request

    # Spider that sends one form-encoded POST to the Algolia query URL and
    # prints the decoded JSON response.
    class TestSpider(scrapy.Spider):
        name = 'test'
        # Algolia query URL for the exhibitor index.
        url="https://xd0u5m6y4r-dsn.algolia.net/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query"
        # Browser-like headers attached to the request.
        headers = {
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
        'Connection': 'keep-alive',
        'Origin': 'https://www.nationalhardwareshow.com',
        'Referer': 'https://www.nationalhardwareshow.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        }

        # Fields passed as ``formdata`` below, so they are form-encoded into the
        # POST body.
        # NOTE(review): the x-algolia-* values travel in the body here rather
        # than in the query string -- possibly why the server answers with a
        # status Scrapy does not handle; confirm against the Algolia API docs.
        params = {
            'x-algolia-agent': 'Algolia for vanilla JavaScript 3.27.1',
            'x-algolia-application-id': 'XD0U5M6Y4R',
            'x-algolia-api-key': 'd5cd7d4ec26134ff4a34d736a7f9ad47',
            'params':'query=&page=0&facetFilters=&optionalFilters=%5B%5D',
        }

        # Yield the single POST request; its response is handed to ``parse``.
        def start_requests(self):
            yield scrapy.FormRequest(
                url=self.url,
                method='POST',
                headers=self.headers,
                formdata=self.params,
                callback=self.parse,
            )

        # Print the JSON-decoded response body.
        def parse(self,response):
            print(response.json())
r1zk6ea1

r1zk6ea11#

你之所以得到 HTTP status code is not handled or not allowed,是因为请求头(headers)和参数太多了。

import scrapy
import json
from scrapy.crawler import CrawlerProcess
class TestSpider(scrapy.Spider):
    """Spider that POSTs one query to the Algolia index and yields company names."""

    name = 'test'

    # Per-spider settings: one concurrent request per domain, with a download
    # delay of 1 between requests.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
    }

    def start_requests(self):
        """Yield the single POST request; its response goes to ``parse``.

        The ``x-algolia-*`` values are carried in the URL query string and the
        search parameters are sent as a JSON-encoded body.
        """
        # Adjacent literals are concatenated at compile time into one URL.
        endpoint = (
            'https://xd0u5m6y4r-2.algolianet.com/1/indexes/'
            'event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query'
            '?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1'
            '&x-algolia-application-id=XD0U5M6Y4R'
            '&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47'
        )
        request_headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        }
        payload = json.dumps(
            {"params": "query=&page=0&facetFilters=&optionalFilters=%5B%5D"}
        )
        yield scrapy.Request(
            url=endpoint,
            method='POST',
            headers=request_headers,
            body=payload,
            callback=self.parse,
        )

    def parse(self, response):
        """Yield ``{'Title': <companyName>}`` for every entry under ``hits``."""
        hits = json.loads(response.body)['hits']
        yield from ({'Title': hit['companyName']} for hit in hits)

if __name__ == "__main__":
    # CrawlerProcess takes optional *settings*, not a spider; the spider class
    # is what gets passed to crawl(). The previous form handed the class to the
    # constructor and called crawl() with no spider, so the script failed
    # before any request was sent.
    process = CrawlerProcess()
    process.crawl(TestSpider)
    # start() runs the Twisted reactor and blocks until the crawl finishes.
    process.start()

输出:

{'Title': 'Bug Bite Thing'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'BULA'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Bunnik Creations'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'McCordick Glove & Safety Inc'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Burro Creative Solutions'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Bytech/Case Logic USA'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Cable Lasso'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Caframo Ltd'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'California Air Tools'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Calloway Mills/Home and More'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Camp Chef'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Canadian Spa Company'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'CAPS-LOCK'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Carson LLC'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Cascade Holdings'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Catania Oils'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'CCH Products'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'CedarCraft'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': "Central Garden & Pet/Pennington/Howard Johnson's Enterprises"}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Centrex Plastic LLC./American Plastics'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Chaby International'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Changzhou Feiwang Tool Co.,Ltd.'}
2022-07-10 04:59:07 [scrapy.core.engine] INFO: Closing spider (finished)
2022-07-10 04:59:07 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 643,
 'downloader/request_count': 1,
 'downloader/request_method_count/POST': 1,
 'downloader/response_bytes': 117197,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 3.180524,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 7, 9, 22, 59, 7, 202813),
 'httpcompression/response_bytes': 765918,
 'httpcompression/response_count': 1,
 'item_scraped_count': 100,

相关问题