scrapy 使用FormRequest通过HTTP POST提取数据

axzmvihb  于 2023-05-29  发布在  其他
关注(0)|答案(1)|浏览(134)

使用FormRequest通过HTTP POST提取数据
嘿,伙计们,我想用 Scrapy 抓取网站 https://bitsclassic.com/fa/ 上所有产品的详细信息。要获得产品的 URL,我必须向 Web 服务 https://bitsclassic.com/fa/Product/ProductList 发送一个 POST 请求。我这样做了,但它没有任何输出!应该如何发送这个 POST 请求?

class BitsclassicSpider(scrapy.Spider):
    """Spider that walks the category links of bitsclassic.com and POSTs to
    the ProductList endpoint for each category.

    Flow: ``parse`` (start page) -> ``parse_category`` (one request per
    category link) -> ``parse_page`` (response of the POST request).
    """
    name = "bitsclassic"
    start_urls = ['https://bitsclassic.com/fa']

    def parse(self, response):
        """
        Default callback, run on the response for each URL in ``start_urls``.

        Collects the ``href`` of every ``<a>`` under ``ul.children`` and
        yields one GET request per link, handled by ``parse_category``.
        """
        # [1:] drops the first matched link; the reason is not visible here.
        category_urls = response.css('ul.children a::attr(href)').getall()[1:]
        for category_url in category_urls:
            # NOTE(review): scrapy.Request needs an absolute URL; this assumes
            # the hrefs on the page are absolute -- confirm.
            yield scrapy.Request(category_url, callback=self.parse_category)

    def parse_category(self, response):
        """
        Callback for a category page.

        Derives the category id from the page URL and yields a single POST
        request to the ProductList endpoint, handled by ``parse_page``.
        """
        # The id is the run of digits between a "/" and a "-" in the URL.
        # re.search returns None when the URL has no such segment, in which
        # case .group(1) raises AttributeError.
        category_id = re.search(r"/(\d+)-", response.url).group(1)
        num_products = 1000

        # Create the form data for the POST request
        form_data = {
            'Cats': str(category_id),
            'Size': str(num_products)
        }

        # Send a POST request to retrieve the product list
        yield FormRequest(
            url='https://bitsclassic.com/fa/Product/ProductList',
            method='POST',
            formdata=form_data,
            callback=self.parse_page
        )

    def parse_page(self, response):
        """
        Callback for the ProductList POST response.

        Runs selectors against ``response`` and yields exactly one
        ``BitsclassicItem``, however many products the response holds.

        NOTE(review): the ProductList endpoint reportedly returns JSON that
        wraps an HTML fragment rather than a full HTML page. If so, the
        selectors below match nothing -- confirm against a live response.
        """
        # Extract data from the response using XPath or CSS selectors
        # NOTE(review): ``itemrolep`` looks like a typo for ``itemrole`` --
        # confirm against the page markup.
        title = response.css('p[itemrolep="name"]::text').get()
        # ``url`` is assigned but never read; the item uses response.url below.
        url = response.url
        categories = response.xpath('//div[@class="con-main"]//a/text()').getall()
        price = response.xpath('//div[@id="priceBox"]//span[@data-role="price"]/text()').get()

        # Process the extracted data
        # A missing price element is treated as "product not available".
        if price is not None:
            price = price.strip()
            product_exist = True
        else:
            price = None
            product_exist = False

        # Create a new item with the extracted data
        item = BitsclassicItem()
        # .get() returns None when the selector matches nothing, so this
        # .strip() raises AttributeError in that case.
        item["title"] = title.strip()
        # Keeps only the entries between the first three and the last link
        # text; the meaning of the dropped entries is not visible here.
        item["categories"] = categories[3:-1]
        item["product_exist"] = product_exist
        item["price"] = price
        # This is the URL of the POST endpoint, not of an individual product.
        item["url"] = response.url
        item["domain"] = "bitsclassic.com/fa"

        # Yield the item to pass it to the next pipeline stage for further processing
        yield item

我不确定我发送请求的方式是否正确?

enyaitl3

enyaitl31#

这个请求本身没有问题。
不过你的代码还有几个其他问题。
1. 你从表单请求中得到的响应是 JSON 响应,需要按 JSON 来处理它,而不是当作 HTML 响应。
2. 你只提取了每一页的第一个项目。可以使用 for 循环遍历该页的所有项目。
3. 还有一些地方可以改进你的代码,我已经做了其中的一部分。

import scrapy
from scrapy import FormRequest
from scrapy.http import HtmlResponse

class BitsclassicSpider(scrapy.Spider):
    """Scrape the product listings of every category on bitsclassic.com.

    Flow: ``parse`` reads the category links from the start page,
    ``parse_category`` issues the first POST to the ProductList endpoint for
    a category, and ``parse_page`` extracts one item per product and requests
    the next page.

    ``BitsclassicItem`` is not defined in this snippet; it is expected to be
    imported from the project's items module.
    """

    name = "bitsclassic"
    start_urls = ['https://bitsclassic.com/fa']

    # Endpoint that returns one page of a category's product listing.
    product_list_url = 'https://bitsclassic.com/fa/Product/ProductList'
    # Value sent as the ``Size`` form field (products requested per page).
    page_size = 12

    def parse(self, response):
        """
        Default callback, run on the response for each URL in ``start_urls``.

        Yields one request per category link found under ``ul.children``,
        passing the category id extracted from the link to
        ``parse_category``.
        """
        # The first matched link is skipped, as in the original answer.
        for category in response.css('ul.children a')[1:]:
            category_url = category.css('::attr(href)').get()
            category_ids = category.re(r"/(\d+)-")
            # A link without an href or without an id cannot be turned into
            # a ProductList request; skip it instead of raising IndexError.
            if not category_url or not category_ids:
                continue
            # response.follow also accepts relative hrefs.
            yield response.follow(
                category_url,
                callback=self.parse_category,
                cb_kwargs={'category_id': category_ids[0]},
            )

    def parse_category(self, response, category_id):
        """
        Callback for a category page: request page 1 of its product list.

        :param response: response of the category page.
        :param category_id: id extracted from the category link in ``parse``.
        """
        page = 1

        # Create the form data for the POST request
        form_data = {
            'Cats': str(category_id),
            'Size': str(self.page_size),
            'Page': str(page),
        }

        yield FormRequest(
            url=self.product_list_url,
            method='POST',
            formdata=form_data,
            callback=self.parse_page,
            cb_kwargs={'url': response.url, 'form_data': form_data, 'page': page},
        )

    def parse_page(self, response, url, form_data, page):
        """
        Callback for one page of the ProductList POST response.

        The endpoint answers with JSON whose ``Html`` field holds an HTML
        fragment. One item is yielded per product in that fragment, followed
        by a request for the next page. Pagination stops when the fragment is
        empty or contains no products.

        :param response: JSON response of the ProductList endpoint.
        :param url: URL of the category page; used as base URL of the fragment.
        :param form_data: form fields that produced ``response``.
        :param page: page number that produced ``response``.
        """
        try:
            json_data = response.json()
        except ValueError:
            self.logger.warning("Non-JSON product list response for %s (page %s)", url, page)
            return
        if not json_data:
            return

        # ``Html`` may be missing or null; both count as "no more products".
        html = json_data.get('Html') or ''
        if not html.strip():
            return

        html_res = HtmlResponse(url=url, body=html, encoding='utf-8')

        products = html_res.xpath('//div[@itemrole="item"]')
        if not products:
            # Non-empty markup without products: requesting further pages
            # would paginate forever.
            return

        for product in products:
            # The XPaths start with ".//" so they search inside this product
            # only. A leading "//" searches the whole document and returns
            # the first product's data for every item.
            title = product.css('span[itemrole="name"]::text').get(default='').strip()
            price = product.xpath('.//span[@class="price"]/text()').get(default='').strip()
            product_url = product.xpath('.//a[@itemrole="productLink"]/@href').get()
            if product_url:
                # Resolve relative links against the category page URL.
                product_url = html_res.urljoin(product_url)

            # TODO: categories are not extracted yet; the selector
            # '//div[@class="con-main"]//a/text()' from the product page does
            # not apply to this listing fragment.

            # Create a new item with the extracted data
            item = BitsclassicItem()
            item["title"] = title
            # A product without a price is treated as unavailable.
            item["product_exist"] = bool(price)
            item["price"] = price
            item["url"] = product_url
            item["domain"] = "bitsclassic.com/fa"

            # Yield the item to pass it to the next pipeline stage for further processing
            yield item

        # pagination: build fresh form data rather than mutating the dict
        # shared with the request that produced this response.
        next_page = page + 1
        next_form_data = {**form_data, 'Page': str(next_page)}

        yield FormRequest(
            url=self.product_list_url,
            method='POST',
            formdata=next_form_data,
            callback=self.parse_page,
            cb_kwargs={'url': url, 'form_data': next_form_data, 'page': next_page},
        )

相关问题