Scrapy -当没有找到可下载的文件时该怎么办?

0dxa2lsx  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(177)

我目前正在工作的一个零碎的程序,有可用性下载文件从我刮的页面,我目前遇到的问题是,一些页面有一个数据表像这个页面-https://www.tyconsystems.com/rpms24-720-720-而其他人不喜欢这个页面-https://www.tyconsystems.com/tpdin-cable-232-
当页面上找不到文件时,传递数据的正确方法是什么?附加问题,当项目数据长度过长时,是否有任何方法可以解决csv文件中每个项目有多行的问题?例如项目-rpms 24 -720-720。
下面是我正在使用的代码。
productInfo.py

from copyreg import clear_extension_cache
import scrapy
from ..items import tyconItem

class ProductInfoSpider(scrapy.Spider):

    name = "productInfo"
    allowed_domains = ['tyconsystems.com']
    start_urls = [
        'https://www.tyconsystems.com/rpms24-720-720',
        'https://www.tyconsystems.com/tpdin-cable-232',
    ]

    def parse(self, response):
        for product in response.css('section#listing'):
            items = tyconItem() # Unique item for each iteration
            name_dirty = product.css('div.product-id span#product_id::text').get()
            product_sku = name_dirty.strip()
            product_sub_title_dirty = product.css('div.product-details h1.page_headers::text').get()
            product_sub_title = product_sub_title_dirty.strip()
            #product_store_description = product.css('p.series-card__intro').get() 
            if product.xpath('//p[contains(@class, "MsoNormal")]'):
                summary = product.css('div.item > div p.MsoNormal').getall()
            elif product.xpath('//div[contains(@class, "item")]/div'):
                summary = product.css('div.item > div').getall()
            else:
                summary = product.css('div.item').getall()
            category_list = product.xpath('//div[@class="container"]//ol//li//a/span//text()').getall()
            category = category_list[-2].strip()
            description =   product.css('div.item > p.MsoNormal::text').getall()
            if product.css('div.extrafieldsBlock span.info a::attr(href)').get() == '':
                datasheet = 'no-file'
            else:
                datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
            file_urls = datasheet
            specification = product.css('div#tab-6 div.info > table').getall()
            price = product.css('span#price::text').get()
            products_zoom_image = name_dirty.strip() + '.jpg'
            main_image = product.css('div#addl-images a::attr(href)').getall()
            image_urls = [response.urljoin(i) for i in main_image]

            items['category'] = category,
            items['datasheet'] = datasheet,
            items['description'] = description,
            items['main_image'] = main_image,
            items['price'] = price,
            items['product_link'] = response.url, # get the product link from response
            items['product_sku'] = product_sku,
            items['product_sub_title'] = product_sub_title,
            items['products_zoom_image'] = products_zoom_image
            items['specification'] = specification,
            items['summary'] = summary,

            items['file_urls'] = [file_urls]
            items["name"] = product_sku
            items["image_urls"] = image_urls

            yield items

items.py


# Define here the models for your scraped items

# 

# See documentation in:

# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Field, Item

class tyconItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    category            = scrapy.Field()
    datasheet           = scrapy.Field()
    description         = scrapy.Field()
    file_urls           = scrapy.Field()
    files               = scrapy.Field()
    name                = scrapy.Field()
    image_urls          = scrapy.Field()
    images              = scrapy.Field()
    main_image          = scrapy.Field()
    price               = scrapy.Field()
    product_link        = scrapy.Field()
    product_sku         = scrapy.Field()
    product_sub_title   = scrapy.Field()
    products_zoom_image = scrapy.Field()
    specification       = scrapy.Field()
    summary             = scrapy.Field()

pipelines.py


# Define your item pipelines here

# 

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface

# from scrapy.pipelines.images import ImagesPipeline

from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from io import BytesIO
from PIL import Image

class tyconPipeline:
    def process_item(self, item, spider):
        return item

class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1]
        return file_name

class ImagePipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None, *args, item=None):
        filename = request.meta["filename"].strip()
        number = request.meta["file_num"]
        return filename + "_" + str(number) + ".jpg"

    def thumb_path(self, request, thumb_id, response=None, info=None):
        filename = request.meta["filename"]
        number = request.meta["file_num"]
        return f'thumbs/{thumb_id}/{filename}_{str(number)}.jpg'

    def get_media_requests(self, item, info):
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

    def convert_image(self, image, size=None):
        if size is not None:   
            # If the size is not None then it is a thumbnail
            # so we resize it according the parameter
            image = image.resize(size, Image.ANTIALIAS)
        else:
            # otherwise we give the image to back to the superclass version of 
            # this method for it to process.
            return super().convert_image(image, size=size)  
        buf = BytesIO()  #  These next 3 lines are from the scrapy source code.
        image.save(buf, 'JPEG', quality=72)  
        return image, buf

日志中出现Scrapy错误

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
    current.result = callback(  # type: ignore[misc]
  File "/usr/lib/python3/dist-packages/scrapy/utils/defer.py", line 162, in f
    return deferred_from_coro(coro_f(*coro_args,**coro_kwargs))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/media.py", line 87, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in get_media_requests
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in <listcomp>
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 60, in __init__
    self._set_url(url)
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 98, in _set_url
    raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got NoneType

谢谢大家!

cuxqih21

cuxqih211#

有两种可能的方法:

1.覆盖get_media_requests

重写管道中的get_media_requests以检查URL是否存在,如下所示:

class DownfilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        if not all(urls):
            return #THIS - Don't return Request if there is no URL
        return [Request(u) for u in URLs]
    # Rest of the code

class ImagePipeline(ImagesPipeline):
     def get_media_requests(self, item, info):
        urls = item.get("image_urls")
        if not all(urls):
            return None #THIS - Don't return Request if there is no URL
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

2.返回不同的项目

你可以根据你是否有图片下载,从Spider返回不同的条目类型。为了方便起见,我更喜欢使用匿名字典,如下所示:

def parse(self, response)
            item={}
            items['category'] = category,
            items['datasheet'] = datasheet,
            ...
            if file_to_download:
                items['file_urls'] = [file_urls]
            if image_to_download:
                items['image_urls'] = [image_urls]

            items["name"] = product_sku
            items["image_urls"] = image_urls

            yield item

希望能有所帮助!

相关问题