Scrapy:让自定义管道输出有效的 JSON

hs1ihplo  于 2022-11-09  发布在  其他
关注(0)|答案(2)|浏览(190)

我用自定义的 Scrapy 管道把抓取到的数据以 JSON 格式输出为一个字典列表。目前的输出如下所示(注意最后一个对象后面多了一个逗号):

[{
    "product_id": "11980174",
    "brand_id": 25354,
    "brand_name": "Gucci",
    "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
    "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174"
},
{
    "product_id": "17070807",
    "brand_id": 1168391,
    "brand_name": "Jonathan Adler",
    "title": "Clear acrylic chess set",
    "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807"
},
{
    "product_id": "17022890",
    "brand_id": 3543122,
    "brand_name": "Anissa Kermiche",
    "title": "pink, green and red Mini Jugs Jug earthenware vase set",
    "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890"
},]

但是我想以有效的json格式导出数据:

[{
    "product_id": "11980174",
    "brand_id": 25354,
    "brand_name": "Gucci",
    "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
    "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174"
},
{
    "product_id": "17070807",
    "brand_id": 1168391,
    "brand_name": "Jonathan Adler",
    "title": "Clear acrylic chess set",
    "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807"
},
{
    "product_id": "17022890",
    "brand_id": 3543122,
    "brand_name": "Anissa Kermiche",
    "title": "pink, green and red Mini Jugs Jug earthenware vase set",
    "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890"
}]

我需要删除最后一个 JSON 对象后面的逗号,使整个文件成为有效的 JSON。
下面是我自定义的Scrapy Json管道:

from scrapy import signals
import boto3
from scrapy.utils.project import get_project_settings
import time
import json

class JsonWriterPipeline(object):
    """Scrapy pipeline that dumps items to ``<spider.name>_items.json`` and uploads the file to S3.

    The file is opened when the spider opens, receives one indented JSON
    object per item, and is closed and uploaded when the spider closes.

    Every item is written followed by ``",\\n"`` -- including the last one --
    so the closing ``]`` is preceded by a trailing comma.
    """

    def __init__(self):
        # Date-based path fragment, computed once at construction time and
        # later embedded in the S3 object key.
        self.spider_time = time.strftime("%Y/%G_%m/%Y.%m.%d/%Y.%m.%d")

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook it to the spider open/close signals."""
        instance = cls()
        hooks = (
            (instance.spider_opened, signals.spider_opened),
            (instance.spider_closed, signals.spider_closed),
        )
        for handler, signal in hooks:
            crawler.signals.connect(handler, signal)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider output file and write the opening bracket."""
        local_path = "%s_items.json" % spider.name
        self.file = open(local_path, "w")
        self.file.write("[")

    def process_item(self, item, spider):
        """Append ``item`` as indented JSON followed by a comma; return the item unchanged."""
        serialized = json.dumps(dict(item), indent=4)
        self.file.write(serialized + ",\n")
        return item

    def spider_closed(self, spider):
        """Write the closing bracket, close the file and upload it to the configured bucket."""
        self.file.write("]")
        self.file.close()

        # Connection details and the bucket name come from the project settings.
        config = get_project_settings()
        session = boto3.session.Session()
        s3 = session.resource(
            "s3",
            endpoint_url=config.get("AWS_ENDPOINT_URL"),
            aws_access_key_id=config.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=config.get("AWS_SECRET_ACCESS_KEY"),
        )
        bucket = s3.Bucket(config.get("AWS_STORAGE_BUCKET_NAME"))

        local_path = "%s_items.json" % spider.name
        object_key = f"brownsfashion-feeds/{spider.name}_{self.spider_time}.json"
        bucket.upload_file(local_path, object_key)

请告诉我任何解决办法。谢谢。

a11xaf1n

a11xaf1n1#

不要自己手动拼接字符串来把字典转换成 JSON,而应使用标准库 json 模块中的 json.dumps() 一次性序列化整个列表:

import json

# Sample records: a plain Python list holding one dict per product.
data = [
    {
        "product_id": "11980174",
        "brand_id": 25354,
        "brand_name": "Gucci",
        "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
        "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174",
    },
    {
        "product_id": "17070807",
        "brand_id": 1168391,
        "brand_name": "Jonathan Adler",
        "title": "Clear acrylic chess set",
        "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807",
    },
    {
        "product_id": "17022890",
        "brand_id": 3543122,
        "brand_name": "Anissa Kermiche",
        "title": "pink, green and red Mini Jugs Jug earthenware vase set",
        "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890",
    },
]

# Serialize the whole list in one call so the json module emits the brackets
# and the separators between objects itself.
valid_json = json.dumps(data)
print(valid_json)
t5zmwmid

t5zmwmid2#

您可以像这样重写代码:

class JsonWriterPipeline(object):
    """Scrapy pipeline that writes all items as ONE valid JSON array and uploads it to S3.

    The file ``<spider.name>_items.json`` is opened when the spider opens,
    receives one indented JSON object per item, and is closed and uploaded
    when the spider closes.

    The separator is written *before* every item except the first one, so the
    array never ends with a trailing comma and the file always parses with
    ``json.load`` (an item-less crawl yields ``[]``).
    """

    def __init__(self):
        # Date-based path fragment embedded in the S3 object key.
        # NOTE(review): %G is the ISO-8601 week-based year and can differ from
        # %Y around New Year -- confirm this key layout is intended.
        self.spider_time = f'{time.strftime("%Y/%G_%m/%Y.%m.%d/%Y.%m.%d")}'

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect it to the spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the JSON array."""
        self.file = open("%s_items.json" % spider.name, "w")
        self.file.write("[")
        # No separator is needed in front of the first item.
        self._first_item = True

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        ``dict(item)`` converts the item to a plain dict before it is handed
        to ``json.dumps``.
        """
        if self._first_item:
            self._first_item = False
        else:
            # Separate from the previous object; written lazily so the last
            # object is never followed by a comma.
            self.file.write(",\n")
        self.file.write(json.dumps(dict(item), indent=4))
        return item

    def spider_closed(self, spider):
        """Finish the JSON array, close the file and upload it to S3."""
        self._finalize_file()
        self._upload_to_s3(spider)

    def _finalize_file(self):
        """Write the closing bracket and close the output file."""
        self.file.write("]")
        self.file.close()

    def _upload_to_s3(self, spider):
        """Upload the finished file to the bucket named in the project settings."""
        settings = get_project_settings()
        my_session = boto3.session.Session()
        s3 = my_session.resource(
            "s3",
            endpoint_url=settings.get("AWS_ENDPOINT_URL"),
            aws_access_key_id=settings.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=settings.get("AWS_SECRET_ACCESS_KEY"),
        )
        boto_test_bucket = s3.Bucket(settings.get("AWS_STORAGE_BUCKET_NAME"))
        boto_test_bucket.upload_file(
            "%s_items.json" % spider.name,
            f"brownsfashion-feeds/{spider.name}_{self.spider_time}.json",
        )

相关问题