As the title suggests, I'm running a Scrapy spider and storing the results in MongoDB. Everything runs smoothly, except that when I re-run the spider it adds everything again, and I don't want duplicates. My pipelines.py
file looks like this:
import logging

import pymongo
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy import log


class MongoPipeline(object):

    collection_name = 'openings'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        if self.db.openings.find({' quote_text': item['quote_text']}) == True:
            pass
        else:
            self.db[self.collection_name].insert(dict(item))
            logging.debug("Post added to MongoDB")
        return item
My spider looks like this:
import scrapy
from ..items import QuotesItem


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        items = QuotesItem()
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            author = quote.xpath('.//*[@class="author"]//text()').extract_first()
            quote_text = quote.xpath('.//span[@class="text"]//text()').extract_first()
            items['author'] = author
            items['quote_text'] = quote_text
            yield items
The current syntax is obviously wrong, but is there a small tweak to the for loop that would fix it? Or should I be running this check in the spider instead? I've also been looking into upsert, but I'm having trouble understanding how to use it effectively. Any help would be great.
2 Answers
zqdjd7g91#
There is a stray space in the key in

    self.db.openings.find({' quote_text': item['quote_text']})

I think it should just be 'quote_text'. Also, find() returns a cursor, which is never is True or == True, so the condition always fails and every item falls into the else branch; that's why it adds everything again. Rather than testing the cursor, you can let MongoDB deduplicate for you with an upsert:

    self.db[self.collection_name].update({'quote_text': item['quote_text']}, dict(item), upsert=True)
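Note that Collection.update() was deprecated in pymongo 3.x and removed in 4.x; update_one() with upsert=True gives the same behavior there. A minimal sketch of a process_item built on this answer's idea, assuming the same MongoPipeline class, field name, and collection as the question:

def process_item(self, item, spider):
    ## Upsert: update the document whose quote_text matches,
    ## or insert it as a new document if no match exists yet.
    self.db[self.collection_name].update_one(
        {'quote_text': item['quote_text']},  # match on the quote text
        {'$set': dict(item)},                # set/overwrite the item's fields
        upsert=True                          # insert when nothing matched
    )
    logging.debug("Post upserted to MongoDB")
    return item

Re-running the spider then rewrites each existing quote in place instead of appending a duplicate.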
axr492tv2#
Steps:
1. If the item is not yet in the collection: write it to the collection.
2. If the collection is not empty and the item already exists: pass.
Code:
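The answer's code block did not survive in this copy; a minimal sketch of the two steps above, assuming the question's quote_text field and collection name, could look like:

def process_item(self, item, spider):
    ## Step 1: look for an existing document with the same quote text.
    ## find_one() returns None when there is no match (unlike find(),
    ## which always returns a cursor).
    existing = self.db[self.collection_name].find_one(
        {'quote_text': item['quote_text']}
    )
    if existing is None:
        ## Not in the collection yet: write it.
        self.db[self.collection_name].insert_one(dict(item))
        logging.debug("Post added to MongoDB")
    ## Step 2: already present, so pass and just hand the item on.
    return item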