Scrapy Item Pipeline not writing items to sqlite

Asked by ncgqoxb0 on 2022-11-09, tagged SQLite

I am using Scrapy to scrape data from a website (www.imensa.de) and need to persist the data/items into a sqlite3 database. The spider works fine, but I think something is wrong with the pipeline.
In the pipeline, the new .db file is created correctly with the specified tables and columns, but the file never gets populated.

spider.py


# Import library

from __future__ import absolute_import
import scrapy
from scrapy.loader import Item
from scrapy.loader import ItemLoader
from ..items import Mensen_Table
from ..items import Mensa_Meal
import datetime
from datetime import date
from datetime import timedelta

global base_url
global meal_plan_day_list

meal_plan_day_list = []

# Create Spider class

class mensen_crawler(scrapy.Spider):
    # Name of spider
    name = 'mensen'

    # Website to scrape
    allowed_domains = ['imensa.de']
    start_urls = ['https://www.imensa.de/']

    # STATE_NAME / STATE_LINK
    def parse(self, response):

        base_url = 'https://www.imensa.de/'

        # Loop through response to parse state_name & state_link
        for i, (name, link) in enumerate(zip(response.css('.group a::text'),response.css('.group a::attr(href)'))):

            # Condition to only get german states (16 states, index starting from 0 -> 15)
            if i < 16:
                state_name = name.get()
                state_link = base_url + link.get()

                yield scrapy.Request(state_link, callback=self.parse_layer2, cb_kwargs={'state_name': state_name, 'state_link':state_link})

    # CITY_NAME / CITY_LINK
    def parse_layer2(self, response, state_name, state_link):
        global base_url

        base_url = 'https://www.imensa.de/'

        for (city, link) in zip(response.css('.group a::text'), response.css('.group a::attr(href)')):
            city_name = city.get()

            yield print('current_city: ',city_name,'  (state: ',state_name,')')
            city_link_part = link.get().split('/')[0]
            yield print('city_link_part: ', city_link_part)
            city_link_real = base_url + city_link_part + '/index.html'

            yield scrapy.Request(city_link_real, callback=self.parse_layer3, cb_kwargs={'state_name': state_name, 'state_link':state_link, 'city_name': city_name,'city_link': city_link_real})

    # MENSA_NAME/MENSA_LINK
    def parse_layer3(self, response, state_name, state_link, city_name, city_link):

        base_url = 'https://www.imensa.de/'

        for group in response.css('.group'):
            uni_name = group.css('h2::text').get()

            yield print('UNI_NAME: ',uni_name)

            for mensa in group.css('.element'):

                mensa_name = mensa.css('a::text').get()
                yield print('mensa_name: ',mensa_name,' (state: ',state_name,') (city: ',city_name,') (uni_name: ',uni_name,')')
                mensa_link = base_url + city_link.replace('https://www.imensa.de/','').split('/')[0] + '/' + mensa.css('a::attr(href)').get()
                yield print('mensa_link: ', mensa_link)

                yield scrapy.Request(mensa_link, callback=self.parse_layer4, cb_kwargs={'state_name': state_name,
                                                                                       'state_link':state_link,
                                                                                       'city_name': city_name,
                                                                                       'city_link': city_link,
                                                                                       'uni_name': uni_name,
                                                                                       'mensa_name': mensa_name,
                                                                                       'mensa_link': mensa_link
                                                                                        })

     # CREATE MENSA ITEM -----------------------------------------------------------------------------------------------------------------
    def parse_layer4(self, response, state_name, state_link, city_name, city_link, uni_name, mensa_name, mensa_link):

        l = ItemLoader(item=Mensen_Table(), response=response)

        try:
            rating_avg = response.css('.aw-ratings-average::text').get()
        except:
            rating_avg = 0

        try:
            rating_count = response.css('.aw-ratings-count::text').get()
        except:
            rating_count = 0

        address_list = []
        for address_element in response.css('a.panel-body::text'):
            address_list.append(address_element.get())
        mensa_location = ', '.join(address_list)
        yield print('mensa_location: ', mensa_location)

        yield print('parse_layer_3 -- rating_avg: ',rating_avg)
        yield print('parse_layer_3 -- rating_count: ', rating_count)
        l.add_value('state_name', state_name)
        l.add_value('state_link', state_link)
        l.add_value('city_name', city_name)
        l.add_value('city_link', city_link)
        l.add_value('uni_name', uni_name)
        l.add_value('mensa_name', mensa_name)
        l.add_value('mensen_link', mensa_link)
        l.add_value('mensa_address', mensa_location)
        l.add_value('mensen_rating_avg', rating_avg)
        l.add_value('mensen_rating_count', rating_count)
        yield l.load_item()

        for i,x in enumerate(response.css('.col-md-4.no-padding-xs .list-group')):
            if i == 0:
                date_list = x.css('.pull-right::text').extract()
                day_list = x.css('a::text').extract()
                link_list = x.css('a::attr(href)').extract()
                yield print('date_list: ',date_list)
                yield print('day_list: ', day_list)
                yield print('link_list: ',link_list)

        yield print('mensa_link: ',mensa_link)

        # PROCESS DATE LIST
        #------------------------------------------

        meal_plan_date_list = []
        for ele in date_list:
            if ele == 'heute':
                today = datetime.date.today().strftime('%d.%m.%Y')
                meal_plan_date_list.append(today)
            elif ele == 'morgen':
                today = datetime.date.today()
                tomorrow = today + datetime.timedelta(days=1)
                meal_plan_date_list.append(tomorrow.strftime('%d.%m.%Y'))
            else:
                meal_plan_date_list.append(ele)
        yield print('meal_plan_date_list: ',meal_plan_date_list)

        # PROCESS LINK LIST
        #--------------------------------------------
        meal_plan_link_list = []
        for ele in day_list:
                link = mensa_link.replace('index.html','') + ele.lower() + '.html'
                meal_plan_link_list.append(link)
        yield print('meal_plan_link_list: ',meal_plan_link_list)

        #meal_plan_list = []
        #meal_plan_prep_list = zip(meal_plan_date_list, day_list, meal_plan_link_list)
        #for item in meal_plan_prep_list:
        #    yield print('meal_plan_list_item: ', item)

        for date, day, link in zip(meal_plan_date_list, day_list, meal_plan_link_list):
            yield scrapy.Request(link, callback=self.parse_layer5, cb_kwargs={'mensa_name': mensa_name, 'mensa_link': link, 'day': day, 'date': date})

    # PARSE MEAL PLAN --------------------------------------------------------------------------
    def parse_layer5(self, response, mensa_name, mensa_link, day, date):
        for element in response.css('.aw-meal-category'):
            for sub in element.css('.aw-meal.row.no-margin-xs'):
                l = ItemLoader(item=Mensa_Meal(),response=response,selector=sub)

                meal_name = sub.css('p.aw-meal-description::text').get()

                try:
                    meal_price = sub.css('.col-sm-2.no-padding-xs.aw-meal-price::text').get().replace('€','').strip()
                except:
                    meal_price = 0

                try:
                    meal_attributes = sub.css('.small.aw-meal-attributes span::text').extract_first().replace(u'\xa0', u'')
                except:
                    meal_attributes = ''

                #if not meal_price == None:

                l.add_value('mensa_name', mensa_name)
                l.add_value('date_of_meal_plan', date)
                l.add_value('meal_name', meal_name)
                l.add_value('meal_attributes', meal_attributes)
                l.add_value('meal_price', meal_price)
                yield l.load_item()

items.py


# Define here the models for your scraped items

# 

# See documentation in:

# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from itemadapter import ItemAdapter
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from scrapy.item import Item

class Mensen_Table(scrapy.Item):
    state_name = scrapy.Field()
    state_link = scrapy.Field()
    city_name = scrapy.Field()
    city_link = scrapy.Field()
    uni_name = scrapy.Field()
    mensa_name = scrapy.Field()
    mensen_link = scrapy.Field()
    mensa_address = scrapy.Field()
    mensen_rating_avg = scrapy.Field()
    mensen_rating_count = scrapy.Field()
    five_star_ratings = scrapy.Field()
    four_star_ratings = scrapy.Field()
    three_star_ratings = scrapy.Field()
    two_star_ratings = scrapy.Field()
    one_star_ratings = scrapy.Field()

class Mensa_Meal(scrapy.Item):
    mensa_name = scrapy.Field()
    date_of_meal_plan = scrapy.Field()
    meal_name = scrapy.Field()
    meal_attributes = scrapy.Field()
    meal_price = scrapy.Field()

pipelines.py


# Define your item pipelines here

# 

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import item
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
import sqlite3

# useful for handling different item types with a single interface

from items import Mensen_Table, Mensa_Meal

class IwCrawlerPipeline:
    def __init__(self):

        self.con = sqlite3.connect('imensa.db')

        self.cur = self.con.cursor()
        self.create_table_mensen()
        self.create_table_meal()

    def create_table_mensen(self):

        print('TABLE MENSEN CREATED')

        self.cur.execute("""CREATE TABLE IF NOT EXISTS mensen_table (
                state_name TEXT,
                city_name TEXT,
                uni_name TEXT,
                mensa_name TEXT,
                mensa_address LONG,
                mensen_rating_avg FLOAT,
                mensen_rating_count TEXT)
                """)

    def create_table_meal(self):

        return print('TABLE MEAL CREATED')

        self.cur.execute("""CREATE TABLE IF NOT EXISTS meal_table (
                mensa_name TEXT,
                date_of_meal_plan DATE,
                meal_name LONG,
                meal_attributes LONG,
                meal_price FLOAT)
                """)

    def process_item(self, item, spider):

        if isinstance(item, Mensen_Table):
            print('MENSEN TABLE ITEM PROCESSED')
            self.cur.execute("""INSERT INTO mensen_table (state_name, city_name, uni_name, mensa_name, mensa_address, mensen_rating_avg, mensen_rating_count) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                                (item['state_name'], item['city_name'], item['uni_name'], item['mensa_name'], item['mensa_address'], item['mensen_rating_avg'], item['mensen_rating_count']))

            self.con.commit()

        return item

        if isinstance(item, Mensa_Meal):
            print('MEAL TABLE ITEM PROCESSED')
            self.cur.execute("""INSERT INTO meal_table (mensa_name, date_of_meal_plan, meal_name, meal_attributes, meal_price) VALUES (?, ?, ?, ?, ?)""",
                                (item['mensa_name'], item['date_of_meal_plan'], item['meal_name'], item['meal_attributes'], item['meal_price']))

            self.con.commit()

        return item

What am I doing wrong? The items are displayed correctly, but they never reach the database file.
Any help would be greatly appreciated!

Answer from kuhbmx9i:

The only problem I can see is that you never close the connection to the database, although I'm not sure whether that will solve your problem.
Add a close_spider method to your pipeline that closes the database connection. The method is triggered automatically by the scrapy engine right before the spider is removed and the twisted reactor is shut down.

class IwCrawlerPipeline:
    def __init__(self):

        self.con = sqlite3.connect('imensa.db')

        self.cur = self.con.cursor()
        self.create_table_mensen()
        self.create_table_meal()

    ...

    def close_spider(self, spider):
        self.con.close()
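
To be extra safe that nothing pending is lost when the crawl ends, the same method can also commit once more before closing. A minimal sketch of that variant, reusing the connection from the pipeline above (not tested against your project):

    def close_spider(self, spider):
        # Flush any write that has not been committed yet, then release the connection.
        self.con.commit()
        self.con.close()

Independently of Scrapy, you can also check directly whether any rows reached the file. A short snippet, assuming imensa.db sits in the directory you run scrapy from:

import sqlite3

con = sqlite3.connect('imensa.db')  # same file name the pipeline uses
# Which tables were actually created?
print(con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall())
# How many rows made it into mensen_table?
print(con.execute("SELECT COUNT(*) FROM mensen_table").fetchone())
con.close()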
