我使用Scrapy从一个网站(www.imensa.de)获取数据,我需要将这些数据/项目持久化到一个sqlite3数据库中。Spider工作正常,但我认为管道有问题。
在管道中,使用指定的表和列正确创建了新的. db文件。但未填充该文件。
蜘蛛.py
# Import library
from __future__ import absolute_import
import scrapy
from scrapy.loader import Item
from scrapy.loader import ItemLoader
from ..items import Mensen_Table
from ..items import Mensa_Meal
import datetime
from datetime import date
from datetime import timedelta
global base_url
global meal_plan_day_list
meal_plan_day_list = []
# Create Spider class
class mensen_crawler(scrapy.Spider):
# Name of spider
name = 'mensen'
# Website to scrape
allowed_domains = ['imensa.de']
start_urls = ['https://www.imensa.de/']
# STATE_NAME / STATE_LINK
def parse(self, response):
base_url = 'https://www.imensa.de/'
# Loop through response to parse state_name & state_link
for i, (name, link) in enumerate(zip(response.css('.group a::text'),response.css('.group a::attr(href)'))):
# Condition to only get german states (16 states, index starting from 0 -> 15)
if i < 16:
state_name = name.get()
state_link = base_url + link.get()
yield scrapy.Request(state_link, callback=self.parse_layer2, cb_kwargs={'state_name': state_name, 'state_link':state_link})
# CITY_NAME / CITY_LINK
def parse_layer2(self, response, state_name, state_link):
global base_url
base_url = 'https://www.imensa.de/'
for (city, link) in zip(response.css('.group a::text'), response.css('.group a::attr(href)')):
city_name = city.get()
yield print('current_city: ',city_name,' (state: ',state_name,')')
city_link_part = link.get().split('/')[0]
yield print('city_link_part: ', city_link_part)
city_link_real = base_url + city_link_part + '/index.html'
yield scrapy.Request(city_link_real, callback=self.parse_layer3, cb_kwargs={'state_name': state_name, 'state_link':state_link, 'city_name': city_name,'city_link': city_link_real})
# MENSA_NAME/MENSA_LINK
def parse_layer3(self, response, state_name, state_link, city_name, city_link):
base_url = 'https://www.imensa.de/'
for group in response.css('.group'):
uni_name = group.css('h2::text').get()
yield print('UNI_NAME: ',uni_name)
for mensa in group.css('.element'):
mensa_name = mensa.css('a::text').get()
yield print('mensa_name: ',mensa_name,' (state: ',state_name,') (city: ',city_name,') (uni_name: ',uni_name,')')
mensa_link = base_url + city_link.replace('https://www.imensa.de/','').split('/')[0] + '/' + mensa.css('a::attr(href)').get()
yield print('mensa_link: ', mensa_link)
yield scrapy.Request(mensa_link, callback=self.parse_layer4, cb_kwargs={'state_name': state_name,
'state_link':state_link,
'city_name': city_name,
'city_link': city_link,
'uni_name': uni_name,
'mensa_name': mensa_name,
'mensa_link': mensa_link
})
# CREATE MENSA ITEM -----------------------------------------------------------------------------------------------------------------
def parse_layer4(self, response, state_name, state_link, city_name, city_link, uni_name, mensa_name, mensa_link):
l = ItemLoader(item=Mensen_Table(), response=response)
try:
rating_avg = response.css('.aw-ratings-average::text').get()
except:
rating_avg = 0
try:
rating_count = response.css('.aw-ratings-count::text').get()
except:
rating_count = 0
address_list = []
for address_element in response.css('a.panel-body::text'):
address_list.append(address_element.get())
mensa_location = ', '.join(address_list)
yield print('mensa_location: ', mensa_location)
yield print('parse_layer_3 -- rating_avg: ',rating_avg)
yield print('parse_layer_3 -- rating_count: ', rating_count)
l.add_value('state_name', state_name)
l.add_value('state_link', state_link)
l.add_value('city_name', city_name)
l.add_value('city_link', city_link)
l.add_value('uni_name', uni_name)
l.add_value('mensa_name', mensa_name)
l.add_value('mensen_link', mensa_link)
l.add_value('mensa_address', mensa_location)
l.add_value('mensen_rating_avg', rating_avg)
l.add_value('mensen_rating_count', rating_count)
yield l.load_item()
for i,x in enumerate(response.css('.col-md-4.no-padding-xs .list-group')):
if i == 0:
date_list = x.css('.pull-right::text').extract()
day_list = x.css('a::text').extract()
link_list = x.css('a::attr(href)').extract()
yield print('date_list: ',date_list)
yield print('day_list: ', day_list)
yield print('link_list: ',link_list)
yield print('mensa_link: ',mensa_link)
# PROCESS DATE LIST
#------------------------------------------
meal_plan_date_list = []
for ele in date_list:
if ele == 'heute':
today = datetime.date.today().strftime('%d.%m.%Y')
meal_plan_date_list.append(today)
elif ele == 'morgen':
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
meal_plan_date_list.append(tomorrow.strftime('%d.%m.%Y'))
else:
meal_plan_date_list.append(ele)
yield print('meal_plan_date_list: ',meal_plan_date_list)
# PROCESS LINK LIST
#--------------------------------------------
meal_plan_link_list = []
for ele in day_list:
link = mensa_link.replace('index.html','') + ele.lower() + '.html'
meal_plan_link_list.append(link)
yield print('meal_plan_link_list: ',meal_plan_link_list)
#meal_plan_list = []
#meal_plan_prep_list = zip(meal_plan_date_list, day_list, meal_plan_link_list)
#for item in meal_plan_prep_list:
# yield print('meal_plan_list_item: ', item)
for date, day, link in zip(meal_plan_date_list, day_list, meal_plan_link_list):
yield scrapy.Request(link, callback=self.parse_layer5, cb_kwargs={'mensa_name': mensa_name, 'mensa_link': link, 'day': day, 'date': date})
# PARSE MEAL PLAN --------------------------------------------------------------------------
def parse_layer5(self, response, mensa_name, mensa_link, day, date):
for element in response.css('.aw-meal-category'):
for sub in element.css('.aw-meal.row.no-margin-xs'):
l = ItemLoader(item=Mensa_Meal(),response=response,selector=sub)
meal_name = sub.css('p.aw-meal-description::text').get()
try:
meal_price = sub.css('.col-sm-2.no-padding-xs.aw-meal-price::text').get().replace('€','').strip()
except:
meal_price = 0
try:
meal_attributes = sub.css('.small.aw-meal-attributes span::text').extract_first().replace(u'\xa0', u'')
except:
meal_attributes = ''
#if not meal_price == None:
l.add_value('mensa_name', mensa_name)
l.add_value('date_of_meal_plan', date)
l.add_value('meal_name', meal_name)
l.add_value('meal_attributes', meal_attributes)
l.add_value('meal_price', meal_price)
yield l.load_item()
项目.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from itemadapter import ItemAdapter
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from scrapy.item import Item
class Mensen_Table(scrapy.Item):
state_name = scrapy.Field()
state_link = scrapy.Field()
city_name = scrapy.Field()
city_link = scrapy.Field()
uni_name = scrapy.Field()
mensa_name = scrapy.Field()
mensen_link = scrapy.Field()
mensa_address = scrapy.Field()
mensen_rating_avg = scrapy.Field()
mensen_rating_count = scrapy.Field()
five_star_ratings = scrapy.Field()
four_star_ratings = scrapy.Field()
three_star_ratings = scrapy.Field()
two_star_ratings = scrapy.Field()
one_star_ratings = scrapy.Field()
class Mensa_Meal(scrapy.Item):
mensa_name = scrapy.Field()
date_of_meal_plan = scrapy.Field()
meal_name = scrapy.Field()
meal_attributes = scrapy.Field()
meal_price = scrapy.Field()
管道.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import item
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
import sqlite3
# useful for handling different item types with a single interface
from items import Mensen_Table, Mensa_Meal
class IwCrawlerPipeline:
def __init__(self):
self.con = sqlite3.connect('imensa.db')
self.cur = self.con.cursor()
self.create_table_mensen()
self.create_table_meal()
def create_table_mensen(self):
print('TABLE MENSEN CREATED')
self.cur.execute("""CREATE TABLE IF NOT EXISTS mensen_table (
state_name TEXT,
city_name TEXT,
uni_name TEXT,
mensa_name TEXT,
mensa_address LONG,
mensen_rating_avg FLOAT,
mensen_rating_count TEXT)
""")
def create_table_meal(self):
return print('TABLE MEAL CREATED')
self.cur.execute("""CREATE TABLE IF NOT EXISTS meal_table (
mensa_name TEXT,
date_of_meal_plan DATE,
meal_name LONG,
meal_attributes LONG,
meal_price FLOAT)
""")
def process_item(self, item, spider):
if isinstance(item, Mensen_Table):
print('MENSEN TABLE ITEM PROCESSED')
self.cur.execute("""INSERT INTO mensen_table (state_name, city_name, uni_name, mensa_name, mensa_address, mensen_rating_avg, mensen_rating_count) VALUES (?, ?, ?, ?, ?, ?, ?)""",
(item['state_name'], item['city_name'], item['uni_name'], item['mensa_name'], item['mensa_address'], item['mensen_rating_avg'], item['mensen_rating_count']))
self.con.commit()
return item
if isinstance(item, Mensa_Meal):
print('MEAL TABLE ITEM PROCESSED')
self.cur.execute("""INSERT INTO meal_table (mensa_name, date_of_meal_plan, meal_name, meal_attributes, meal_price) VALUES (?, ?, ?, ?, ?)""",
(item['mensa_name'], item['date_of_meal_plan'], item['meal_name'], item['meal_attributes'], item['meal_price']))
self.con.commit()
return item
我做错了什么?项目得到正确显示,但没有到达数据库文件。
任何帮助都将不胜感激!
1条答案
按热度按时间kuhbmx9i1#
我看到的唯一问题是你从来没有关闭到数据库的连接,尽管我不确定这是否能解决你的问题。
在管道中添加close_spider方法,该方法关闭与数据库的连接。该方法由scrapy引擎在删除spider并关闭twisted reactor之前自动触发。