Python Scrapy我无法获取任何数据

vm0i2vca  于 2022-11-09  发布在  Python
关注(0)|答案(1)|浏览(176)
from urllib import parse
import scrapy
from scrapy.linkextractors import LinkExtractor
import codecs
import json

class WanikaniSpider(scrapy.Spider):
    """Spider that requests one wanikani.com kanji page per entry in ``kanjis``.

    ``parse`` extracts text with XPath and writes it to four text files that
    are opened as class attributes below.
    """
    name = 'japandict'
    allowed_domains = ['www.wanikani.com']         
    url = ('https://www.wanikani.com/kanji/')
    start_urls = []
    kanjis = ['悪' ,'安' ,'以' ,'飲' ,'意' ,'医' ,'員' ,'運' ,'映' ,'英' ,'屋' ,'駅' ,'歌' ,'牛']
    # `liste` ends up as a copy of `kanjis`; nothing in this class reads it.
    liste=[]
    # This loop runs once, while the class body executes, and fills
    # `start_urls` with `url` + kanji for every entry in `kanjis`.
    for kanji in kanjis:
        liste.append(kanji)
        nurl = url + kanji
        start_urls.append(nurl)
    # The four output files are opened in append mode while the class body
    # executes, so they are class attributes shared by every instance.
    file =  open("kanji.txt","a",encoding="utf-8")
    file1 = open("onyomi.txt","a",encoding="utf-8")
    file2 = open("kunyomi.txt","a",encoding="utf-8") 
    file3 = open("meanings.txt","a",encoding="utf-8")       

    def parse(self, response):
        """Extract kanji, meanings and readings from ``response`` and write them to the files."""
        print(response.url)
        # NOTE(review): both header paths require a <body> element that is a
        # child of the element with id="main"; presumably that never matches,
        # which would leave kanji.txt and meanings.txt empty — verify against
        # the real page markup.
        kanjiicon = response.xpath('//*[@id="main"]/body/div[1]/div[3]/div/div/header/h1/span/text()').getall()
        meanings = response.xpath('//*[@id="main"]/body/div[1]/div[3]/div/div/header/h1/text()').getall()
        reading = response.xpath('//*[@id="reading"]/div') 
        # NOTE(review): the inner expressions start with `//`, so they are
        # evaluated from the document root rather than relative to `onkun`;
        # every iteration overwrites `onyomi`/`kunyomi` with the same result.
        # If `reading` is empty the loop body never runs and the loops below
        # fail because `onyomi`/`kunyomi` were never assigned.
        for onkun in reading:
            onyomi= onkun.xpath('//*[@id="reading"]/div/div[1]/p/text()').getall()
            kunyomi= onkun.xpath('//*[@id="reading"]/div/div[2]/p/text()').getall()                
        for x in onyomi:
            # NOTE(review): the result of `x.strip()` is discarded, so the
            # unstripped text is what gets written.
            x.strip()
            self.file1.write(x + "\n")
            # NOTE(review): `close` is only referenced here, not called (no
            # parentheses), so this statement has no effect. The same applies
            # to the three `.close` lines below.
            self.file1.close
        for y in kanjiicon:
            self.file.write(y + "\n")
            self.file.close
        for z in kunyomi:
            self.file2.write(z + "\n")
            # NOTE(review): this names `self.file`, not `self.file2`.
            self.file.close
        for p in meanings:
            self.file3.write(p + "\n")
            # NOTE(review): this names `self.file`, not `self.file3`.
            self.file.close

汉字(Kanji)是日文字符,每个字都有音读(onyomi)和训读(kunyomi)两种读音。我想获取这些读音以及汉字的含义,并把它们写入文本文件。有一个网站可以提供这些信息。程序创建了txt文件,但文件是空的。

ippsafx7

ippsafx71#

我发现你的代码有一些问题。我不确定这是否是让你的项目正常工作所需要的全部,但一个主要的问题是你打开和关闭文件的方式。现在你在类定义中打开它们,然后在每个请求中关闭它们。这意味着在第一次调用parse后,你的文件已经关闭,不再可写。你应该做的是使用Scrapy的Item Pipeline(项目管道)来处理输出并将数据写入文件。例如:
在您的spider文件中:

import scrapy

class WanikaniSpider(scrapy.Spider):
    """Scrape kanji, meanings and readings from wanikani.com kanji pages.

    One request is made per entry in ``kanjis``. ``parse`` yields items that
    are single-key dicts; the key is one of ``"kanji"``, ``"meanings"``,
    ``"onyomi"`` or ``"kunyomi"`` and the value is the stripped text.
    """

    name = 'japandict'
    allowed_domains = ['www.wanikani.com']
    url = 'https://www.wanikani.com/kanji/'
    start_urls = []
    kanjis = ['悪', '安', '以', '飲', '意', '医', '員', '運', '映', '英', '屋', '駅', '歌', '牛']
    # Kept so the class exposes the same attributes as before; the spider
    # itself does not read it.
    liste = []
    # A plain loop is used on purpose: the body of a comprehension cannot see
    # class-scope names such as `url`.
    for kanji in kanjis:
        liste.append(kanji)
        nurl = url + kanji
        start_urls.append(nurl)

    def parse(self, response):
        """Yield one single-key item per non-blank text fragment on the page.

        Order of the yielded items: every ``kanji`` item, every ``meanings``
        item, then for each reading block its ``onyomi`` items followed by
        its ``kunyomi`` items.
        """
        # NOTE(review): this path requires a <body> element that is a child of
        # the element with id="main"; verify it against the real page markup.
        header = '//*[@id="main"]/body/div[1]/div[3]/div/div/header/h1'
        yield from self._items("kanji", response.xpath(header + '/span/text()').getall())
        yield from self._items("meanings", response.xpath(header + '/text()').getall())

        for block in response.xpath('//*[@id="reading"]/div'):
            # Paths start with `./` so they are evaluated relative to `block`.
            # A path starting with `//` would search the whole document again
            # on every iteration and yield the same readings once per block.
            yield from self._items("onyomi", block.xpath('./div[1]/p/text()').getall())
            yield from self._items("kunyomi", block.xpath('./div[2]/p/text()').getall())

    @staticmethod
    def _items(key, texts):
        """Yield ``{key: text}`` for each entry of ``texts`` that is non-empty after stripping."""
        for text in texts:
            text = text.strip()
            # Whitespace-only text nodes would otherwise become empty items.
            if text:
                yield {key: text}

然后在您的pipelines.py文件中

class SpidersPipeline:
    """Append scraped field values to one UTF-8 text file per field.

    Items are expected to be mappings with string values. For every key in
    ``_FIELDS`` that is present in an item, the value is written as one line
    to the matching file in ``_FILENAMES``.
    """

    # Item keys and their output files; both tuples are index-aligned with
    # ``self.files``.
    _FIELDS = ("kanji", "onyomi", "kunyomi", "meanings")
    _FILENAMES = ("kanji.txt", "onyomi.txt", "kunyomi.txt", "meanings.txt")

    def process_item(self, item, spider):
        """Write the recognised fields of ``item`` to their files.

        Returns:
            The same ``item``, unchanged. Scrapy expects ``process_item`` to
            return the item so that later pipeline components and feed
            exports still receive it.
        """
        for handle, field in zip(self.files, self._FIELDS):
            if field in item:
                handle.write(item[field] + "\n")
        return item

    def open_spider(self, spider):
        """Open every output file in append mode and store them in ``self.files``.

        Raises:
            OSError: if a file cannot be opened. Files opened before the
                failure are closed first so no handle is leaked.
        """
        self.files = []
        try:
            for filename in self._FILENAMES:
                self.files.append(open(filename, "a", encoding="utf-8"))
        except OSError:
            self.close_spider(spider)
            raise

    def close_spider(self, spider):
        """Close every file in ``self.files``."""
        for handle in self.files:
            handle.close()

并记得在settings.py文件中取消对管道配置的注释

# Enables SpidersPipeline, referenced by its dotted import path.
# NOTE(review): the leading 'spiders' is presumably the Scrapy project's
# package name — adjust it to match your own project.
# NOTE(review): 300 is presumably the pipeline's ordering value — confirm
# against the Scrapy ITEM_PIPELINES documentation.
ITEM_PIPELINES = {
   'spiders.pipelines.SpidersPipeline': 300,   # <- make sure it is uncommented
}

相关问题