我想在 Python 中使用 Scrapy 从这个网站抓取文章,包括特色视频或特色图片的 URL,以及按原文顺序排列的所有段落和标题

wyyhbhjk  于 2022-11-09  发布在  Python
关注(0)|答案(1)|浏览(122)

我想使用 Python 的 Scrapy 从这个网站抓取文章,包括特色视频的 URL 或特色图片的 URL,以及按原文顺序排列的所有段落和标题;但要排除那些虽然被包裹在文章主 div 类中、却与文章内容无关的文本。总之,我目前无法从这篇文章中获取特色视频或特色图片的 URL,并且在抓取这篇文章的正文文本时也遇到了问题。

from urllib.parse import urljoin
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from datetime import datetime
import pandas as pd

class NewsSpider(scrapy.Spider):
    """Scrape one article page and append its fields to a CSV file.

    The article URL is read interactively from stdin. For that page the
    spider extracts category, headline, author(s), source text and the
    publication date, yields them as one item, and appends one row to
    ``CSV_PATH``.
    """

    name = "travelandleisure"

    # Placeholder stored whenever a field cannot be found on the page.
    MISSING = "NULL"
    # CSV file that accumulates one row per scraped article.
    CSV_PATH = 'C:/Users/Public/pagedata.csv'

    def start_requests(self):
        """Prompt for the article URL and schedule a request for it."""
        url = input("Enter the article url: ")

        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def _text_without_newlines(self, selection):
        """Return the first match with newlines removed, or ``MISSING``."""
        text = selection.get()
        if text is None:
            return self.MISSING
        return text.replace("\n", "")

    def _parse_published_date(self, raw):
        """Turn text such as 'Updated on June 8, 2022' into a ``date``.

        Returns ``MISSING`` when the text is absent or is not in the
        expected '<prefix> on <Month> <day>, <year>' form.
        """
        parts = raw.split("on ") if raw else []
        if len(parts) < 2:
            return self.MISSING
        try:
            # strip() guards against trailing whitespace, which strptime rejects.
            return datetime.strptime(parts[1].replace(",", "").strip(), "%B %d %Y").date()
        except ValueError:
            return self.MISSING

    def parse_dir_contents(self, response):
        """Extract the article fields, yield them, and append them to the CSV.

        Args:
            response: the Scrapy response for the article page.

        Yields:
            A dict with the keys 'Category', 'Headlines', 'Author', 'Source',
            'Publication Date', 'Feature_Image' and 'Article Content'.
        """
        author_names = (
            x.strip()
            for x in response.xpath('//a[@class="mntl-attribution__item-name"]/text()').getall()
        )
        # dict.fromkeys de-duplicates while keeping page order; join() never
        # raises on an empty sequence, so the fallback has to be explicit.
        Author = ', '.join(dict.fromkeys(author_names)) or self.MISSING

        url = response.url

        # .get() returns None (it does not raise) when nothing matches.
        Category = response.xpath('//*[@id="mntl-text-link_1-0"]/span/text()').get(default=self.MISSING)

        Headlines = self._text_without_newlines(response.xpath('//*[@id="article-heading_1-0"]/text()'))

        Source = self._text_without_newlines(response.xpath('//*[@id="mntl-text-block_1-0"]/text()'))

        Published_Date = self._parse_published_date(
            response.css('div.mntl-attribution__item-date::text').get()
        )

        # TODO: extraction of the feature image/video URL is not implemented yet.
        Feature_Image = self.MISSING

        # TODO: extraction of the article body (paragraphs and headings in page
        # order, excluding unrelated text wrapped inside
        # <div class="loc article-content">) is not implemented yet.
        Content = self.MISSING

        # The original item also had a 'Skift Take' entry that read an
        # undefined variable and raised NameError; it has been removed.
        yield {
            'Category': Category,
            'Headlines': Headlines,
            'Author': Author,
            'Source': Source,
            'Publication Date': Published_Date,
            'Feature_Image': Feature_Image,
            'Article Content': Content,
        }

        # ---- Persist the row: append to the CSV, creating it on first use ----
        Data = [[Category, Headlines, Author, Source, Published_Date, Feature_Image, Content, url]]
        cols = ['Category', 'Headlines', 'Author', 'Source', 'Published_Date', 'Feature_Image', 'Content', 'URL']
        new_row = pd.DataFrame(Data, columns=cols)

        try:
            existing = pd.read_csv(self.CSV_PATH)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable file yet: start a new one with just this row.
            opened_df = new_row
        else:
            opened_df = pd.concat([existing, new_row])

        opened_df.to_csv(self.CSV_PATH, index=False)

if __name__ == '__main__':
    # Run the spider in-process using the project's Scrapy settings;
    # start() blocks until the crawl has finished.
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(NewsSpider)
    crawler_process.start()

这是网站URL https://www.travelandleisure.com/travel-news/where-can-americans-travel-right-now-a-country-by-country-guide

i34xakig

i34xakig1#

下面给出了一种可能的解决方案。
只有视频链接/视频封面图片的 URL 完全依赖 JavaScript 生成,而 Scrapy 无法渲染 JavaScript,这就是为什么仅使用 Scrapy 无法抓取视频封面图片 URL 的原因。

import scrapy
import datetime
import pandas as pd
class NewsSpider(scrapy.Spider):
    """Scrape a single article page and write its fields to ``out.csv``.

    The article URL is read interactively from stdin. Fields that cannot be
    found on the page are stored as ``None`` instead of crashing the callback.
    """

    name = "articles"

    def start_requests(self):
        """Prompt for the article URL and schedule a request for it."""
        # Example input:
        # https://www.travelandleisure.com/travel-news/where-can-americans-travel-right-now-a-country-by-country-guide
        url = input("Enter the article url: ")

        yield scrapy.Request(url, callback=self.parse_dir_contents)

    @staticmethod
    def _without_newlines(text):
        """Return *text* with newlines removed, or ``None`` if it is ``None``."""
        return None if text is None else text.replace("\n", "")

    @staticmethod
    def _parse_published_date(raw):
        """Turn text such as 'Updated on June 8, 2022' into a ``date``.

        Returns ``None`` when the text is absent or is not in the expected
        '<prefix> on <Month> <day>, <year>' form.
        """
        parts = raw.split("on ") if raw else []
        if len(parts) < 2:
            return None
        try:
            # strip() guards against trailing whitespace, which strptime rejects.
            return datetime.datetime.strptime(parts[1].replace(",", "").strip(), "%B %d %Y").date()
        except ValueError:
            return None

    def parse_dir_contents(self, response):
        """Extract the article fields and write them to ``out.csv``.

        The file is overwritten on every call and holds a single row.

        Args:
            response: the Scrapy response for the article page.
        """
        author_names = (
            x.strip()
            for x in response.xpath('//a[@class="mntl-attribution__item-name"]/text()').getall()
        )
        # dict.fromkeys de-duplicates while keeping page order (a set does not).
        Author = ', '.join(dict.fromkeys(author_names))
        Category = response.xpath('//*[@id="mntl-text-link_1-0"]/span/text()').get()
        Headlines = self._without_newlines(response.xpath('//*[@id="article-heading_1-0"]/text()').get())
        Source = self._without_newlines(response.xpath('//*[@id="mntl-text-block_1-0"]/text()').get())
        Published_Date = self._parse_published_date(
            response.css('div.mntl-attribution__item-date::text').get()
        )

        # Only still-image URLs are collected here. Per the answer's author,
        # the video link / video poster image is produced by JavaScript, which
        # Scrapy does not render.
        # NOTE(review): the last match is dropped, as in the original code —
        # presumably it is not an article image; confirm against the page.
        image_urls = response.xpath('//*[@class="img--noscript universal-image__image"]/@src').getall()[:-1]
        # Joined with a separator so multiple URLs stay distinguishable
        # (an empty separator glued them into one unusable string).
        Feature_Images = ', '.join(image_urls)

        # All text nodes under the article body container, in document order.
        Content = ''.join(response.xpath('//*[@id="mntl-sc-page_1-0"]//text()').getall()).strip()

        d = {
            'Category': Category,
            'Headlines': Headlines,
            'Author': Author,
            'Source': Source,
            'Publication Date': Published_Date,
            'Feature_Image': Feature_Images,
            'Article Content': Content,
        }

        # to_csv() returns None when given a path, so nothing is kept from it.
        pd.DataFrame([d]).to_csv('out.csv', index=False)

        #print(df)

相关问题