我想使用 Python 的 Scrapy 从这个网站抓取文章,包括特色视频(feature video)的 URL 或特色图片(feature image)的 URL,以及按文章原有顺序排列的所有段落和标题,但要排除那些虽然被包裹在文章主 div 类中、却与文章无关的文本。总之,我无法从这篇文章中获取特色视频的 URL 或特色图片的 URL,并且在抓取这篇文章的文本时也遇到了问题。
from urllib.parse import urljoin
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from datetime import datetime
import pandas as pd
class NewsSpider(scrapy.Spider):
    """Scrape one article page and append its fields to a CSV file.

    The article URL is read interactively from stdin. Each field that cannot
    be located on the page is reported as the string ``"NULL"`` instead of
    raising, so a partially matching page still produces a row.
    """

    name = "travelandleisure"

    # Placeholder emitted for any field that could not be extracted.
    missing = "NULL"
    # Destination of the accumulated rows (one row per scraped article).
    csv_path = 'C:/Users/Public/pagedata.csv'

    def start_requests(self):
        """Prompt for the article URL and schedule a single request for it."""
        url = input("Enter the article url: ")
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    @staticmethod
    def _clean_text(raw, missing="NULL"):
        """Return *raw* without newlines, or *missing* when nothing matched.

        ``.get()`` on a selector yields ``None`` when no node matches, so the
        value has to be checked before any string method is called on it.
        """
        if raw is None:
            return missing
        return raw.replace("\n", "")

    @staticmethod
    def _parse_published_date(raw, missing="NULL"):
        """Parse text such as ``"Updated on June 8, 2022"`` into a ``date``.

        Returns *missing* when the text is absent, has no ``"on "`` marker,
        or the remainder does not match ``"%B %d %Y"``.
        """
        if not raw or "on " not in raw:
            return missing
        date_text = raw.split("on ")[1].replace(",", "").strip()
        try:
            return datetime.strptime(date_text, "%B %d %Y").date()
        except ValueError:
            return missing

    def parse_dir_contents(self, response):
        """Extract the article fields, yield them as an item, then persist them.

        Yields:
            dict: one item with the category, headline, author(s), source,
            publication date and the still-unimplemented feature image /
            content placeholders.
        """
        missing = self.missing

        # De-duplicate author names while keeping page order; an empty result
        # falls back to the placeholder (join never raises IndexError).
        author_names = [
            text.strip()
            for text in response.xpath(
                '//a[@class="mntl-attribution__item-name"]/text()'
            ).extract()
        ]
        author = ', '.join(dict.fromkeys(author_names)) or missing

        url = response.url

        category = response.xpath(
            '//*[@id="mntl-text-link_1-0"]/span/text()'
        ).get() or missing
        headlines = self._clean_text(
            response.xpath('//*[@id="article-heading_1-0"]/text()').get(),
            missing,
        )
        source = self._clean_text(
            response.xpath('//*[@id="mntl-text-block_1-0"]/text()').get(),
            missing,
        )
        published_date = self._parse_published_date(
            response.css('div.mntl-attribution__item-date::text').get(),
            missing,
        )

        # Not implemented yet: the original left both as placeholders.
        feature_image = missing
        content = missing
        # `skift_take` was referenced but never defined (NameError on every
        # page); emit the placeholder so the 'Skift Take' key is still present.
        skift_take = missing

        yield {
            'Category': category,
            'Headlines': headlines,
            'Author': author,
            'Source': source,
            'Publication Date': published_date,
            'Feature_Image': feature_image,
            'Skift Take': skift_take,
            'Article Content': content,
        }

        # Runs once the consumer resumes the generator after the item above.
        row = [[category, headlines, author, source, published_date,
                feature_image, content, url]]
        cols = ['Category', 'Headlines', 'Author', 'Source', 'Published_Date',
                'Feature_Image', 'Content', 'URL']
        new_rows = pd.DataFrame(row, columns=cols)
        try:
            existing = pd.read_csv(self.csv_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # First run (or an empty file): start a fresh table. Any other
            # failure propagates instead of silently overwriting saved rows.
            combined = new_rows
        else:
            combined = pd.concat([existing, new_rows])
        combined.to_csv(self.csv_path, index=False)
if __name__ == '__main__':
    # Script entry point: build a crawler process from the project settings,
    # register the spider class, and start the crawl.
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(NewsSpider)
    crawler_process.start()
这是该网站的 URL:https://www.travelandleisure.com/travel-news/where-can-americans-travel-right-now-a-country-by-country-guide
1条答案
按热度按时间i34xakig1#
下面给出了一种可能的解决方案。
只有视频链接/视频图像的 URL 完全依赖 JavaScript 生成,而 Scrapy 无法渲染 JS,因此不能仅靠 Scrapy 抓取视频图像的 URL。