I'm scraping engineering blog posts from Meta. For now I just want to print each post's title and URL. Any help is appreciated.
Here is what I have. It never reaches the parse_loadmore function and nothing gets printed. I tried copying and pasting the loadmore_endpoint into a browser and it works fine; it returns what should be some HTML.
import scrapy
from urllib.parse import urlencode
import pdfkit
import requests
import re
import json
from bs4 import BeautifulSoup

# from ..helpers import generate_pdfs_file_path

options = {
    # 'no-images': None,
    "disable-javascript": None,
    "disable-external-links": None,
    "quiet": None,
    "encoding": "UTF-8",
}


class MetaSpider(scrapy.Spider):
    name = "meta_spider"
    api_endpoint = "https://engineering.fb.com/wp-json/fb/v1/loadmore"
    start_urls = [
        "https://engineering.fb.com/category/core-infra/",
        # "https://engineering.fb.com/category/data-infrastructure/",
        # "https://engineering.fb.com/category/developer-tools/",
        # "https://engineering.fb.com/category/production-engineering/",
        # "https://engineering.fb.com/category/security/",
    ]
    post_fetched = 0

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_initial)

    def parse_initial(self, response):
        endpoint, query_args = get_loadmore_endpoints_and_params(response)
        for page in range(4):
            params = {
                "action": "loadmore",
                "queryArgs": json.dumps(query_args),
                "page": page,
                "post_type": "post",
            }
            loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
            # print(f"Sending Request {loadmore_endpoint}")
            yield scrapy.Request(url=loadmore_endpoint, callback=self.parse_loadmore)

    def parse_loadmore(self, response):
        print("parse_loadmore called with response: {}".format(response.text))
        # Create a TextResponse object
        for post in response.css("article.post"):
            header = post.css("header.entry-header")
            title = header.css(".entry-title a::text").get().strip()
            url = header.css(".entry-title a::attr(href)").get()
            # Sanitize the title to create a valid filename
            safe_title = re.sub(r"[^\w\s-]", "", title).replace(" ", "_")
            print(f"----title: {safe_title}, url: {url}----")


def clean_post_html(soup):
    for script in soup.find_all("script"):
        script.decompose()
    for script in soup.find_all("noscript"):
        script.decompose()
    for element in soup.find_all(class_="sharedaddy"):
        element.decompose()
    image_container = soup.find(id="post-feat-image-container")
    if image_container:
        image_container.decompose()


def get_loadmore_endpoints_and_params(response):
    # Extracting the script content
    script_content = response.xpath(
        '//script[contains(., "loadmore_params")]/text()'
    ).get()
    # Parsing the JavaScript to extract query parameters
    if script_content:
        # Use regular expression to find the JSON object
        params_json = re.search(r"var loadmore_params = (.*?);", script_content)
        if params_json:
            params_string = params_json.group(1)
            params = json.loads(params_string)
            return params["restfulURL"], params["posts"]


def get_load_more_posts_url(url, params):
    query_string = urlencode(params, doseq=True)
    return f"{url}?{query_string}"
1 Answer
Two things need to happen in order to achieve your goal.
1. Set "URLLENGTH_LIMIT" to a higher number than the default, either in settings.py or in the spider's custom_settings attribute. The reason is that the load-more endpoint is a really long URL, and it exceeds the limit set by Scrapy's default settings.
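For instance, a minimal settings sketch; the value 10000 is an arbitrary pick, anything comfortably above the length of the loadmore URL works (Scrapy's default limit is 2083):

# settings.py, or the same key in the spider's custom_settings dict
URLLENGTH_LIMIT = 10000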
2. The response from the endpoint comes back as JSON, so use response.json() to get the text, then manually feed that text into a scrapy.Selector and use it to run your css and xpath queries on the HTML in the string. For example:
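Below is a sketch of the spider with both changes applied. The helper functions from the question (get_loadmore_endpoints_and_params and get_load_more_posts_url) are reused unchanged, and the JSON body is assumed to decode to the rendered post HTML as a single string, as described above:

import json
import re

import scrapy


class MetaSpider(scrapy.Spider):
    name = "meta_spider"
    # Change 1: raise the URL length limit (Scrapy's default is 2083) so the
    # very long loadmore request URL is not dropped before being scheduled.
    custom_settings = {"URLLENGTH_LIMIT": 10000}
    start_urls = ["https://engineering.fb.com/category/core-infra/"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_initial)

    def parse_initial(self, response):
        endpoint, query_args = get_loadmore_endpoints_and_params(response)
        for page in range(4):
            params = {
                "action": "loadmore",
                "queryArgs": json.dumps(query_args),
                "page": page,
                "post_type": "post",
            }
            loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
            yield scrapy.Request(url=loadmore_endpoint, callback=self.parse_loadmore)

    def parse_loadmore(self, response):
        # Change 2: the body is JSON, so decode it and feed the HTML string
        # into a Selector before running the css/xpath queries on it.
        selector = scrapy.Selector(text=response.json())
        for post in selector.css("article.post"):
            header = post.css("header.entry-header")
            title = header.css(".entry-title a::text").get().strip()
            url = header.css(".entry-title a::attr(href)").get()
            safe_title = re.sub(r"[^\w\s-]", "", title).replace(" ", "_")
            print(f"----title: {safe_title}, url: {url}----")

Running it, for example with scrapy runspider on a file containing the spider plus the helpers from the question, should print one line per post.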
Partial output: one ----title: <safe_title>, url: <url>---- line for each article.post matched in the load-more pages.