python BeautifulSoup从谷歌获取“f slp”项目

sg24os4d  于 2023-01-08  发布在  Python
关注(0)|答案(2)|浏览(120)

嗨,伙计们,我正试图从谷歌获取一些论文的引用次数。这是我的代码:

import urllib
import mechanize
from bs4 import BeautifulSoup

import csv
import os #change directory
import re #for regular expressions


# Question's script (Python 2): submit a paper title through Google's search
# form with mechanize and try to read the citation info from the result page.
br = mechanize.Browser()

br.set_handle_equiv(False)
br.set_handle_robots(False)   # ignore robots

# List of (header name, value) tuples; only the User-agent is set here.
br.addheaders = [('User-agent', 'Firefox')]
br.open('http://google.com/')

br.select_form(name='f')   # Note: select the form named 'f' here
# Spaces in the title are replaced with '+' by hand before the value is put
# into the form field.
# NOTE(review): the form submission presumably URL-encodes the value again,
# which would turn each '+' into a literal plus sign - confirm.
term = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease".replace(" ","+")
br.form['q'] = term # query
data = br.submit()

# Parse the response returned by submit(); no parser is named explicitly.
soup = BeautifulSoup(data)

# Every <div> whose class attribute is 'f slp'.
cite= soup.findAll('div',{'class': 'f slp'})
# Takes the second match, so this fails with IndexError whenever fewer than
# two such divs were found.
ref = str(cite[1])
# Python 2 print statement.
print ref

然而我总是出错。我想知道这篇论文的引用次数。

tp5buhyn

tp5buhyn1#

问题是在表单提交后,您获得的页面上没有引用信息,换句话说,没有f slp类的div
您可以使用以下几个选项来解决此问题:

另见:

希望能有所帮助。

toiithl6

toiithl62#

要从Google获取大量论文的引用次数,可以使用正则表达式(regular expressions)从snippet中提取“Cited by”信息:

# Text of the result's snippet element (CSS class "lEBKkf").
snippet = result.select_one(".lEBKkf").text
# .group() with no argument yields the whole matched text (e.g. "Cited by 238"),
# not only the digits captured by (\d+).
cited_by = re.search(r'Cited by (\d+)', snippet).group()

为了从所有页面收集信息,您需要使用带有while循环的分页。
只要“下一页”按钮存在(通过页面上是否存在对应的CSS选择器来判断,本例中为".d6cvqb a[id=pnnext]"),就需要将params["start"]的值增加10以访问下一个页面;否则退出while循环:

if soup.select_one('.d6cvqb a[id=pnnext]'):
    # A "next page" link (id "pnnext") is present: advance the result offset.
    params["start"] += 10
else:
    # No such link: stop the enclosing while loop.
    break

可以在在线IDE中查看并运行完整代码:

from bs4 import BeautifulSoup
import requests, json, re, lxml

# Collect title, snippet and "Cited by N" text for every organic Google result
# of `query`, following the "next page" link until there is none, then print
# the collected records as JSON.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": query,          # query
    "hl": "en",          # language
    "gl": "uk",          # country of the search, UK -> United Kingdom
    "start": 0,          # result offset: 0 is the first page, increased by 10 per page
    #"num": 100          # parameter defines the maximum number of results to return.
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

# Compiled once and reused for every result on every page.
CITED_BY_RE = re.compile(r'Cited by (\d+)')

page_num = 0

citations = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
        # select_one() returns None when the element is absent; test for that
        # explicitly instead of hiding every possible error behind a bare
        # `except:` (which would also swallow KeyboardInterrupt).
        title_tag = result.select_one(".DKV0Md")
        title = title_tag.text if title_tag is not None else None

        snippet_tag = result.select_one(".lEBKkf")
        snippet = snippet_tag.text if snippet_tag is not None else None

        # Keep the whole matched text (e.g. "Cited by 238"); None when the
        # result has no snippet or the snippet carries no citation count.
        match = CITED_BY_RE.search(snippet) if snippet is not None else None
        cited_by = match.group() if match is not None else None

        citations.append({
            "title": title,
            "snippet": snippet,
            "cited_by": cited_by,
        })

    # Continue only while the page offers a "next page" link.
    if soup.select_one('.d6cvqb a[id=pnnext]'):
        params["start"] += 10
    else:
        break

print(json.dumps(citations, indent=2, ensure_ascii=False))

输出示例:

[
  {
    "title": "Targeted therapeutic options and future perspectives for ...",
    "snippet": "by J Wang · 2019 · Cited by 238 — Since its launch in 1998, trastuzumab became a therapeutic for breast cancer patients with HER2 overexpression and is widely administrated as ...",
    "cited_by": "Cited by 238"
  },
  {
    "title": "Trastuzumab Regimens for HER2-Overexpressing Metastatic ...",
    "snippet": "by DR Spigel · 2003 · Cited by 30 — Multinational study of the efficacy and safety of humanized anti-HER2 ... breast cancer that has progressed after chemotherapy for metastatic disease.",
    "cited_by": "Cited by 30"
  },
  other results...
]

另外一个解决方案是使用SerpApi的Google Search Engine Results API。这是一个提供免费套餐的付费API,不同之处在于它会绕过Google的拦截(包括CAPTCHA),您无需自己编写并维护解析器。
代码示例:

from serpapi import GoogleSearch
import os, json

# Collect title, snippet and citation count for every organic result of
# `query` through SerpApi, walking all result pages, then print them as JSON.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"

params = {
    "api_key": "...",          # https://serpapi.com/manage-api-key
    "device": "desktop",       # device
    "engine": "google",        # serpapi parser engine
    "q": query,                # query
    "gl": "uk",                # country of the search, UK -> United Kingdom
    "hl": "en"                 # language
}

search = GoogleSearch(params)  # search client configured with the parameters above
pages = search.pagination()    # iterated below, one item per result page

citations = []

for page in pages:
    # A page may come back without an "organic_results" key; treat that as
    # "no results on this page" instead of failing with KeyError, in line
    # with the defensive .get() lookups used for the individual fields.
    for organic_result in page.get("organic_results", []):
        title = organic_result.get("title")
        snippet = organic_result.get("snippet")
        # Each level falls back to an empty dict so a missing key anywhere in
        # the chain simply yields None.
        cited_by = (
            organic_result
            .get("rich_snippet", {})
            .get("top", {})
            .get("detected_extensions", {})
            .get("cited_by")
        )

        citations.append({
            "title": title,
            "snippet": snippet,
            "cited_by": cited_by,
        })

print(json.dumps(citations, indent=2))

输出:

[
   {
    "title": "Targeting Bcl-2 in Herceptin-Resistant Breast Cancer Cell Lines",
    "snippet": "recombinant humanized anti-HER2 monoclonal antibody approved for treatment of HER2-overexpressing metastatic breast cancer. Clinical studies have shown that ...",
    "cited_by": 71
  },
  {
    "title": "Estabilidad a largo plazo del trastuzumab en plasma y suero ...",
    "snippet": "Multinational study of the efficacy and safety of humanized anti-HER2 monoclonal antibody in women who have HER2-overexpressing metastatic breast cancer ...",
    "cited_by": 1
  }
  other results...
]

相关问题