Python:代码可以正常运行,但 JSON 响应为空列表

k4emjkb1  于 2022-11-28  发布在  Python
关注(0)|答案(2)|浏览(92)

我试图运行下面的 Python 脚本,从 Google Scholar 中提取数据。然而,运行代码后,我得到的 JSON 响应是一个空列表。注意,所有必要的库都已安装。

import json
import sys

import requests
from bs4 import BeautifulSoup

# Scheme and host that site-relative Scholar links are joined onto.
BASE_URL = 'https://scholar.google.com'
# CSS selector for one search-result container.
RESULT_SELECTOR = '.gs_r.gs_or.gs_scl'

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

params = {
    'q': 'Machine learning',
    'hl': 'en'
}


def _text(node):
    """Return the text of a matched tag, or None when nothing matched."""
    return node.text if node is not None else None


def _href(node):
    """Return the ``href`` of a matched tag, or None when tag or attribute is absent."""
    return node.get('href') if node is not None else None


def _absolute(path):
    """Prefix a site-relative path with BASE_URL; a missing path stays None."""
    return f'{BASE_URL}{path}' if path else None


# A timeout keeps the script from hanging forever, and raise_for_status()
# surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(f'{BASE_URL}/scholar', headers=headers, params=params, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

# JSON data will be collected here
data = []

# Container where all needed data is located
for result in soup.select(RESULT_SELECTOR):
    data.append({
        'title': _text(result.select_one('.gs_rt')),
        'title_link': _href(result.select_one('.gs_rt a')),
        'publication_info': _text(result.select_one('.gs_a')),
        'snippet': _text(result.select_one('.gs_rs')),
        # Only found links get the host prefix; formatting None into the
        # f-string would produce the bogus URL 'https://scholar.google.comNone'.
        'cited_by': _absolute(_href(result.select_one('#gs_res_ccl_mid .gs_nph+ a'))),
        'related_articles': _absolute(_href(result.select_one('a:nth-child(4)'))),
        'all_article_versions': _absolute(_href(result.select_one('a~ a+ .gs_nph'))),
        'pdf_link': _href(result.select_one('.gs_or_ggsm a:nth-child(1)')),
    })

if not data:
    # An empty list means no element matched RESULT_SELECTOR in the HTML that
    # was actually returned, so point the user at that HTML.
    print(
        f'No elements matched {RESULT_SELECTOR!r}; inspect the returned HTML '
        '(it may not be a regular results page).',
        file=sys.stderr,
    )

print(json.dumps(data, indent=2, ensure_ascii=False))

输出:[]

1u4esq0p

1u4esq0p1#

你的代码运行得很好,问题在于如何把抓取到的数据正确地保存为 JSON 格式。你可以使用功能强大且简单易用的工具 Pandas DataFrame,把数据保存为 JSON 格式。

from bs4 import BeautifulSoup
import requests
#import json
import pandas as pd

# Scheme and host that site-relative Scholar links are joined onto.
BASE_URL = 'https://scholar.google.com'

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

params = {
    'q': 'Machine learning',
    'hl': 'en'
}


def _text(node):
    """Return the text of a matched tag, or None when nothing matched."""
    return node.text if node is not None else None


def _href(node):
    """Return the ``href`` of a matched tag, or None when tag or attribute is absent."""
    return node.get('href') if node is not None else None


def _absolute(path):
    """Prefix a site-relative path with BASE_URL; a missing path stays None."""
    return f'{BASE_URL}{path}' if path else None


# A timeout keeps the script from hanging forever, and raise_for_status()
# surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(f'{BASE_URL}/scholar', headers=headers, params=params, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

# JSON data will be collected here
data = []

# Container where all needed data is located
for result in soup.select('.gs_r.gs_or.gs_scl'):
    data.append({
        'title': _text(result.select_one('.gs_rt')),
        'title_link': _href(result.select_one('.gs_rt a')),
        'publication_info': _text(result.select_one('.gs_a')),
        'snippet': _text(result.select_one('.gs_rs')),
        # Only found links get the host prefix. A missing link stays None
        # (written as null) instead of becoming the bogus string
        # 'https://scholar.google.comNone' that the unconditional f-string produced.
        'cited_by': _absolute(_href(result.select_one('#gs_res_ccl_mid .gs_nph+ a'))),
        'related_articles': _absolute(_href(result.select_one('a:nth-child(4)'))),
        'all_article_versions': _absolute(_href(result.select_one('a~ a+ .gs_nph'))),
        'pdf_link': _href(result.select_one('.gs_or_ggsm a:nth-child(1)')),
    })

# to_json() returns None when given a path, so keep the DataFrame itself in
# `df` and write the file as a separate step.
df = pd.DataFrame(data)
df.to_json('out.json', indent=4)

输出:

{
    "title": {
        "0": "[BOOK][B] Machine learning",
        "1": "[BOOK][B] Machine learning",
        "2": "Machine learning",
        "3": "Machine learning: Trends, perspectives, and prospects",
        "4": "[PDF][PDF] Machine learning algorithms-a review",
        "5": "What is machine learning?",
        "6": "[PDF][PDF] Machine learning basics",
        "7": "What is machine learning? A primer for the epidemiologist",
        "8": "[BOOK][B] Readings in machine learning",
        "9": "[BOOK][B] Encyclopedia of machine learning"
    },
    "title_link": {
        "0": "https:\/\/books.google.com\/books?hl=en&lr=&id=ctM-EAAAQBAJ&oi=fnd&pg=PR6&dq=Machine+learning&ots=oZOqY0Vw_r&sig=Ide7KdAOWXxQwQKPxJKaps4Ag0g",
        "1": "https:\/\/profs.info.uaic.ro\/~ciortuz\/SLIDES\/2017s\/ml0.pdf",
        "2": "https:\/\/www.annualreviews.org\/doi\/pdf\/10.1146\/annurev.cs.04.060190.001351",
        "3": "https:\/\/www.science.org\/doi\/abs\/10.1126\/science.aaa8415",
        "4": "https:\/\/www.researchgate.net\/profile\/Batta-Mahesh\/publication\/344717762_Machine_Learning_Algorithms_-A_Review\/links\/5f8b2365299bf1b53e2d243a\/Machine-Learning-Algorithms-A-Review.pdf?eid=5082902844932096",
        "5": "https:\/\/link.springer.com\/chapter\/10.1007\/978-3-319-18305-3_1",
        "6": "http:\/\/whdeng.cn\/Teaching\/PPT_01_Machine%20learning%20Basics.pdf",
        "7": "https:\/\/academic.oup.com\/aje\/article-abstract\/188\/12\/2222\/5567515",
        "8": "https:\/\/books.google.com\/books?hl=en&lr=&id=UgC33U2KMCsC&oi=fnd&pg=PA1&dq=Machine+learning&ots=Thlmkd7Io7&sig=8wkVF31S9nKRAOY8a-OOF8DWRGI",
        "9": "https:\/\/books.google.com\/books?hl=en&lr=&id=i8hQhp1a62UC&oi=fnd&pg=PT29&dq=Machine+learning&ots=91ogCqhE8N&sig=7yz-s1SuD_e6HZe_-_5jF8lbld8"
    },
    "publication_info": {
        "0": "ZH Zhou - 2021 - books.google.com",
        "1": "TM Mitchell, TM Mitchell - 1997 - profs.info.uaic.ro",
        "2": "TG Dietterich\u00a0- Annual review of computer science, 1990 - annualreviews.org",
        "3": "MI Jordan, TM Mitchell\u00a0- Science, 2015 - science.org",
        "4": "B Mahesh\u00a0- International Journal of Science and Research (IJSR)\u00a0\u2026, 2020 - researchgate.net",
        "5": "I El Naqa, MJ Murphy\u00a0- machine learning in radiation oncology, 2015 - Springer",
        "6": "H Wang, Z Lei, X Zhang, B Zhou, J Peng\u00a0- Deep Learn, 2016 - whdeng.cn",
        "7": "Q Bi, KE Goodman, J Kaminsky\u2026\u00a0- American journal of\u00a0\u2026, 2019 - academic.oup.com",
        "8": "JW Shavlik, T Dietterich, TG Dietterich - 1990 - books.google.com",
        "9": "C Sammut, GI Webb - 2011 - books.google.com"
    },
    "snippet": {
        "0": "\u2026 machine learning. The second part includes Chapters 4\u201310, which presents some classic and \npopular machine learning \u2026 cover the core topics of machine learning in one semester, and \u2026",
        "1": "\u2026 Tom Mitchell (Definition of the [general] learning problem): \u201cA computer program is said \nto learn from experience E with respect to some class of tasks T and performance measure P\u00a0\u2026",
        "2": "Recent progress in the study of machine learning methods has taken many directions. First, \nin the area of inductive learning, a new formal definition of learning introduced by Leslie \u2026",
        "3": "\u2026 Machine learning addresses the question of how to build computers that improve \u2026 Recent \nprogress in machine learning has been driven both by the development of new learning \u2026",
        "4": "\u2026 Here\u201fsa quick look at some of the commonly used algorithms in machine learning (ML) \nSupervised Learning Supervised learning is the machine learning task of learning a function \u2026",
        "5": "\u2026 A machine learning algorithm is a computational process that \u2026 This training is the \u201clearning\u201d \npart of machine learning. The \u2026 can practice \u201clifelong\u201d learning as it processes new data and \u2026",
        "6": "\u2026 To obtain theoretical guarantees about generalization of a machine learning algorithm, we \n\u2026 Why does deep learning have different behavior than other machine learning methods for \u2026",
        "7": "\u2026 We provide a brief introduction to 5 common machine learning \u2026 of machine learning \ntechniques in the published literature. We recommend approaches to incorporate machine learning \u2026",
        "8": "\u2026 in machine learning. We have taught from these readings in our own machine learning \u2026 \nFurthermore, we in machine learning believe that learning techniques provide important con\u2026",
        "9": "\u2026 Machine Learning came to be identified as a research field in \u2026 machine learning appeared. \nAlthough the field coalesced in the \uf6dc\uf641\uf640\uf639s, research on what we now call machine learning \u2026"
    },
    "cited_by": {
        "0": "https:\/\/scholar.google.com\/scholar?cites=3387547533016043281&as_sdt=2005&sciodt=0,5&hl=en",
        "1": "https:\/\/scholar.google.com\/scholar?cites=5160851211484945804&as_sdt=2005&sciodt=0,5&hl=en",
        "2": "https:\/\/scholar.google.com\/scholar?cites=7073378272324684978&as_sdt=2005&sciodt=0,5&hl=en",
        "3": "https:\/\/scholar.google.com\/scholar?cites=10883068066968164261&as_sdt=2005&sciodt=0,5&hl=en",
        "4": "https:\/\/scholar.google.com\/scholar?cites=15194857180303073201&as_sdt=2005&sciodt=0,5&hl=en",
        "5": "https:\/\/scholar.google.com\/scholar?cites=13248080025875046634&as_sdt=2005&sciodt=0,5&hl=en",
        "6": "https:\/\/scholar.google.com\/scholar?cites=2537307997858018983&as_sdt=2005&sciodt=0,5&hl=en",
        "7": "https:\/\/scholar.google.com\/scholar?cites=16719333272424362284&as_sdt=2005&sciodt=0,5&hl=en",
        "8": "https:\/\/scholar.google.com\/scholar?cites=2031020440241972606&as_sdt=2005&sciodt=0,5&hl=en",
        "9": "https:\/\/scholar.google.com\/scholar?cites=16791323098365028130&as_sdt=2005&sciodt=0,5&hl=en"
    },
    "related_articles": {
        "0": "https:\/\/scholar.google.com\/scholar?q=related:EQ8shYj8Ai8J:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "1": "https:\/\/scholar.google.com\/scholar?q=related:jF00X9UGn0cJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "2": "https:\/\/scholar.google.com\/scholar?q=related:sgzh8w-wKWIJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "3": "https:\/\/scholar.google.com\/scholar?q=related:pdcI9r5sCJcJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "4": "https:\/\/scholar.google.com\/scholar?q=related:sR_ChBn63tIJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "5": "https:\/\/scholar.google.com\/scholar?q=related:6uA6mpei2rcJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "6": "https:\/\/scholar.google.com\/scholar?q=related:p7YVSi5UNiMJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "7": "https:\/\/scholar.google.com\/scholar?q=related:LDE5SAcBB-gJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "8": "https:\/\/scholar.google.com\/scholar?q=related:fiUuYFSiLxwJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5",
        "9": "https:\/\/scholar.google.com\/scholar?q=related:IufbymTDBukJ:scholar.google.com\/&scioq=Machine+learning&hl=en&as_sdt=0,5"
    },
    "all_article_versions": {
        "0": "https:\/\/scholar.google.comNone",
        "1": "https:\/\/scholar.google.com\/scholar?cluster=5160851211484945804&hl=en&as_sdt=0,5",
        "2": "https:\/\/scholar.google.com\/scholar?cluster=7073378272324684978&hl=en&as_sdt=0,5",
        "3": "https:\/\/scholar.google.com\/scholar?cluster=10883068066968164261&hl=en&as_sdt=0,5",
        "4": "https:\/\/scholar.google.com\/scholar?cluster=15194857180303073201&hl=en&as_sdt=0,5",
        "5": "https:\/\/scholar.google.com\/scholar?cluster=13248080025875046634&hl=en&as_sdt=0,5",
        "6": "https:\/\/scholar.google.com\/scholar?cluster=2537307997858018983&hl=en&as_sdt=0,5",
        "7": "https:\/\/scholar.google.com\/scholar?cluster=16719333272424362284&hl=en&as_sdt=0,5",
        "8": "https:\/\/scholar.google.com\/scholar?cluster=2031020440241972606&hl=en&as_sdt=0,5",
        "9": "https:\/\/scholar.google.com\/scholar?cluster=16791323098365028130&hl=en&as_sdt=0,5"
    },
    "pdf_link": {
        "0": null,
        "1": "https:\/\/profs.info.uaic.ro\/~ciortuz\/SLIDES\/2017s\/ml0.pdf",
        "2": "https:\/\/web.engr.oregonstate.edu\/~tgd\/publications\/arcs.ps.gz",
        "3": "http:\/\/www.cs.cmu.edu\/~tom\/pubs\/Science-ML-2015.pdf",
        "4": "https:\/\/www.researchgate.net\/profile\/Batta-Mahesh\/publication\/344717762_Machine_Learning_Algorithms_-A_Review\/links\/5f8b2365299bf1b53e2d243a\/Machine-Learning-Algorithms-A-Review.pdf?eid=5082902844932096",
        "5": null,
        "6": "http:\/\/whdeng.cn\/Teaching\/PPT_01_Machine%20learning%20Basics.pdf",
        "7": null,
        "8": null,
        "9": null
    }
}
mum43rcc

mum43rcc2#

这将允许您将结果写入JSON文件。

import requests
from bs4 import BeautifulSoup

import json

start_url = 'https://scholar.google.com/scholar'

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

params = {
    'q': 'Machine learning',
    'hl': 'en'
}


def _text_or_blank(node):
    """Return the text of a matched tag, or "" when nothing matched."""
    return node.text if node is not None else ""


def _href_or_blank(node):
    """Return the ``href`` of a matched tag, or "" when tag or attribute is absent."""
    return node.get('href', "") if node is not None else ""


# A timeout keeps the script from hanging forever, and raise_for_status()
# surfaces HTTP errors instead of silently parsing an error page.
res = requests.get(start_url, headers=headers, params=params, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'lxml')

data = []
for result in soup.select('#gs_res_ccl_mid > [data-lid]'):
    # Look the title anchor up once; it supplies both the text and the link.
    title_anchor = result.select_one('h3 > a[href]')

    # Every field falls back to "" when its element is missing, so one
    # incomplete result no longer aborts the whole run.
    data.append({
        'title': _text_or_blank(title_anchor),
        'title_link': _href_or_blank(title_anchor),
        'publication_info': _text_or_blank(result.select_one('.gs_a')),
        'snippet': _text_or_blank(result.select_one('.gs_rs')),
        'cited_by': _href_or_blank(result.select_one("a:-soup-contains('Cited by')")),
        'related_articles': _href_or_blank(result.select_one("a:-soup-contains('Related articles')")),
        'all_article_versions': _href_or_blank(result.select_one("a.gs_nph:-soup-contains('versions')")),
        'pdf_link': _href_or_blank(result.select_one('.gs_or_ggsm > a[href]')),
    })

print(json.dumps(data, indent=4))

# ensure_ascii=False writes non-ASCII characters as-is; the file is opened
# as UTF-8 to match.
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

相关问题