Google Scholar and R

m3eecexj · asked on 2023-07-31

I want to extract some basic information (Title_name, Author_Names, Year_Publication, Title_URL, and cited_by) from every Google Scholar results page, but as a test I want to pull it from just two pages.
The goal of this web scraping is to build a list of studies for a literature review, feeding into a meta-analysis.
I have been trying to edit the following code, but with no luck:

# Install and load the necessary packages
#install.packages("RSelenium")
##install.packages("rvest")
#install.packages("stringr")

library(RSelenium)
library(rvest)
library(stringr)

# Start a Selenium server and open Chrome browser

rD <- rsDriver(browser = "chrome", chromever = "latest", geckover = "latest", 
               IEDriverVersion = NULL, verbose = FALSE, check = TRUE, 
               extraCapabilities = NULL, verboseInfo = FALSE, checkInterval = 1000, 
               timeout = 20000, whitelist = NULL, checkPath = TRUE, port = 4445L, 
               phantomver = NULL, 
               chromepath = NULL, firefoxpath = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe")

remDr <- rD$client

# Define your search terms
search_terms <- "((COVID OR COVID-19))"

# Function to extract data from a page
extract_data <- function(page_source) {
  page <- read_html(page_source)
  titles <- page %>% html_nodes(".gs_rt") %>% html_text()
  authors <- page %>% html_nodes(".gs_a") %>% html_text()
  years <- str_extract(authors, "\\d{4}")
  authors <- str_replace(authors, "\\d{4}", "")
  urls <- page %>% html_nodes(".gs_rt a") %>% html_attr("href")
  cited_by <- page %>% html_nodes(".gs_fl a:nth-child(3)") %>% html_text()
  cited_by <- as.integer(str_extract(cited_by, "\\d+"))
  
  data.frame(Title_name = titles, Author_Names = authors, Year_Publication = years, Title_URL = urls, cited_by = cited_by)
}

# Function to search for a specific term on Google Scholar
search_google_scholar <- function(term) {
  tryCatch({
    remDr$navigate("https://scholar.google.com/")
    search_box <- remDr$findElement("css", "#gs_hdr_tsi")
    search_box$sendKeysToElement(list(term, key="enter"))
    Sys.sleep(5) # Allow time for page to load
    
    pages <- 2 # Number of pages to scrape 
    results <- data.frame()
    
    for (page in 1:pages) {
      page_source <- remDr$getPageSource()[[1]]
      page_data <- extract_data(page_source)
      results <- rbind(results, page_data)
      
      next_button <- remDr$findElement("css", "#gs_n a")
      if (length(next_button) == 0) {
        break
      } else {
        next_button$clickElement()
        Sys.sleep(5) # Allow time for page to load
      }
    }
    
    return(results)
  }, error = function(e) {
    message("An error occurred: ", conditionMessage(e))
    NULL
  })
}

# Execute the search and scrape the data
search_results <- search_google_scholar(search_terms)

# Close the browser
remDr$close()
rD$server$stop()

Can anyone help me fix the code above, or suggest a simple workaround?

q9yhzks0 · answer #1

I found the problem: Google Chrome was not starting because I was missing the ChromeDriver executable. I downloaded it from https://googlechromelabs.github.io/chrome-for-testing/#stable (match the Chrome version listed there to the version of Chrome you have installed; I had to upgrade my Chrome).
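
As an aside, rsDriver() also accepts a chromever argument, so a matching driver build can be requested directly instead of "latest" (a minimal sketch; the version string is only an example and must match your installed Chrome):

# See which chromedriver builds are already cached locally
# (binman is installed as a dependency of RSelenium)
binman::list_versions("chromedriver")

# Request a specific chromedriver build rather than "latest"
rD <- rsDriver(browser = "chrome", chromever = "115.0.5790.102")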
I also edited the code. Below is a working version that scrapes the first two pages of Google Scholar results.

# Load the necessary packages
library(RSelenium)
library(rvest)
library(stringr)

# Specify the path to the chromedriver executable
chromedriver_path <- "C:/Program Files/Google/Chrome/Application/115.0.5790.102/chromedriver.exe"  # Replace with the actual path

# Expose the chromedriver path to the Selenium server via an environment
# variable (whether this is picked up automatically depends on your
# RSelenium/wdman versions)
Sys.setenv(CHROMEDRIVER_PATH = chromedriver_path)

# Start the Selenium server and open the Chrome browser
rD <- rsDriver(browser = "chrome")

# Get the remote driver (remDr) object
remDr <- rD[["client"]]

# Define your search terms
search_terms <- "((COVID OR COVID-19))"

# Function to extract data from a page
extract_data <- function(page_source) {
  page <- read_html(page_source)
  titles <- page %>% html_nodes(".gs_rt") %>% html_text()   # result titles
  authors <- page %>% html_nodes(".gs_a") %>% html_text()   # author/venue/year line
  years <- str_extract(authors, "\\d{4}")                   # first four-digit number = year
  authors <- str_replace(authors, "\\d{4}", "")             # drop the year from the author line
  urls <- page %>% html_nodes(".gs_rt a") %>% html_attr("href")
  cited_by <- page %>% html_nodes(".gs_fl a:nth-child(3)") %>% html_text()  # "Cited by N" link
  cited_by <- as.integer(str_extract(cited_by, "\\d+"))
  
  data.frame(Title_name = titles, Author_Names = authors,
             Year_Publication = years, Title_URL = urls, cited_by = cited_by)
}
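
# A sketch (not from the original answer) of a per-result variant: selecting
# each result container first keeps the columns aligned even when a result
# lacks a link or a "Cited by" count, because rvest's html_element() returns
# NA for a missing child node. Assumes the same selectors as above and
# rvest >= 1.0.
extract_data_safe <- function(page_source) {
  page <- read_html(page_source)
  results <- html_elements(page, ".gs_ri")  # one node per search result
  authors <- html_text2(html_element(results, ".gs_a"))
  data.frame(
    Title_name       = html_text2(html_element(results, ".gs_rt")),
    Author_Names     = str_replace(authors, "\\d{4}", ""),
    Year_Publication = str_extract(authors, "\\d{4}"),
    Title_URL        = html_attr(html_element(results, ".gs_rt a"), "href"),
    cited_by         = as.integer(str_extract(
      html_text2(html_element(results, ".gs_fl a:nth-child(3)")), "\\d+"))
  )
}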

# Function to search for a specific term on Google Scholar
search_google_scholar <- function(term) {
  tryCatch({
    remDr$navigate("https://scholar.google.com/")
    search_box <- remDr$findElement("css", "#gs_hdr_tsi")
    search_box$sendKeysToElement(list(term, key="enter"))
    Sys.sleep(5) # Allow time for page to load
    
    pages <- 2 # Number of pages to scrape 
    results <- data.frame()
    
    for (page in 1:pages) {
      page_source <- remDr$getPageSource()[[1]]
      page_data <- extract_data(page_source)
      results <- rbind(results, page_data)
      
      # Only click through when another page is still needed. findElement()
      # throws an error when nothing matches (which the tryCatch above would
      # turn into a NULL result), so use findElements(), which returns an
      # empty list instead.
      if (page < pages) {
        next_button <- remDr$findElements("css", "#gs_n a")
        if (length(next_button) == 0) {
          break
        }
        next_button[[1]]$clickElement()
        Sys.sleep(5) # Allow time for page to load
      }
    }
    
    return(results)
  }, error = function(e) {
    message("An error occurred: ", conditionMessage(e))
    NULL
  })
}
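
# Note: as an alternative to clicking through the pagination bar, result pages
# can be requested directly via Google Scholar's `start` URL parameter
# (10 results per page), e.g. for page 2 of the query:
# remDr$navigate(paste0("https://scholar.google.com/scholar?q=",
#                       URLencode(search_terms), "&start=10"))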

# Execute the search and scrape the data
search_results <- search_google_scholar(search_terms)

# Close the browser
remDr$close()

# Stop the Selenium server
rD$server$stop()
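
Once search_results is populated, it can be written out so the list feeds straight into the literature-review workflow (the file name is just an example):

write.csv(search_results, "scholar_results.csv", row.names = FALSE)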

Of course, the scraping is not perfect for many of the columns: artifacts such as [HTML] and [PDF] tags end up in the author column. So I still need to preprocess the data further to automate my literature review, but this gets me close enough. I hope it helps future researchers.
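
As one example of that preprocessing, the bracketed tags can be stripped with stringr (a sketch; extend the tag list as needed):

search_results$Title_name   <- str_remove_all(search_results$Title_name,
                                              "\\[(HTML|PDF|CITATION|BOOK)\\]\\s*")
search_results$Author_Names <- str_remove_all(search_results$Author_Names,
                                              "\\[(HTML|PDF|CITATION|BOOK)\\]\\s*")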
