rvest:UseMethod(“xml_find_all”)中的错误:没有适用的'xml_find_all'方法应用于类“list”的对象

fnvucqvd  于 2023-06-19  发布在  其他
关注(0)|答案(2)|浏览(196)

环境数据倡议(EDI)是一个存储来自多个站点的数据集的存储库。我想抓取某一个站点(参见此处的示例链接)下每个数据集的开始日期和结束日期。

  • 一个位置的每个数据集都包含一个指向元数据URL的链接,该URL列出了数据集的开始和结束日期(see example link here)。

我下面的代码尝试使用for循环来提取每个数据集(即Package Id)的唯一ID,然后用于为每个Package Id创建元数据页面URL。
但是,我的for循环在尝试从每个元数据页面中抓取开始日期时抛出了一个错误。

  • 错误:Error in UseMethod("xml_find_all") : no applicable method for 'xml_find_all' applied to an object of class "list"

如何调整for循环来提取每个Package Id的开始和结束日期?

library(rvest)
library(xml2)
library(dplyr)
library(purrr)

# Search results page listing the data packages of one EDI scope (knb-lter-and)
url <- "https://portal.edirepository.org/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150"
webpage <- read_html(url)

# Initialize vectors to store the data
# NOTE: `package_ids` is overwritten just below by a one-column data frame,
# so only the two time_periods_* vectors are actually accumulated into.
package_ids <- character()
time_periods_begin <- character()
time_periods_end <- character()

# Extract the Package Id
# (takes the 4th table parsed from the page and keeps only the Package Id
# column, renamed to `PackageId`)
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id  ▵▿`) %>%
  rename(PackageId = `Package Id  ▵▿`)

# Iterate over each PackageId row
# NOTE: the loop index `i` is never used in the body; every iteration works
# on the whole `package_ids$PackageId` column.
for (i in 1:length(package_ids$PackageId)) {
  
  # Construct the URL for the "View Full Metadata" page
  # (paste0 is vectorised, so this is a character vector with one URL per
  # Package Id, not a single URL)
  package_id_link <- paste0("https://portal.edirepository.org/nis/metadataviewer?packageid=", package_ids$PackageId)
  
  # Navigate to the "View Full Metadata" page
  # (map() returns a plain list holding one parsed document per URL)
  metadata_page <- map(package_id_link, read_html)
  
  # Extract the Begin and End (this is where the error lives)
  # html_nodes() receives the list built above rather than a single document,
  # which matches the reported error: no applicable method for
  # 'xml_find_all' applied to an object of class "list".
  time_period_begin <- html_nodes(metadata_page, "tr:contains('Begin') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  
  # Append this iteration's Begin value(s) to the running vector
  time_periods_begin <- c(time_periods_begin, time_period_begin)
  
  # Same extraction for the End value; it is also applied to the list
  time_period_end <- html_nodes(metadata_page, "tr:contains('End') td:nth-child(2)") %>%
    html_text() %>%
    trimws()
  
  # Append this iteration's End value(s) to the running vector
  time_periods_end <- c(time_periods_end, time_period_end)
}

输出应该如下所示

# Combine the Package Ids with their Begin and End values into one table,
# one row per data package
data_frame <- data.frame(
  PackageId = package_id,
  Begin = time_periods_begin,
  End = time_periods_end
)

# Display the assembled table
data_frame

            PackageId      Begin        End
1 knb-lter-and.2719.6 1971-06-01 2002-03-11
2 knb-lter-and.2720.8 1958-01-01 1979-01-01
3 knb-lter-and.2721.6 1975-01-01 1995-01-01

更新1

我已经可以获取单个数据集的 PackageId、Begin 和 End。在上面的代码中,我也可以得到每个数据集的元数据 URL。现在只需要弄清楚如何对这 147 个元数据 URL 中的每一个提取 PackageId、Begin 和 End。

# Metadata page of one data package, used to try out the selectors
url <- "https://portal.edirepository.org/nis/metadataviewer?packageid=knb-lter-and.4525.10"
webpage <- read_html(url)

# Package Id: text of the first cell matched by "td.rowodd + td.roweven"
package_id <- webpage %>%
  html_nodes("td.rowodd + td.roweven") %>%
  .[1] %>%
  html_text()

# Begin: text of the first cell that directly follows a "Begin:" cell
time_periods_begin <- webpage %>%
  html_nodes("td:contains('Begin:') + td") %>%
  .[1] %>%
  html_text()

# End: text of the first cell that directly follows an "End:" cell
time_periods_end <- webpage %>%
  html_nodes("td:contains('End:') + td") %>%
  .[1] %>%
  html_text()

# One-row table holding the three scraped values
data_frame <- data.frame(
  PackageId = package_id,
  Begin = time_periods_begin,
  End = time_periods_end
)

data_frame
gz5pxeao

gz5pxeao1#

library(tidyverse)
library(rvest)
library(janitor)

# Download and parse the search results page that lists the data packages
page <- read_html(
  "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false&start=0&rows=150"
)

# Scrape the begin and end values from the metadata page of one data package.
#
# package_id: Package Id string appended to the metadata viewer URL.
#
# Returns a tibble that always has exactly one row, with character columns
# `begin` and `end`. The first two ".roweven" values found inside the first
# ".subgroup.onehundred_percent" element are taken as begin and end. A value
# that cannot be found is returned as NA instead of producing an empty tibble,
# so a caller that unnests the result does not silently lose the package.
scraper <- function(package_id) {
  # Progress output, one line per package
  cat("Scraping", package_id, "\n")

  values <- str_c(
    "https://portal.edirepository.org/nis/metadataviewer?packageid=",
    package_id
  ) %>%
    read_html() %>%
    # html_element() (singular) yields the first match or a missing node, so
    # the next step still works when the page has no such element
    html_element(".subgroup.onehundred_percent") %>%
    html_elements(".roweven") %>%
    html_text2()

  tibble(
    begin = pluck(values, 1, .default = NA_character_),
    end = pluck(values, 2, .default = NA_character_)
  )
}

# Build the result table: take the 4th table parsed from the search page,
# standardise its column names (the steps below rely on `title` and
# `package_id`), tidy the titles, then scrape every package and spread the
# returned begin/end tibble into regular columns.
data <- page %>%
  html_table() %>%
  pluck(4) %>%
  clean_names() %>%
  mutate(
    # drop embedded newlines, then collapse repeated whitespace
    title = str_squish(str_remove_all(title, "\\n")),
    # one tibble of dates per package, stored as a list column
    date = map(package_id, scraper)
  ) %>%
  unnest(date)

   title                                                                                                      creators publication_date package_id begin end  
   <chr>                                                                                                      <chr>               <int> <chr>      <chr> <chr>
 1 Invertebrates of the Andrews Experimental Forest: An annotated list of insects and other arthropods, 1971… Andrews…             2014 knb-lter-… 1971… 2002…
 2 Vascular plant list on the Andrews Experimental Forest and nearby Research Natural Areas, 1958 to 1979     Andrews…             2014 knb-lter-… 1958… 1979…
 3 Bird species list for the Andrews Experimental Forest and Upper McKenzie River Basin, 1975 to 1995         Andrews…             2014 knb-lter-… 1975… 1995…
 4 Amphibian and reptile list of the Andrews Experimental Forest, 1975 to 1995                                Andrews…             2014 knb-lter-… 1975… 1995…
 5 Moss species list of the Andrews Experimental Forest, 1991                                                 Andrews…             2013 knb-lter-… 1991… 1991…
 6 Mammal species list of the Andrews Experimental Forest, 1971 to 1976                                       Anthony…             2014 knb-lter-… 1971… 1976…
 7 Ecohydrology and Ecophysiology intensively measured plots in Watershed 1, Andrews Experimental Forest, 20… Andrews…             2016 knb-lter-… 2005… 2011…
 8 A Study of Hyporheic Characteristics Along a Longitudinal Profile of Lookout Creek, Oregon, 2003           Andrews…             2013 knb-lter-… 2003… 2003…
 9 Annual tree productivity in permanent plots within the H.J. Andrews Experimental Forest                    Andrews…             2013 knb-lter-… 2000… 2004…
10 Epiphytic macrolichens in relation to forest management and topography in a western Oregon watershed, 199… Andrews…             2014 knb-lter-… 1997… 1999…
bhmjp9jg

bhmjp9jg2#

下面介绍如何从每个元数据文件中抓取包ID、开始日期和结束日期

library(rvest)
library(dplyr)

# EDI webpage for Andrews LTER datasets
url <- "http://portal.edirepository.org:80/nis/simpleSearch?defType=edismax&q=*:*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:(knb-lter-and)&fl=id,packageid,title,author,organization,pubdate,coordinates&debug=false"
webpage <- read_html(url)

# Extract each of the Package Ids (4th table parsed from the results page)
package_ids <- webpage %>%
  html_table() %>%
  .[[4]] %>%
  select(`Package Id  ▵▿`) %>%
  rename(PackageId = `Package Id  ▵▿`)

# Each distinct Package Id is visited exactly once
zz <- unique(package_ids$PackageId)

# Scrape Package ID, Begin Date and End Date from one metadata page.
#
# id:       a single Package Id string.
# base_url: metadata viewer URL prefix the id is appended to.
#
# Returns a one-row data frame with character columns PackageId, Begin, End.
# html_node() yields a missing node when a selector matches nothing, and
# html_text() turns that into NA, so an incomplete page gives NA values
# instead of stopping the whole run.
scrape_metadata <- function(
    id,
    base_url = "https://portal.edirepository.org/nis/metadataviewer?packageid=") {
  metadata_page <- read_html(paste0(base_url, id))
  first_text <- function(css) html_text(html_node(metadata_page, css))

  data.frame(
    PackageId = first_text("td.rowodd + td.roweven"),
    Begin = first_text("td:contains('Begin:') + td"),
    End = first_text("td:contains('End:') + td")
  )
}

# Iterate over the unique ids (not over the row count of the table, which can
# differ from length(zz)) and bind all rows once at the end instead of growing
# three matrices with rbind() on every iteration. lapply() over an empty `zz`
# simply does nothing, unlike a 1:length(...) loop.
data_frame <- bind_rows(lapply(zz, scrape_metadata))

data_frame
              PackageId      Begin        End
1   knb-lter-and.2719.6 1971-06-01 2002-03-11
2   knb-lter-and.2720.8 1958-01-01 1979-01-01
3   knb-lter-and.2721.6 1975-01-01 1995-01-01
4   knb-lter-and.2722.6 1975-01-01 1995-01-01
5   knb-lter-and.2725.6 1991-06-01 1991-08-01
6   knb-lter-and.2726.6 1971-01-01 1976-01-01
7  knb-lter-and.4528.10 2005-09-30 2011-05-05
8   knb-lter-and.4541.3 2003-06-14 2003-11-15
9   knb-lter-and.4544.4 2000-06-01 2004-09-30
10  knb-lter-and.4547.5 1997-09-23 1999-09-15

相关问题