R语言 如何遍历列表并将结果存储在数据框中?

jxct1oxe  于 2023-10-13  发布在  其他
关注(0)|答案(1)|浏览(146)

这是我之前提出的一个问题(见 here)的后续。
我想学习如何遍历一个列表,然后把每次循环得到的结果存储(或绑定)在一起。
目标是首先创建一个比赛 id 列表,把每个 id 拼接到一个通用 url 之后,然后循环访问这些 url 以获取 boxscore 表,最后将所有结果存储(或合并)到一个数据框中。
game_ids的定义如下:

# ESPN game ids to scrape, stored as a list with one id per element
game_ids <- as.list(c(401559239, 401559240))

URL:

# Common URL prefix; a game id is appended after a "/" to form the page URL
url_ <- "https://www.espn.com/nhl/boxscore/_/gameId"

然后,我将循环的代码添加到建议解决方案的开头:

for (game_id in game_ids) {
  url2 = paste(url_, game_id, sep = '/')

完整代码:

library(rvest)
library(dplyr)
library(purrr)
library(tidyr)

# Scrape the box-score tables for each game id and build one skater table
# per game.
# game_ids: list of ESPN game ids; url_: common URL prefix for box-score pages.
game_ids <- list(401559239,401559240)
url_ = "https://www.espn.com/nhl/boxscore/_/gameId"
    
for (game_id in game_ids) {
  # page URL for this game: "<url_>/<game_id>"
  url2 = paste(url_, game_id, sep = '/')
  
  # boxscore: list named by team; each element is list(tbl_1, tbl_2)
  boxscore <- read_html(url2) %>% 
    # extract team sections (2)
    html_elements("div.Boxscore div.Wrapper") %>% 
    # extract team names, use as list element names
    set_names(html_elements(., ".BoxscoreItem__TeamName") %>% html_text()) %>% 
    # extract table elements, 4 per team
    map(\(team_section) html_elements(team_section, "table")) %>% 
    map(\(team_tables) list(
      # bind tables 1 & 2 (skaters/defensemen and data section)
      tbl_1 = html_table(team_tables[1:2]) %>% 
        bind_cols(.name_repair = "minimal") %>% 
        # column names from the first row
        set_names(.[1,]) %>% 
        rename(player = Skaters) %>% 
        # position to separate column: header rows (G == "G") carry the
        # section label in `player`; it is filled down onto the rows below
        mutate(position = if_else(G == "G", player, NA), .before = 1) %>% 
        fill(position, .direction = "down") %>% 
        # remove rows with header info
        filter(G != "G"),
      # bind tables 3 & 4 (goalies and data section)
      tbl_2 = html_table(team_tables[3:4]) %>% 
        bind_cols(.name_repair = "minimal") %>% 
        set_names(.[1,]) %>% 
        # remove rows with header info
        filter(SA != "SA")
    ) 
    )
  # Stack the per-team tbl_1 tables, keeping the team name in a `team` column.
  # NOTE: `output` is reassigned on every iteration, so once the loop finishes
  # it holds only the table for the last game id. tbl_2 is not used here.
  output = boxscore %>% 
    map("tbl_1") %>% 
    list_rbind(names_to = "team")   
}

我把它添加到循环的末尾(在list_rbind之后):

print(res)

我可以看到两个游戏的结果:

[1] "https://www.espn.com/nhl/boxscore/_/gameId/401559239"
# A tibble: 36 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Mont… Skaters  J. An… 0     0     -1    3     1     1     1     2     2     0     1     25    17:31 1:46  0:00 
 2 Mont… Skaters  C. Ca… 1     0     0     1     1     1     0     0     2     0     1     23    19:11 3:59  0:00 
 3 Mont… Skaters  K. Da… 0     2     2     3     0     0     0     0     3     1     0     23    21:22 3:59  0:15 
 4 Mont… Skaters  J. Ev… 1     0     1     1     0     0     1     2     3     2     0     19    11:22 0:12  2:27 
 5 Mont… Skaters  B. Ga… 0     0     0     0     0     1     0     0     1     0     0     17    12:34 1:50  0:00 
 6 Mont… Skaters  R. Ha… 0     1     0     0     1     2     0     0     3     0     1     18    13:39 0:11  2:56 
 7 Mont… Skaters  S. Mo… 0     0     -1    3     2     1     0     0     0     1     0     22    18:51 4:00  1:10 
 8 Mont… Skaters  A. Ne… 2     0     1     2     0     0     1     2     2     0     2     22    16:46 1:49  0:00 
 9 Mont… Skaters  T. Pe… 0     0     0     0     1     0     0     0     4     0     0     18    12:10 0:00  1:10 
10 Mont… Skaters  J. Sl… 0     1     2     1     3     0     0     0     3     2     1     20    15:25 1:28  0:00 
# ℹ 26 more rows
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
# ℹ Use `print(n = ...)` to see more rows
[1] "https://www.espn.com/nhl/boxscore/_/gameId/401559240"
# A tibble: 35 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Otta… Skaters  D. Ba… 0     0     -1    5     3     0     0     0     1     0     0     24    18:21 4:44  0:18 
 2 Otta… Skaters  R. Ch… 0     0     1     3     0     0     0     0     4     0     0     24    13:28 0:07  2:51 
 3 Otta… Skaters  C. Gi… 0     0     -1    1     1     1     1     2     1     0     1     29    19:48 4:19  1:45 
 4 Otta… Skaters  R. Gr… 0     0     -1    2     0     0     0     0     0     0     0     26    16:04 3:49  3:15 
 5 Otta… Skaters  M. Jo… 1     1     1     1     2     2     0     0     0     1     0     29    17:52 0:21  6:00 
 6 Otta… Skaters  M. Ka… 0     0     -1    0     0     1     1     2     0     0     0     15    7:33  0:00  0:33 
 7 Otta… Skaters  P. Ke… 1     1     0     2     1     0     0     0     0     0     0     25    12:40 0:14  4:42 
 8 Otta… Skaters  D. Ku… 0     0     0     3     0     0     0     0     0     0     1     21    14:32 2:58  0:00 
 9 Otta… Skaters  T. St… 1     0     -1    3     0     0     2     4     1     0     2     28    21:44 5:25  2:12 
10 Otta… Skaters  V. Ta… 0     0     0     0     1     0     0     0     1     0     0     23    13:12 3:00  0:00 
# ℹ 25 more rows
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>
# ℹ Use `print(n = ...)` to see more rows

但我不知道如何将它们绑定在一个数据框中。
我试了 res = rbind(output) 和 res = do.call(rbind, output),
但两者都只返回最后一场比赛的结果:

> head(res,10)
# A tibble: 10 × 22
   team  position player G     A     `+/-` S     SM    BS    PN    PIM   HT    TK    GV    SHFT  TOI   PPTOI SHTOI
   <chr> <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 Otta… Skaters  D. Ba… 0     0     -1    5     3     0     0     0     1     0     0     24    18:21 4:44  0:18 
 2 Otta… Skaters  R. Ch… 0     0     1     3     0     0     0     0     4     0     0     24    13:28 0:07  2:51 
 3 Otta… Skaters  C. Gi… 0     0     -1    1     1     1     1     2     1     0     1     29    19:48 4:19  1:45 
 4 Otta… Skaters  R. Gr… 0     0     -1    2     0     0     0     0     0     0     0     26    16:04 3:49  3:15 
 5 Otta… Skaters  M. Jo… 1     1     1     1     2     2     0     0     0     1     0     29    17:52 0:21  6:00 
 6 Otta… Skaters  M. Ka… 0     0     -1    0     0     1     1     2     0     0     0     15    7:33  0:00  0:33 
 7 Otta… Skaters  P. Ke… 1     1     0     2     1     0     0     0     0     0     0     25    12:40 0:14  4:42 
 8 Otta… Skaters  D. Ku… 0     0     0     3     0     0     0     0     0     0     1     21    14:32 2:58  0:00 
 9 Otta… Skaters  T. St… 1     0     -1    3     0     0     2     4     1     0     2     28    21:44 5:25  2:12 
10 Otta… Skaters  V. Ta… 0     0     0     0     1     0     0     0     1     0     0     23    13:12 3:00  0:00 
# ℹ 4 more variables: ESTOI <chr>, FW <chr>, FL <chr>, `FO%` <chr>

如何保存每场比赛的输出,并把它们合并到同一个数据框(data frame)中?

v1uwarro

v1uwarro1#

代码这么多,确实不容易一眼看出问题 :) 不过,解决方案很简单:
在每次迭代中,你都覆盖了 output 变量,因此最后只能看到最后一次迭代(最后一场比赛)的结果。要解决这个问题,可以在循环之前创建一个空列表作为 output,并在每次迭代中向其中填入一个元素;循环结束后,再把这些元素按行绑定在一起。类似下面这样应该可行:

# Scrape the box-score tables from one ESPN game page.
#
# url: full URL of the box-score page.
# Returns a list named by team; each element is a list holding
#   tbl_1 - skaters/defensemen table with a leading `position` column
#   tbl_2 - goalies table
read_boxscore <- function(url) {
  read_html(url) %>%
    # extract team sections (2)
    html_elements("div.Boxscore div.Wrapper") %>%
    # extract team names, use as list element names
    set_names(html_elements(., ".BoxscoreItem__TeamName") %>% html_text()) %>%
    # extract table elements, 4 per team
    map(\(team_section) html_elements(team_section, "table")) %>%
    map(\(team_tables) list(
      # bind tables 1 & 2 (skaters/defensemen and data section)
      tbl_1 = html_table(team_tables[1:2]) %>%
        bind_cols(.name_repair = "minimal") %>%
        # column names from the first row
        set_names(.[1, ]) %>%
        dplyr::rename(player = Skaters) %>%
        # position to separate column; NA_character_ keeps both branches
        # of if_else() the same type as `player`
        mutate(
          position = if_else(G == "G", player, NA_character_),
          .before = 1
        ) %>%
        fill(position, .direction = "down") %>%
        # remove rows with header info
        filter(G != "G"),
      # bind tables 3 & 4 (goalies and data section)
      tbl_2 = html_table(team_tables[3:4]) %>%
        bind_cols(.name_repair = "minimal") %>%
        set_names(.[1, ]) %>%
        # remove rows with header info
        filter(SA != "SA")
    ))
}

# Collect one skater table per game in a list instead of overwriting a
# single variable on every iteration.
output <- list()
for (game_id in game_ids) {
  url2 <- paste(url_, game_id, sep = "/")

  boxscore <- read_boxscore(url2)

  # Index by the id as a *name*: a numeric index would be positional.
  output[[as.character(game_id)]] <- boxscore %>%
    map("tbl_1") %>%
    list_rbind(names_to = "team")
}

# Stack all games into one data frame; the list names (game ids) go into
# a `game` column.
output <- output %>% list_rbind(names_to = "game")

注意:在 output[[as.character(game_id)]] 中,我必须把 game_id 转换为字符,因为用数字作下标会被当作位置索引,而用字符作下标才是按名称存取,两者行为不同。

相关问题