R语言 汇总分组数据框中多列的字符串

xam8gpfp  于 2023-02-20  发布在  其他
关注(0)|答案(2)|浏览(158)

我有这种类型的分组数据框:

df <- structure(list(TurnID = c(1L, 1L, 1L, 2L, 2L, 2L), 
                     grp = c(1L, 2L, 3L, 1L, 2L, 3L), 
                     File = c("F01", "F01", "F01", "F01", "F01", "F01"), 
                     N_p = c(3L, 3L, 3L, 3L, 3L, 3L), Line = c(6L, 6L, 6L, 7L, 7L, 7L), 
                     Speaker = c("ID01.A", "ID01.A", "ID01.A", "ID01.C", "ID01.C", "ID01.C"), 
                     Utterance = c("=yeah (...) yeah yeah", "=yeah (...) yeah yeah", "=yeah (...) yeah yeah", "[(...)]", "[(...)]", "[(...)]"), 
                     A_aoi = c("C", "*", "C", "C", NA, NA), 
                     B_aoi = c("A", NA, NA, "A", NA, NA), 
                     C_aoi = c("A", "*", NA, "A", "*", "A"), 
                     A_aoi_dur = c(310L, 499L, 1201L, 2051L, NA, NA), 
                     B_aoi_dur = c(2010L, NA, NA, 2051L, NA, NA), 
                     C_aoi_dur = c(945L, 1065L, NA, 88L, 1660L, 303L)), 
                class = c("grouped_df", "tbl_df", "tbl", "data.frame"), 
                row.names = c(NA, -6L), 
                groups = structure(list(TurnID = 1:2, .rows = structure(list(1:3, 4:6), ptype = integer(0), class = c("vctrs_list_of",  "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"), 
                                   row.names = c(NA, -2L), .drop = TRUE))

当我试图通过分组变量TurnID(已在数据框中激活)汇总*aoi列中的值时,得到了错误的结果:

df %>%
  #group_by(TurnID) %>%
  summarise(across(c(File, N_p, Line, Speaker, Utterance), first),
    A_aoi = str_c(A_aoi, collapse = ""),
    B_aoi = str_c(B_aoi, collapse = ""),
    C_aoi = str_c(C_aoi, collapse = ""),
    A_aoi_dur = str_c(A_aoi_dur, collapse = ","),
    B_aoi_dur = str_c(B_aoi_dur, collapse = ","),
    C_aoi_dur = str_c(C_aoi_dur, collapse = ",")
  )
# A tibble: 2 × 12
  TurnID File    N_p  Line Speaker Utterance             A_aoi B_aoi C_aoi A_aoi_dur    B_aoi_dur C_aoi_dur  
   <int> <chr> <int> <int> <chr>   <chr>                 <chr> <chr> <chr> <chr>        <chr>     <chr>      
1      1 F01       3     6 ID01.A  =yeah (...) yeah yeah C*C   NA    NA    310,499,1201 NA        NA         
2      2 F01       3     7 ID01.C  [(...)]               NA    NA    A*A   NA           NA        88,1660,303

正确的结果应为:

TurnID File    N_p  Line Speaker Utterance             A_aoi B_aoi C_aoi   A_aoi_dur      B_aoi_dur C_aoi_dur  
   <int> <chr> <int> <int> <chr>   <chr>                 <chr> <chr> <chr>   <chr>          <chr>     <chr>      
1      1 F01       3     6 ID01.A  =yeah (...) yeah yeah C*C    A    A*      310,499,1201   2010      945,1201         
2      2 F01       3     7 ID01.C  [(...)]               C      A    A*A     2051           2051      88,1660,303

我肯定问题出在分组上,但不知道如何解决...

cnjp1d6j

cnjp1d6j1#

您应该删除NA。您可以使用na.omit

df %>%
  group_by(TurnID) %>%
  summarise(across(c(File, N_p, Line, Speaker, Utterance), first),
            across(matches("aoi$"), ~ str_c(na.omit(.x), collapse = "")),
            across(matches("dur$"), ~ str_c(na.omit(.x), collapse = ",")))

输出

# A tibble: 2 × 12
  TurnID File    N_p  Line Speaker Utter…¹ A_aoi B_aoi C_aoi A_aoi…² B_aoi…³ C_aoi…⁴
   <int> <chr> <int> <int> <chr>   <chr>   <chr> <chr> <chr> <chr>   <chr>   <chr>  
1      1 F01       3     6 ID01.A  =yeah … C*C   A     A*    310,49… 2010    945,10…
2      2 F01       3     7 ID01.C  [(...)] C     A     A*A   2051    2051    88,166…
# … with abbreviated variable names ¹​Utterance, ²​A_aoi_dur, ³​B_aoi_dur, ⁴​C_aoi_dur

您也可以使用str_flatten,它有一个na.rm参数:

df %>%
  group_by(TurnID) %>%
  summarise(across(c(File, N_p, Line, Speaker, Utterance), first),
            across(matches("aoi$"), ~ str_flatten(.x, "", na.rm = TRUE)),
            across(matches("dur$"), ~ str_flatten(.x, ",", na.rm = TRUE)))
jjjwad0x

jjjwad0x2#

下面是一种data.table方法

library(data.table)
cols.collapse <- grep("aoi", names(df), value = TRUE)
setDT(df)[, lapply(.SD, function(x) paste0(x[!is.na(x)], collapse = ";")), 
          .SDcols = cols.collapse, by = .(TurnID)]

#    TurnID A_aoi B_aoi C_aoi    A_aoi_dur B_aoi_dur   C_aoi_dur
# 1:      1 C;*;C     A   A;* 310;499;1201      2010    945;1065
# 2:      2     C     A A;*;A         2051      2051 88;1660;303

相关问题