我有这种类型的分组数据框:
df <- structure(list(TurnID = c(1L, 1L, 1L, 2L, 2L, 2L),
grp = c(1L, 2L, 3L, 1L, 2L, 3L),
File = c("F01", "F01", "F01", "F01", "F01", "F01"),
N_p = c(3L, 3L, 3L, 3L, 3L, 3L), Line = c(6L, 6L, 6L, 7L, 7L, 7L),
Speaker = c("ID01.A", "ID01.A", "ID01.A", "ID01.C", "ID01.C", "ID01.C"),
Utterance = c("=yeah (...) yeah yeah", "=yeah (...) yeah yeah", "=yeah (...) yeah yeah", "[(...)]", "[(...)]", "[(...)]"),
A_aoi = c("C", "*", "C", "C", NA, NA),
B_aoi = c("A", NA, NA, "A", NA, NA),
C_aoi = c("A", "*", NA, "A", "*", "A"),
A_aoi_dur = c(310L, 499L, 1201L, 2051L, NA, NA),
B_aoi_dur = c(2010L, NA, NA, 2051L, NA, NA),
C_aoi_dur = c(945L, 1065L, NA, 88L, 1660L, 303L)),
class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
row.names = c(NA, -6L),
groups = structure(list(TurnID = 1:2, .rows = structure(list(1:3, 4:6), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -2L), .drop = TRUE))
当我试图通过分组变量TurnID
(已在数据框中激活)汇总*aoi
列中的值时,得到了错误的结果:
df %>%
#group_by(TurnID) %>%
summarise(across(c(File, N_p, Line, Speaker, Utterance), first),
A_aoi = str_c(A_aoi, collapse = ""),
B_aoi = str_c(B_aoi, collapse = ""),
C_aoi = str_c(C_aoi, collapse = ""),
A_aoi_dur = str_c(A_aoi_dur, collapse = ","),
B_aoi_dur = str_c(B_aoi_dur, collapse = ","),
C_aoi_dur = str_c(C_aoi_dur, collapse = ",")
)
# A tibble: 2 × 12
TurnID File N_p Line Speaker Utterance A_aoi B_aoi C_aoi A_aoi_dur B_aoi_dur C_aoi_dur
<int> <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 F01 3 6 ID01.A =yeah (...) yeah yeah C*C NA NA 310,499,1201 NA NA
2 2 F01 3 7 ID01.C [(...)] NA NA A*A NA NA 88,1660,303
正确的结果应为:
TurnID File N_p Line Speaker Utterance A_aoi B_aoi C_aoi A_aoi_dur B_aoi_dur C_aoi_dur
<int> <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 F01 3 6 ID01.A =yeah (...) yeah yeah C*C A A* 310,499,1201 2010 945,1201
2 2 F01 3 7 ID01.C [(...)] C A A*A 2051 2051 88,1660,303
我肯定问题出在分组上,但不知道如何解决...
2条答案
按热度按时间cnjp1d6j1#
您应该删除NA。您可以使用
na.omit
:输出
您也可以使用
str_flatten
,它有一个na.rm
参数:jjjwad0x2#
下面是一种
data.table
方法