regex 基于正则表达式模式折叠行

xqkwcwgp  于 2023-08-08  发布在  其他
关注(0)|答案(1)|浏览(111)

我有一份语音转录数据,其中 `speaker` 列是说话人 ID,`timestamp` 列是每句话发生的起止时间戳:

  1. df
  2. line speaker utterance timestamp
  3. 1007 0504 A <and then HH. somehow> and then 00:09:08.951 - 00:09:18.195
  4. 1009 0505 B [mhm] 00:09:13.518 - 00:09:13.802
  5. 1011 0506 B [yeah yeah yeah] 00:09:15.518 - 00:09:15.959
  6. 1013 0507 <NA> (0.484) 00:09:18.195 - 00:09:18.679
  7. 1015 0508 A I do n't know if you 00:09:18.679 - 00:09:21.478
  8. 1017 0509 <NA> (0.287) 00:09:21.478 - 00:09:21.765
  9. 1019 0510 B yeah the organization right? °yeah 00:09:21.765 - 00:09:23.285
  10. 1021 0511 A [yeah it 's a big] international 00:09:23.171 - 00:09:27.902
  11. 1023 0512 B [yeah] 00:09:25.096 - 00:09:25.316
  12. 1025 0513 B (0.393) 00:09:27.902 - 00:09:28.295
  13. 1027 0514 B mhm= 00:09:28.295 - 00:09:28.508
  14. 1029 0515 <NA> (0.019) 00:09:28.508 - 00:09:28.527
  15. 1031 0516 A =so they have like 00:09:28.527 - 00:09:29.133
  16. 1033 0517 A (0.500) 00:09:29.133 - 00:09:29.633
  17. 1035 0518 A normally I do n't know about 00:09:29.633 - 00:09:34.381
  18. 1037 0519 <NA> (1.497) 00:09:34.381 - 00:09:35.878
  19. 1039 0520 B one wi:th economics, [er like uh] 00:09:35.878 - 00:09:44.639
  20. 1041 0521 B [mhm] 00:09:37.389 - 00:09:38.041
  21. 1043 0522 B [mhm] 00:09:44.237 - 00:09:44.622
  22. 1045 0523 <NA> (0.645) 00:09:44.639 - 00:09:45.284
  23. 1047 0524 A U:m 00:09:45.284 - 00:09:45.647

字符串
我需要折叠同时满足以下两个条件的连续行:**(i) 说话人(`speaker`)相同;(ii) `utterance` 不以方括号表达式(`[...]`)开头**。EDIT:此外,对于同一 `speaker` 紧跟在 `[...]` 行之后的那些行,我还需要让它们*免于*折叠,直到下一个 `NA` 为止。与此同时,被折叠行的时间戳也要相应地合并。我可以实现条件 (i):

  # Packages: dplyr supplies the pipeline verbs, stringr supplies str_c(),
  # data.table supplies rleid(). Load order is kept as in the original because
  # later packages mask same-named functions from earlier ones.
  library(dplyr)
  library(stringr)
  library(data.table)

  # Condition (i) only: collapse every run of consecutive rows that share the
  # same speaker into a single row.
  df %>%
    # rleid() gives each run of identical consecutive speaker values its own id.
    group_by(grp = rleid(speaker)) %>%
    summarise(
      # Keep the line id and speaker of the first row of the run.
      across(c(line, speaker), first),
      # Join the utterances of the run with single spaces.
      utterance = str_c(utterance, collapse = ' '),
      # Split all timestamps of the run into their pieces and keep piece 1 (the
      # first start) and piece 2 * n() (the last end); this assumes every
      # timestamp splits into exactly two pieces.
      timestamp = paste(
        unlist(strsplit(timestamp, "[- ]+"))[c(1, n() * 2)],
        collapse = " - "
      ),
      .groups = 'drop'
    ) %>%
    # The helper run id is not part of the result.
    select(-grp)


但我在实现条件 (ii) 时遇到了困难。EDIT:使用filter(!grepl("^\\[.*?\\]", utterance)) %>%至少可以去掉以 `[...]` 开头的行。但是,如何让这些行之后的行在下一个 `NA` 之前都*不*被折叠,我就不知道了。非常感谢任何帮助!

预期效果

  1. df
  2. line speaker utterance timestamp
  3. 1007 0504 A <and then HH. somehow> and then 00:09:08.951 - 00:09:18.195
  4. 1009 0505 B [mhm] 00:09:13.518 - 00:09:13.802
  5. 1011 0506 B [yeah yeah yeah] 00:09:15.518 - 00:09:15.959
  6. 1013 0507 <NA> (0.484) 00:09:18.195 - 00:09:18.679
  7. 1015 0508 A I do n't know if you 00:09:18.679 - 00:09:21.478
  8. 1017 0509 <NA> (0.287) 00:09:21.478 - 00:09:21.765
  9. 1019 0510 B yeah the organization right? °yeah 00:09:21.765 - 00:09:23.285
  10. 1021 0511 A [yeah it 's a big] international 00:09:23.171 - 00:09:27.902
  11. 1023 0512 B [yeah] 00:09:25.096 - 00:09:25.316
  12. 1025 0513 B (0.393) 00:09:27.902 - 00:09:28.295
  13. 1027 0514 B mhm= 00:09:28.295 - 00:09:28.508
  14. 1029 0515 <NA> (0.019) 00:09:28.508 - 00:09:28.527
  15. 1031 0516 A =so they have like (0.500) normally I do n't know about 00:09:28.527 - 00:09:34.381
  16. 1037 0519 <NA> (1.497) 00:09:34.381 - 00:09:35.878
  17. 1039 0520 B one wi:th economics, [er like uh] [mhm] [mhm] 00:09:35.878 - 00:09:44.622
  18. 1045 0523 <NA> (0.645) 00:09:44.639 - 00:09:45.284
  19. 1047 0524 A U:m 00:09:45.284 - 00:09:45.647

可重现数据

  1. structure(list(line = c("0504", "0505", "0506", "0507", "0508",
  2. "0509", "0510", "0511", "0512", "0513", "0514", "0515", "0516",
  3. "0517", "0518", "0519", "0520", "0521", "0522", "0523", "0524"
  4. ), speaker = c("A", "B", "B", NA, "A", NA, "B", "A", "B", "B",
  5. "B", NA, "A", "A", "A", NA, "B", "B", "B", NA, "A"), utterance = c("<and then HH. somehow> and then",
  6. "[mhm]", "[yeah yeah yeah]", "(0.484)", "I do n't know if you",
  7. "(0.287)", "yeah the organization right? °yeah", "[yeah it 's a big] international",
  8. "[yeah]", "(0.393)", "mhm=", "(0.019)", "=so they have like",
  9. "(0.500)", "normally I do n't know about", "(1.497)", "one wi:th economics, [er like uh]",
  10. "[mhm]", "[mhm]", "(0.645)", "U:m"), timestamp = c("00:09:08.951 - 00:09:18.195",
  11. "00:09:13.518 - 00:09:13.802", "00:09:15.518 - 00:09:15.959",
  12. "00:09:18.195 - 00:09:18.679", "00:09:18.679 - 00:09:21.478",
  13. "00:09:21.478 - 00:09:21.765", "00:09:21.765 - 00:09:23.285",
  14. "00:09:23.171 - 00:09:27.902", "00:09:25.096 - 00:09:25.316",
  15. "00:09:27.902 - 00:09:28.295", "00:09:28.295 - 00:09:28.508",
  16. "00:09:28.508 - 00:09:28.527", "00:09:28.527 - 00:09:29.133",
  17. "00:09:29.133 - 00:09:29.633", "00:09:29.633 - 00:09:34.381",
  18. "00:09:34.381 - 00:09:35.878", "00:09:35.878 - 00:09:44.639",
  19. "00:09:37.389 - 00:09:38.041", "00:09:44.237 - 00:09:44.622",
  20. "00:09:44.639 - 00:09:45.284", "00:09:45.284 - 00:09:45.647")), row.names = c(1007L,
  21. 1009L, 1011L, 1013L, 1015L, 1017L, 1019L, 1021L, 1023L, 1025L,
  22. 1027L, 1029L, 1031L, 1033L, 1035L, 1037L, 1039L, 1041L, 1043L,
  23. 1045L, 1047L), class = "data.frame")

pbwdgjma

pbwdgjma1#

  1. library(tidyverse)
  #' Assign a collapse-group id to every transcript row
  #'
  #' Rows that receive the same id are meant to be collapsed together. A new
  #' group starts whenever the speaker changes. Within a run of consecutive
  #' rows by one speaker, the first utterance that starts with "[" and every
  #' later row of that run each get a group of their own; the rows of the run
  #' before that point share one group.
  #'
  #' @param utterances Vector of utterances, one per transcript row. Missing
  #'   utterances are treated as not starting with "[".
  #' @param speaker_col Vector of speaker labels, parallel to `utterances`.
  #'   Missing speakers are all treated as one speaker labelled "NA" (so a
  #'   speaker literally named "NA" is not distinguished from a missing one).
  #' @return A non-decreasing numeric vector of group ids with the same length
  #'   as `utterances`; `numeric(0)` for empty input. Ids start at 1, or at 2
  #'   when the very first utterance starts with "[".
  process_transcript <- function(utterances, speaker_col) {
    n_rows <- length(utterances)
    if (n_rows == 0L) {
      return(numeric(0))
    }

    # Fold missing speakers into a sentinel label up front so that every
    # comparison below yields TRUE/FALSE, never NA (also when row 1 is missing).
    speakers <- as.character(speaker_col)[seq_len(n_rows)]
    speakers[is.na(speakers)] <- "NA"

    opens_bracket <- substr(as.character(utterances), 1L, 1L) == "["
    opens_bracket[is.na(opens_bracket)] <- FALSE

    # TRUE on every row whose speaker differs from the previous row's speaker.
    speaker_changed <- c(FALSE, speakers[-1L] != speakers[-n_rows])
    run_id <- cumsum(speaker_changed)

    # Within each same-speaker run the bracket flag is sticky: once an
    # utterance opened with "[", it stays set until the speaker changes.
    bracket_seen <- stats::ave(
      as.integer(opens_bracket), run_id,
      FUN = cummax
    ) > 0L

    # A row opens a new group when the speaker changed or the sticky flag is
    # set; `1 +` keeps the result a double, starting from group 1.
    1 + cumsum(speaker_changed | bracket_seen)
  }
  # Split each "start - end" timestamp into two columns, label every row with
  # its collapse group, then give each row the pasted utterance and the
  # earliest start / latest end of its group.
  # NOTE: mutate() keeps one row per input row, so rows that share a group are
  # repeated with identical values rather than reduced to a single row.
  df %>%
    separate(col = timestamp, into = c("start", "end"), sep = " - ") %>%
    mutate(utterance_group = process_transcript(utterance, speaker)) %>%
    group_by(utterance_group) %>%
    mutate(
      utterance = paste(utterance, collapse = " "),
      # start/end are character columns; the sample timestamps are zero-padded
      # HH:MM:SS.mmm strings, for which string order matches time order.
      start = min(start),
      end = max(end)
    ) %>%
    ungroup()
  32. # A tibble: 21 × 6
  33. line speaker utterance start end utterance_group
  34. <chr> <chr> <chr> <chr> <chr> <dbl>
  35. 1 0504 A <and then HH. somehow> and then 00:0 00:0 1
  36. 2 0505 B [mhm] 00:0 00:0 2
  37. 3 0506 B [yeah yeah yeah] 00:0 00:0 3
  38. 4 0507 NA (0.484) 00:0 00:0 4
  39. 5 0508 A I do n't know if you 00:0… 00:0… 5
  40. 6 0509 NA (0.287) 00:0… 00:0… 6
  41. 7 0510 B yeah the organization right? °yeah 00:0… 00:0… 7
  42. 8 0511 A [yeah it 's a big] international 00:0 00:0 8
  43. 9 0512 B [yeah] 00:0 00:0 9
  44. 10 0513 B (0.393) 00:0 00:0 10
  45. 11 0514 B mhm= 00:0 00:0 11
  46. 12 0515 NA (0.019) 00:0 00:0 12
  47. 13 0516 A =so they have like (0.500) normall 00:0 00:0 13
  48. 14 0517 A =so they have like (0.500) normall 00:0 00:0 13
  49. 15 0518 A =so they have like (0.500) normall 00:0 00:0 13
  50. 16 0519 NA (1.497) 00:0 00:0 14
  51. 17 0520 B one wi:th economics, [er like uh] 00:0 00:0 15
  52. 18 0521 B [mhm] 00:0 00:0 16
  53. 19 0522 B [mhm] 00:0 00:0 17
  54. 20 0523 NA (0.645) 00:0 00:0 18
  55. 21 0524 A U:m 00:0 00:0 19

字符串

展开查看全部

相关问题