R语言 如果一列中有多个重复项,如何只保留第一组重复项

x9ybnkn6  于 2023-06-03  发布在  其他
关注(0)|答案(2)|浏览(585)

clin.info$Sample.ID有重复项。如果有多对重复项,我希望只取第一对。

# Count how often each Sample.ID occurs; data.frame(table(...)) yields the
# columns Var1 (the ID, as a factor) and Freq (its count).
n_occur <- data.frame(table(clin.info$Sample.ID))
# Keep the IDs that occur more than twice, i.e. that have more than one pair.
multiple.duplicates <- n_occur[n_occur$Freq > 2,]

# NOTE(review): `if` needs exactly one TRUE/FALSE, whereas `%in%` returns one
# logical per element of multiple.duplicates$Var1 (none at all when no ID
# qualifies). This is the line the reported error points at.
if(multiple.duplicates$Var1 %in% clin.info$Sample.ID){
  # NOTE(review): distinct() keeps one row per unique combination of column
  # values, so fully identical rows collapse to a single row rather than to
  # the first pair; the result also stays grouped (there is no ungroup()).
  clin.info <- clin.info %>% 
    group_by(Sample.ID) %>% 
    distinct
}

错误回溯(Traceback):

Error in if (multiple.duplicates$Var1 %in% clin.info$Sample.ID) { : 
  argument is of length zero

数据:

> dput(clin.info)
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01", 
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01", 
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73", 
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72", 
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L, 
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L), class = "data.frame")

> dput(multiple.duplicates)
structure(list(Var1 = structure(6:7, levels = c("TCGA.A3.3357.01", 
"TCGA.A3.3367.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4710.01", 
"TCGA.B2.3924.01", "TCGA.B2.5635.01"), class = "factor"), Freq = c(4L, 
6L)), row.names = 6:7, class = "data.frame")

预期输出:
根据multiple.duplicates,有两个Sample.ID值各自存在多对重复项。
因此,对于这两个Sample.ID,只保留clin.info中的第一对重复项。

dz6r00yl

dz6r00yl1#

# Keep at most the first two rows (the first pair) of every Sample.ID.
dplyr::slice_head(
  .data = clin.info,
  n = 2,
  by = Sample.ID
)
#>          Sample.ID age
#> 1  TCGA.B2.3924.01  73
#> 2  TCGA.B2.3924.01  73
#> 3  TCGA.B2.5635.01  74
#> 4  TCGA.B2.5635.01  74
#> 5  TCGA.A3.3357.01  62
#> 6  TCGA.A3.3357.01  62
#> 7  TCGA.A3.3367.01  72
#> 8  TCGA.A3.3367.01  72
#> 9  TCGA.A3.3387.01  49
#> 10 TCGA.A3.3387.01  49
#> 11 TCGA.B0.4698.01  75
#> 12 TCGA.B0.4698.01  75
#> 13 TCGA.B0.4710.01  75
#> 14 TCGA.B0.4710.01  75

由reprex v2.0.2创建于2023-05-28
输入数据:

# Reproducible input (dput output): 20 rows with two character columns,
# Sample.ID and age. Row names are non-sequential (67-76, then 1-10).
clin.info <-
structure(list(Sample.ID = c("TCGA.B2.3924.01", "TCGA.B2.3924.01", 
"TCGA.B2.3924.01", "TCGA.B2.3924.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", "TCGA.B2.5635.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3387.01", "TCGA.A3.3387.01", "TCGA.B0.4698.01", "TCGA.B0.4698.01", 
"TCGA.B0.4710.01", "TCGA.B0.4710.01"), age = c("73", "73", "73", 
"73", "74", "74", "74", "74", "74", "74", "62", "62", "72", "72", 
"49", "49", "75", "75", "75", "75")), row.names = c(67L, 68L, 
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L), class = "data.frame")
xuo3flqw

xuo3flqw2#

你可以使用下面的代码:

# Rows to keep for every Sample.ID that occurs more than twice (i.e. that has
# more than one pair): only its first two rows, which form the first pair.
# distinct() is deliberately not used here, because it collapses fully
# identical rows to a single row instead of keeping a pair.
dedup <- clin.info %>%
  group_by(Sample.ID) %>%
  filter(n() > 2) %>%
  slice_head(n = 2) %>%
  ungroup()

# Replace the over-represented groups with their trimmed versions (the trimmed
# rows are appended after the untouched ones). This runs unconditionally so
# that `result` is always defined: when no Sample.ID occurs more than twice,
# dedup is empty and result holds the same rows as clin.info.
result <- clin.info %>%
  filter(!(Sample.ID %in% dedup$Sample.ID)) %>%
  bind_rows(dedup)

相关问题