R语言 删除单个观测值,然后复制剩余观测值

beq87vna  于 2023-04-27  发布在  其他
关注(0)|答案(4)|浏览(128)

对于以下数据,我想做以下操作:
[1] 如果某个 ID 只有一次测量/观测(即只有一行),则删除该 ID 的观测
[2] 对于其余的 ID,如果其重复测量少于 5 次,则重复该 ID 的观测,直到至少有 5 个观测(例如,如果一个 ID 只有 2 行,则重复 3 次)
数据:

structure(list(id = c("0101", "0102", "0102", "0103", "0103", 
"0103", "0104", "0104", "0104", "0104", "0104", "0105", "0105", 
"0105", "0105", "0105", "0106", "0106", "0106", "0106", "0106", 
"0107", "0107", "0107", "0107", "0107", "0108", "0108", "0108", 
"0108"), date = c("10/01/91", "12/03/91", "05/05/92", "06/22/92", 
"12/17/92", "07/14/93", "07/28/92", "01/14/93", "08/11/93", "02/03/94", 
"08/23/94", "09/24/92", "03/05/93", "10/18/93", "04/14/94", "05/31/94", 
"01/13/93", "07/27/93", "03/10/94", "09/01/94", "03/09/95", "01/15/93", 
"07/23/93", "02/07/94", "07/28/94", "02/07/95", "03/19/93", "10/04/93", 
"05/17/94", "11/15/94"), y = c(0, 0, 9, 0, -11, -11, 0, 10, 9, 
4, 5, 0, -7, -17, -13, -17, 0, 6, 6, 1, 3, 0, -9, -13, -18, -17, 
0, -8, -8, -10)), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L), class = "data.frame")
pbpqsu0x

pbpqsu0x1#

看起来很简单,但我可能误解了;这能解决你的问题吗?

library(tidyverse)

# Sample data: one row per observation. `id` identifies the subject, `date` is
# kept as a character string (e.g. "10/01/91") and `y` is the measured value.
# Each line inside the c() calls below holds the values belonging to one id.
# Row names are 1 and 3..31 (2 is absent).
df <- structure(
  list(
    id = c(
      "0101",
      "0102", "0102",
      "0103", "0103", "0103",
      "0104", "0104", "0104", "0104", "0104",
      "0105", "0105", "0105", "0105", "0105",
      "0106", "0106", "0106", "0106", "0106",
      "0107", "0107", "0107", "0107", "0107",
      "0108", "0108", "0108", "0108"
    ),
    date = c(
      "10/01/91",
      "12/03/91", "05/05/92",
      "06/22/92", "12/17/92", "07/14/93",
      "07/28/92", "01/14/93", "08/11/93", "02/03/94", "08/23/94",
      "09/24/92", "03/05/93", "10/18/93", "04/14/94", "05/31/94",
      "01/13/93", "07/27/93", "03/10/94", "09/01/94", "03/09/95",
      "01/15/93", "07/23/93", "02/07/94", "07/28/94", "02/07/95",
      "03/19/93", "10/04/93", "05/17/94", "11/15/94"
    ),
    y = c(
      0,
      0, 9,
      0, -11, -11,
      0, 10, 9, 4, 5,
      0, -7, -17, -13, -17,
      0, 6, 6, 1, 3,
      0, -9, -13, -18, -17,
      0, -8, -8, -10
    )
  ),
  row.names = c(1L, 3:31),
  class = "data.frame"
)

# Drop ids observed only once, then pad every remaining id to at least 5 rows
# by cycling through its own rows in their original order.
# The result is still grouped by `id`; append `ungroup()` if later steps should
# not operate per id.
df %>%
  group_by(id) %>%
  # Keep only ids that have two or more observations.
  filter(n() >= 2) %>%
  # rep(..., length.out = k) recycles the within-group row positions until k
  # are produced. Using max(n(), 5) as k means an id that already has 5 or
  # more rows keeps every row instead of being cut down to 5.
  slice(rep(seq_len(n()), length.out = max(n(), 5)))
#> # A tibble: 35 × 3
#> # Groups:   id [7]
#>    id    date         y
#>    <chr> <chr>    <dbl>
#>  1 0102  12/03/91     0
#>  2 0102  05/05/92     9
#>  3 0102  12/03/91     0
#>  4 0102  05/05/92     9
#>  5 0102  12/03/91     0
#>  6 0103  06/22/92     0
#>  7 0103  12/17/92   -11
#>  8 0103  07/14/93   -11
#>  9 0103  06/22/92     0
#> 10 0103  12/17/92   -11
#> # … with 25 more rows

由 reprex v2.0.2 创建于 2023-04-20

nxowjjhe

nxowjjhe2#

这将删除n = 1的组,将具有少于5个观察结果的组循环为5个(并保留5个或更多个不变):

library(dplyr)

# Remove ids that occur in a single row, then cycle the rows of each remaining
# id until the id has 5 rows; ids with 5 or more rows are left as they are.
dat %>%
  filter(n() >= 2L, .by = id) %>%
  slice(rep_len(seq_len(n()), max(n(), 5L)), .by = id)

     id     date   y
1  0102 12/03/91   0
2  0102 05/05/92   9
3  0102 12/03/91   0
4  0102 05/05/92   9
5  0102 12/03/91   0
6  0103 06/22/92   0
7  0103 12/17/92 -11
8  0103 07/14/93 -11
9  0103 06/22/92   0
10 0103 12/17/92 -11
...
lyfkaqu1

lyfkaqu13#

  • 使用 base R 中 `split`、`rep` 和 `[` 组合的方法。
# Split the row positions of DF by id. An id with a single row contributes
# nothing (the `if` yields NULL, which unlist() drops); every other id has its
# positions repeated in whole cycles, ceiling(5 / n) times, so it ends up with
# at least 5 rows (possibly more, since only complete cycles are added).
DF[
  unlist(lapply(
    split(seq_len(nrow(DF)), DF$id),
    function(rows) {
      if (length(rows) > 1) {
        rep(rows, times = ceiling(5 / length(rows)))
      }
    }
  )),
]

结果

id     date   y
3    0102 12/03/91   0
4    0102 05/05/92   9
3.1  0102 12/03/91   0
4.1  0102 05/05/92   9
3.2  0102 12/03/91   0
4.2  0102 05/05/92   9
5    0103 06/22/92   0
6    0103 12/17/92 -11
7    0103 07/14/93 -11
5.1  0103 06/22/92   0
6.1  0103 12/17/92 -11
7.1  0103 07/14/93 -11
8    0104 07/28/92   0
9    0104 01/14/93  10
10   0104 08/11/93   9
11   0104 02/03/94   4
12   0104 08/23/94   5
13   0105 09/24/92   0
14   0105 03/05/93  -7
15   0105 10/18/93 -17
16   0105 04/14/94 -13
17   0105 05/31/94 -17
18   0106 01/13/93   0
19   0106 07/27/93   6
20   0106 03/10/94   6
21   0106 09/01/94   1
22   0106 03/09/95   3
23   0107 01/15/93   0
24   0107 07/23/93  -9
25   0107 02/07/94 -13
26   0107 07/28/94 -18
27   0107 02/07/95 -17
28   0108 03/19/93   0
29   0108 10/04/93  -8
30   0108 05/17/94  -8
31   0108 11/15/94 -10
28.1 0108 03/19/93   0
29.1 0108 10/04/93  -8
30.1 0108 05/17/94  -8
31.1 0108 11/15/94 -10

数据

# Sample data used by the base R answer: one row per observation. `id`
# identifies the subject, `date` is a character string (e.g. "10/01/91") and
# `y` is the measured value. Each line inside the c() calls below holds the
# values belonging to one id. Row names are 1 and 3..31 (2 is absent).
DF <- structure(
  list(
    id = c(
      "0101",
      "0102", "0102",
      "0103", "0103", "0103",
      "0104", "0104", "0104", "0104", "0104",
      "0105", "0105", "0105", "0105", "0105",
      "0106", "0106", "0106", "0106", "0106",
      "0107", "0107", "0107", "0107", "0107",
      "0108", "0108", "0108", "0108"
    ),
    date = c(
      "10/01/91",
      "12/03/91", "05/05/92",
      "06/22/92", "12/17/92", "07/14/93",
      "07/28/92", "01/14/93", "08/11/93", "02/03/94", "08/23/94",
      "09/24/92", "03/05/93", "10/18/93", "04/14/94", "05/31/94",
      "01/13/93", "07/27/93", "03/10/94", "09/01/94", "03/09/95",
      "01/15/93", "07/23/93", "02/07/94", "07/28/94", "02/07/95",
      "03/19/93", "10/04/93", "05/17/94", "11/15/94"
    ),
    y = c(
      0,
      0, 9,
      0, -11, -11,
      0, 10, 9, 4, 5,
      0, -7, -17, -13, -17,
      0, 6, 6, 1, 3,
      0, -9, -13, -18, -17,
      0, -8, -8, -10
    )
  ),
  row.names = c(1L, 3:31),
  class = "data.frame"
)
uajslkp6

uajslkp64#

我特意让每组按整组重复,因此每组重复后的行数始终是其原始行数的整数倍,即使这会使该组的行数超过 5。使用 data.table 包:

library(data.table)
# Turn df into a data.table by reference.
setDT(df)
# The inner query returns, per id, the row numbers (.I) to keep, collected in
# column V1; the outer df[...] then selects those rows.
#   * 5 or more rows (.N >= 5): keep the rows as they are
#   * 2 to 4 rows: repeat the whole group 5 %/% .N + 1 times, which always
#     reaches at least 5 rows (and may exceed 5)
#   * a single row: contribute nothing (NULL), so the id is dropped
df[
  df[,
    if (.N >= 5) {
      .I
    } else if (.N > 1) {
      rep(.I, times = 5 %/% .N + 1)
    } else {
      NULL
    },
    by = id
  ]$V1
]

#        id     date     y
#    <char>   <char> <num>
# 1:   0102 12/03/91     0
# 2:   0102 05/05/92     9
# 3:   0102 12/03/91     0
# 4:   0102 05/05/92     9
# 5:   0102 12/03/91     0
# 6:   0102 05/05/92     9
# 7:   0103 06/22/92     0
# 8:   0103 12/17/92   -11
# 9:   0103 07/14/93   -11
#10:   0103 06/22/92     0
#11:   0103 12/17/92   -11
#12:   0103 07/14/93   -11
# ...

相关问题