聚合列-如何处理不均匀的 Dataframe

vs91vp4v  于 2023-02-20  发布在  其他
关注(0)|答案(4)|浏览(167)

我想在每个Chr内每三行做一次聚合(求这三行之和)。但是,由于行数不一定能被3整除,我不确定如何处理最后剩下的行,可能只有1行或2行。如果剩下两行,我想只对这两行求和;如果只剩一行,则只取这一行的值。

  • 输入
# Example input: a Chr label, a numeric value and a seq column for each row.
data.frame(
  Chr = rep(c("chr1", "chr2", "chr3"), times = c(5, 5, 1)),
  value = c(1, 3, 1, 3, 5, 6, 3, 1, 3, 5, 0),
  seq = c(1:5, 1:5, 6)
)
  • 期望输出(使用 dplyr 的 mutate,保留所有列)
# Expected result: the input columns plus `agg`.
# NOTE(review): the last `agg` entry is 8 although the only chr3 row has
# value 0 -- confirm whether 0 was intended.
data.frame(
  Chr = rep(c("chr1", "chr2", "chr3"), times = c(5, 5, 1)),
  value = c(1, 3, 1, 3, 5, 6, 3, 1, 3, 5, 0),
  seq = c(1:5, 1:5, 6),
  agg = rep(c(5, 8, 10, 8), times = c(3, 2, 3, 3))
)
laik7k3q

laik7k3q1#

使用 dplyr 和 ave 的方案

library(dplyr) # >= 1.1.0

# Within each Chr, label rows 1-3, 4-6, ... with a running chunk number and
# let ave() spread each chunk's sum of `value` back over its rows. A trailing
# chunk of one or two rows is summed on its own.
df1 %>%
  mutate(
    agg = ave(value, (row_number() - 1L) %/% 3L, FUN = sum),
    .by = Chr
  )
  • 输出
Chr value seq agg
1  chr1     1   1   5
2  chr1     3   2   5
3  chr1     1   3   5
4  chr1     3   4   8
5  chr1     5   5   8
6  chr2     6   1  10
7  chr2     3   2  10
8  chr2     1   3  10
9  chr2     3   4   8
10 chr2     5   5   8
11 chr3     0   6   0

或使用data.table

library(data.table)

# Convert df1 to a data.table, then within each Chr add `agg`: the sum of
# `value` over each consecutive chunk of up to three rows, repeated on every
# row of that chunk.
setDT(df1)[,
  agg := ave(value, (seq_len(.N) - 1L) %/% 3L, FUN = sum),
  by = Chr
]
  • 输出
> df1
     Chr value seq agg
 1: chr1     1   1   5
 2: chr1     3   2   5
 3: chr1     1   3   5
 4: chr1     3   4   8
 5: chr1     5   5   8
 6: chr2     6   1  10
 7: chr2     3   2  10
 8: chr2     1   3  10
 9: chr2     3   4   8
10: chr2     5   5   8
11: chr3     0   6   0
holgip5t

holgip5t2#

可以使用(row_number()-1) %/% 3按3个观测进行分组。

library(dplyr)

# Step 1: number the chunks of three rows inside each Chr (1, 1, 1, 2, 2, ...).
# Step 2: sum `value` within every (Chr, Grp) chunk and attach it as `agg`.
df %>%
  mutate(Grp = ceiling(row_number() / 3), .by = Chr) %>%
  mutate(agg = sum(value), .by = c(Chr, Grp))

#     Chr value seq Grp agg
# 1  chr1     1   1   1   5
# 2  chr1     3   2   1   5
# 3  chr1     1   3   1   5
# 4  chr1     3   4   2   8
# 5  chr1     5   5   2   8
# 6  chr2     6   1   1  10
# 7  chr2     3   2   1  10
# 8  chr2     1   3   1  10
# 9  chr2     3   4   2   8
# 10 chr2     5   5   2   8
# 11 chr3     0   6   1   0

如果不需要Grp列,可以使用select(-Grp)删除它。

nvbavucw

nvbavucw3#

将行号(Chr组内)除以3,然后将此比值向上取整(ceiling),这样前3行为1,接下来的3行为2,依此类推。然后,您可以按Chr和此变量进行分组,以计算总和:

library(dplyr)

# Chunk rows in threes within each Chr, then sum `value` per (Chr, chunk).
# A final chunk of one or two rows is summed on its own.
d %>%
  # Row numbers restart inside every Chr, so chunks never span two Chr values.
  group_by(Chr) %>%
  mutate(Chr_group = ceiling(row_number() / 3)) %>%
  group_by(Chr, Chr_group) %>%
  mutate(agg = sum(value)) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup()
nbysray5

nbysray54#

下面是另一种类似的方法:尝试新的.by参数。我尝试在最后一个mutate中也使用它,但无法将cumsum(..)与.by组合在一起:

library(dplyr)

# Sum `value` over consecutive chunks of up to three rows within each Chr and
# return only the original columns plus `agg`.
#
# The chunks are grouped directly on (Chr, group). The earlier intermediate
# `id` / `cumsum(id == 1)` step was redundant, and because that counter runs
# over the whole table it split a chunk whenever the rows of one Chr were not
# contiguous.
df %>%
  # gl(n, 3, n) yields 1, 1, 1, 2, 2, 2, ... truncated to the n rows of the Chr.
  mutate(group = as.integer(gl(n(), 3, n())), .by = Chr) %>%
  group_by(Chr, group) %>%
  mutate(agg = sum(value)) %>%
  ungroup() %>%
  select(Chr, value, seq, agg)
Chr   value   seq   agg
   <chr> <dbl> <dbl> <dbl>
 1 chr1      1     1     5
 2 chr1      3     2     5
 3 chr1      1     3     5
 4 chr1      3     4     8
 5 chr1      5     5     8
 6 chr2      6     1    10
 7 chr2      3     2    10
 8 chr2      1     3    10
 9 chr2      3     4     8
10 chr2      5     5     8
11 chr3      0     6     0

相关问题