R语言 合并两个数据集,保留所有行差异并添加相似行

zysjyyx4  于 2023-01-22  发布在  其他
关注(0)|答案(3)|浏览(210)

我有两个数据集Data和Data1。我想合并这些数据集,保留所有的差异,同时将所有公共行的数值添加到新表中。有什么简单的工具可以做到这一点吗?

head(Data)
  contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1   chr1   905373         .         T         C        2        4          6            0             0        6          0             0
2   chr1   911428         .         C         T        1        2          3            0             0        3          0             0
3   chr1   953279         .         T         C      146      126        272            0             0      273          1             0
4   chr1   962184         .         T         C       14       15         29            0             0       29          0             0
5   chr1  1024129         .         T         G        1        0          1            0             0        1          0             0
6   chr1  1039514         .         C         T        1        1          2            0             0        2          0             0
head(Data1)
  contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1   chr1   905373         .         T         C        2        3          5            0             0        5          0             0
2   chr1   933024         .         C         T        1        0          1            0             0        1          0             0
3   chr1   953279         .         T         C      122      124        246            0             0      248          2             0
4   chr1   962184         .         T         C       17       21         38            0             0       38          0             0
5   chr1  1022518         .         G         T        0        1          1            0             0        1          0             0
6   chr1  1024129         .         T         G        1        2          3            0             0        3          0             0

所需输出示例

contig  position    variantID   refAllele   altAllele   refCount    altCount    totalCount lowMAPQDepth lowBaseQDepth   rawDepth    otherBases  improperPairs
1   chr1    905373  .   T   C   4   7   11  0   0   11  0   0
2   chr1    911428  .   C   T   1   2   3   0   0   3   0   0
2   chr1    933024  .   C   T   1   0   1   0   0   1   0   0
4   chr1    953279  .   T   C   268 150 518 0   0   521 3   0

正如我们在列位置中看到的,站点905373是从列refCount开始添加的,而站点911428和933024对于其数据集都是唯一的,但被插入到新数据集中。它们是创建输出表的一种简单方法吗?

Data <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), position = c(905373L, 911428L, 953279L, 962184L, 1024129L, 
1039514L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T", 
"C", "T", "T", "T", "C"), altAllele = c("C", "T", "C", "C", "G", 
"T"), refCount = c(2L, 1L, 146L, 14L, 1L, 1L), altCount = c(4L, 
2L, 126L, 15L, 0L, 1L), totalCount = c(6L, 3L, 272L, 29L, 1L, 
2L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L, 
0L, 0L, 0L, 0L, 0L), rawDepth = c(6L, 3L, 273L, 29L, 1L, 2L), 
    otherBases = c(0L, 0L, 1L, 0L, 0L, 0L), improperPairs = c(0L, 
    0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")

Data1 <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), position = c(905373L, 933024L, 953279L, 962184L, 1022518L, 
1024129L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T", 
"C", "T", "T", "G", "T"), altAllele = c("C", "T", "C", "C", "T", 
"G"), refCount = c(2L, 1L, 122L, 17L, 0L, 1L), altCount = c(3L, 
0L, 124L, 21L, 1L, 2L), totalCount = c(5L, 1L, 246L, 38L, 1L, 
3L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L, 
0L, 0L, 0L, 0L, 0L), rawDepth = c(5L, 1L, 248L, 38L, 1L, 3L), 
    otherBases = c(0L, 0L, 2L, 0L, 0L, 0L), improperPairs = c(0L, 
    0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")
z9smfwbn

z9smfwbn1#

这里有一个可能性:

Data
#>   contig position variantID refAllele altAllele refCount altCount totalCount
#> 1   chr1   905373         .         T         C        2        4          6
#> 2   chr1   911428         .         C         T        1        2          3
#> 3   chr1   953279         .         T         C      146      126        272
#> 4   chr1   962184         .         T         C       14       15         29
#> 5   chr1  1024129         .         T         G        1        0          1
#> 6   chr1  1039514         .         C         T        1        1          2
#>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1            0             0        6          0             0
#> 2            0             0        3          0             0
#> 3            0             0      273          1             0
#> 4            0             0       29          0             0
#> 5            0             0        1          0             0
#> 6            0             0        2          0             0

Data1
#>   contig position variantID refAllele altAllele refCount altCount totalCount
#> 1   chr1   905373         .         T         C        2        3          5
#> 2   chr1   933024         .         C         T        1        0          1
#> 3   chr1   953279         .         T         C      122      124        246
#> 4   chr1   962184         .         T         C       17       21         38
#> 5   chr1  1022518         .         G         T        0        1          1
#> 6   chr1  1024129         .         T         G        1        2          3
#>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1            0             0        5          0             0
#> 2            0             0        1          0             0
#> 3            0             0      248          2             0
#> 4            0             0       38          0             0
#> 5            0             0        1          0             0
#> 6            0             0        3          0             0

aggregate(. ~ contig +  position + variantID + refAllele + altAllele, rbind(Data, Data1), sum)
#>   contig position variantID refAllele altAllele refCount altCount totalCount
#> 1   chr1   905373         .         T         C        4        7         11
#> 2   chr1   953279         .         T         C      268      250        518
#> 3   chr1   962184         .         T         C       31       36         67
#> 4   chr1  1024129         .         T         G        2        2          4
#> 5   chr1   911428         .         C         T        1        2          3
#> 6   chr1   933024         .         C         T        1        0          1
#> 7   chr1  1039514         .         C         T        1        1          2
#> 8   chr1  1022518         .         G         T        0        1          1
#>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
#> 1            0             0       11          0             0
#> 2            0             0      521          3             0
#> 3            0             0       67          0             0
#> 4            0             0        4          0             0
#> 5            0             0        3          0             0
#> 6            0             0        1          0             0
#> 7            0             0        2          0             0
#> 8            0             0        1          0             0
xt0899hw

xt0899hw2#

你需要这样的东西:

library(dplyr)

bind_rows(Data, Data1) %>% as_tibble() %>% 
  group_by(contig, position, refAllele, altAllele) %>% 
  summarise(across(-c(variantID), ~sum(., na.rm = TRUE)))
contig position refAllele altAllele refCount altCount totalC…¹ lowMA…² lowBa…³ rawDe…⁴ other…⁵ impro…⁶
  <chr>     <int> <chr>     <chr>        <int>    <int>    <int>   <int>   <int>   <int>   <int>   <int>
1 chr1     905373 T         C                4        7       11       0       0      11       0       0
2 chr1     911428 C         T                1        2        3       0       0       3       0       0
3 chr1     933024 C         T                1        0        1       0       0       1       0       0
4 chr1     953279 T         C              268      250      518       0       0     521       3       0
5 chr1     962184 T         C               31       36       67       0       0      67       0       0
6 chr1    1022518 G         T                0        1        1       0       0       1       0       0
7 chr1    1024129 T         G                2        2        4       0       0       4       0       0
8 chr1    1039514 C         T                1        1        2       0       0       2       0       0
# … with abbreviated variable names ¹​totalCount, ²​lowMAPQDepth, ³​lowBaseQDepth, ⁴​rawDepth, ⁵​otherBases,
#   ⁶​improperPairs
tquggr8v

tquggr8v3#

这里是另一个使用join/merge的选项,我将演示使用dplyr,尽管它在base R中是可行的(需要做更多的工作):

library(dplyr)
full_join(Data, Data1, by = c("contig", "position", "variantID", "refAllele", "altAllele"), 
          suffix = c("", ".y")) %>%
  mutate(
    across(
      where(is.numeric),
      ~ rowSums(cbind(., cur_data()[[ paste0(cur_column(), ".y") ]]), na.rm = TRUE)
    )
  ) %>%
  select(-ends_with(".y"))
#   contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
# 1   chr1   905373         .         T         C        4        7         11            0             0       11          0             0
# 2   chr1   911428         .         C         T        1        2          3            0             0        3          0             0
# 3   chr1   953279         .         T         C      268      250        518            0             0      521          3             0
# 4   chr1   962184         .         T         C       31       36         67            0             0       67          0             0
# 5   chr1  1024129         .         T         G        2        2          4            0             0        4          0             0
# 6   chr1  1039514         .         C         T        1        1          2            0             0        2          0             0
# 7   chr1   933024         .         C         T        1        0          1            0             0        1          0             0
# 8   chr1  1022518         .         G         T        0        1          1            0             0        1          0             0

我加入了这些字段,因为它们是字符,在我看来(不知情),如果它们不同,您不会想尝试聚合。

相关问题