消除多个 Dataframe 中的重复行

ymdaylpp  于 2023-01-22  发布在  其他
关注(0)|答案(4)|浏览(220)

我有一个关于数据排序的问题。在我的数据集中,有5个细胞簇。每个簇都有一个富集基因列表,作为单独的 Dataframe 。但是,有些基因在2个或更多簇中富集,我想删除这些基因。我该怎么做?
在下面的简化示例中,应删除名为“MYH6”、“MYOD”和“GAPDH”的行,因为它们位于2个或更多 Dataframe 中。
我试过list(),但不知道下一步该怎么做。unique()不起作用。

# Toy enrichment tables: one single-column data frame per cluster, with the
# gene symbols as row names. The logFC values are stored as character strings.
cluster1 <- data.frame(
  logFC = c("1", "0.5", "0.7", "0.5"),
  row.names = c("MYH6", "ACTA1", "TNNT2", "GAPDH")
)

cluster2 <- data.frame(
  logFC = c("1", "0.8", "0.6", "1.2"),
  row.names = c("MYH6", "MYOD", "PECAM1", "GAPDH")
)

cluster3 <- data.frame(
  logFC = c("2", "0.9", "1.5", "0.7"),
  row.names = c("MYL7", "MYOD", "CD34", "GAPDH")
)

先谢谢你。

fjaof16o

fjaof16o1#

1.查找名称中包含“cluster”的所有对象,并将它们存入一个列表。
2.从每个簇中提取行名,用 unlist() 展开成一个向量,并找出重复的基因。
3.在每个簇中过滤掉行名与重复基因匹配的行。

# Row names that occur in two or more of the data frames in `dfs`.
#
# dfs: a list of data frames whose row names are the keys to compare.
# Returns the shared row names as a character vector (empty when there are
# none). Row names are unique within one data frame, so a count above 1 means
# the name is present in at least two of them.
shared_row_names <- function(dfs) {
  name_counts <- table(unlist(lapply(dfs, row.names)))
  names(which(name_counts > 1))
}

# Drop from every data frame in `dfs` the rows whose name is listed in `shared`.
#
# dfs:    a list of data frames.
# shared: row names to remove; defaults to the names shared across `dfs`.
# Returns a list with the same names as `dfs`. `drop = FALSE` keeps
# single-column inputs as data frames instead of collapsing them to vectors.
drop_shared_rows <- function(dfs, shared = shared_row_names(dfs)) {
  lapply(dfs, \(x) x[!row.names(x) %in% shared, , drop = FALSE])
}

# Collect only the data frames named cluster1, cluster2, ... . An unanchored
# "cluster" pattern would also pick up unrelated objects (for example a
# combined `clusters` table), which would make every gene look duplicated.
clust.list <- Filter(is.data.frame, mget(ls(pattern = "^cluster[0-9]+$")))
gene.rm <- shared_row_names(clust.list)
res.list <- drop_shared_rows(clust.list, gene.rm)

res.list
# $cluster1
#       logFC
# ACTA1   0.5
# TNNT2   0.7
# 
# $cluster2
#        logFC
# PECAM1   0.6
# 
# $cluster3
#      logFC
# MYL7     2
# CD34   1.5

如果想用 res.list 中的结果覆盖全局环境中对应的簇对象,可以运行:

# Copy every element of res.list into the global environment under its own
# name, replacing any existing object that has the same name.
list2env(res.list, envir = .GlobalEnv)
w41d8nur

w41d8nur2#

下面是一个过滤重复项的解决方案:

library(dplyr)

# Example data: one single-column data frame per cluster, with the gene
# symbols as row names.
cluster1 <- data.frame(
  logFC = c("1", "0.5", "0.7", "0.5"),
  row.names = c("MYH6", "ACTA1", "TNNT2", "GAPDH")
)

cluster2 <- data.frame(
  logFC = c("1", "0.8", "0.6", "1.2"),
  row.names = c("MYH6", "MYOD", "PECAM1", "GAPDH")
)

cluster3 <- data.frame(
  logFC = c("2", "0.9", "1.5", "0.7"),
  row.names = c("MYL7", "MYOD", "CD34", "GAPDH")
)

# Pool the gene symbols of all clusters and keep only those seen exactly once.
# duplicated() is checked from both ends so that every copy of a repeated
# symbol is excluded, not just the second and later occurrences.
row_names <- c(rownames(cluster1), rownames(cluster2), rownames(cluster3))
row_names <- row_names[
  !duplicated(row_names) & !duplicated(row_names, fromLast = TRUE)
]

# Within each cluster, keep only the rows whose symbol survived.
cluster1 <- filter(cluster1, row.names(cluster1) %in% row_names)
cluster2 <- filter(cluster2, row.names(cluster2) %in% row_names)
cluster3 <- filter(cluster3, row.names(cluster3) %in% row_names)

# Stack the filtered clusters into a single data frame.
clusters <- rbind(cluster1, cluster2, cluster3)

输出:

> clusters
       logFC
ACTA1    0.5
TNNT2    0.7
PECAM1   0.6
MYL7       2
CD34     1.5

如果您只需要原始的三个 Dataframe ,就不要应用最后的rbind

> cluster1
      logFC
ACTA1   0.5
TNNT2   0.7
> cluster2
      logFC
PECAM1  0.6
> cluster3
     logFC
MYL7     2
CD34   1.5
oxosxuxt

oxosxuxt3#

请尝试下面的代码。我更新了各个 Dataframe,为其添加了变量 grp:对于 cluster1,grp=1;对于 cluster2,grp=2,依此类推。然后,我们可以使用 grp 把合并后的 Dataframe 重新拆分开。

数据

library(tidyverse)

# Example data: each cluster carries a `grp` column holding its cluster number
# so the combined table can be split back apart afterwards.
cluster1 <- data.frame(
  logFC = c("1", "0.5", "0.7", "0.5"),
  grp = 1,
  row.names = c("MYH6", "ACTA1", "TNNT2", "GAPDH")
)

cluster2 <- data.frame(
  logFC = c("1", "0.8", "0.6", "1.2"),
  grp = 2,
  row.names = c("MYH6", "MYOD", "PECAM1", "GAPDH")
)

cluster3 <- data.frame(
  logFC = c("2", "0.9", "1.5", "0.7"),
  grp = 3,
  row.names = c("MYL7", "MYOD", "CD34", "GAPDH")
)

# Move the gene symbols into a real `rowname` column *before* binding. Binding
# first forces duplicated row names to be made unique with a numeric suffix,
# and stripping that suffix with a regex would also mangle gene symbols that
# legitimately contain a dot followed by digits (e.g. "AC012345.1").
# The result stays grouped by `rowname`; `cnt` is the number of clusters in
# which each gene occurs, and only genes seen once are kept.
cluster_all <- list(cluster1, cluster2, cluster3) %>%
  map(rownames_to_column) %>%
  bind_rows() %>%
  arrange(rowname) %>%
  group_by(rowname) %>%
  mutate(cnt = n()) %>%
  filter(cnt <= 1)

# Write each cluster's surviving rows back over the original object. `grp`
# equals the position of the cluster's name in `df_names`.
df_names <- c("cluster1", "cluster2", "cluster3")
for (j in seq_along(df_names)) {
  assign(df_names[[j]], filter(cluster_all, grp == j), envir = .GlobalEnv)
}

创建于2023年1月19日,使用reprex v2.0.2

# A tibble: 2 × 4
# Groups:   rowname [2]
  rowname logFC   grp   cnt
  <chr>   <chr> <dbl> <int>
1 ACTA1   0.5       1     1
2 TNNT2   0.7       1     1

# A tibble: 1 × 4
# Groups:   rowname [1]
  rowname logFC   grp   cnt
  <chr>   <chr> <dbl> <int>
1 PECAM1  0.6       2     1

# A tibble: 2 × 4
# Groups:   rowname [2]
  rowname logFC   grp   cnt
  <chr>   <chr> <dbl> <int>
1 CD34    1.5       3     1
2 MYL7    2         3     1
nhhxz33t

nhhxz33t4#

使用 map 和 split 的 Tidyverse 解决方案:

library(purrr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(tibble, warn.conflicts = FALSE)
library(magrittr, warn.conflicts = FALSE)

cluster1 <- data.frame(
  logFC = c("1", "0.5", "0.7", "0.5"),
  row.names = c("MYH6", "ACTA1", "TNNT2", "GAPDH")
)

cluster2 <- data.frame(
  logFC = c("1", "0.8", "0.6", "1.2"),
  row.names = c("MYH6", "MYOD", "PECAM1", "GAPDH")
)

cluster3 <- data.frame(
  logFC = c("2", "0.9", "1.5", "0.7"),
  row.names = c("MYL7", "MYOD", "CD34", "GAPDH")
)

# Names of the data frames to process. The pattern is anchored so that only
# cluster1, cluster2, ... are collected; a bare "cluster" pattern would also
# match unrelated objects and corrupt the duplicate count.
df_names <- ls(pattern = "^cluster[0-9]+$")

df_names %>%
  # Name each element after the object it refers to
  set_names(., .) %>%
  # Load the objects
  map(get) %>%
  # Move row names to a `rowname` column
  map(rownames_to_column) %>%
  # Bind into a single df, recording the source object in `dataframe`
  bind_rows(.id = "dataframe") %>%
  # Count how many times each gene occurs across all clusters
  add_count(rowname) %>%
  # Only keep genes that occur once
  filter(n == 1) %>%
  # Split back to a list. Using a factor with every name as a level keeps a
  # cluster that lost all of its rows as an empty data frame, so its original
  # is still overwritten below instead of silently surviving unfiltered.
  split(factor(.$dataframe, levels = df_names)) %>%
  # Put the column back as row names
  map(~ set_rownames(.x, .x$rowname)) %>%
  # Remove auxiliary columns
  map(select, -c(dataframe, n, rowname)) %>%
  # Overwrite the original data
  list2env(.GlobalEnv)
#> <environment: R_GlobalEnv>

# Print results
# Anchor the pattern so only cluster1, cluster2, ... are printed, not every
# object whose name merely contains "cluster".
ls(pattern = "^cluster[0-9]+$") %>%
  set_names(., .) %>%
  map(get)
#> $cluster1
#>       logFC
#> ACTA1   0.5
#> TNNT2   0.7
#> 
#> $cluster2
#>        logFC
#> PECAM1   0.6
#> 
#> $cluster3
#>      logFC
#> MYL7     2
#> CD34   1.5

创建于2023年1月19日,使用reprex v2.0.2

相关问题