R语言:逐行迭代,找出每行的值与另一个数据框 df2 的列名之间的匹配项,然后对 df2 取子集

b4qexyjb  于 2023-07-31  发布在  其他
关注(0)|答案(2)|浏览(86)

对于 hallmark.df 的每一行,如果 rownames(all.deg) 中有与该行任一值匹配的项,则取出 all.deg 中对应的行。我还希望只保留 hallmark.df 数据框中非 NA 列数多于 25 且少于 500 的行。

# Asker's attempt: by() groups hallmark.df on its row index, so the anonymous
# function is called once per row; the result is stored in subset.df.
# NOTE(review): as written, ifelse() receives only its `test` argument (no
# yes/no values), and the %in% lookup is made against the whole hallmark.df
# rather than the per-row `row` argument, which is never used.
subset.df <- by(hallmark.df, seq_len(nrow(hallmark.df)), function(row) 
  ifelse(all.deg[rownames(all.deg) %in% hallmark.df,]))

字符串
输入:
all.deg

> dput(all.deg[1:5,1:5])
structure(c(16.0169585624867, 14.3983080662428, 12.7844219145156, 
12.6674945373237, 13.8584047354367, 13.563719599839, 13.6166993468069, 
12.9748157402651, 12.7386065050292, 12.2201616898331, 11.3657998135948, 
11.8253392160132, 12.1132082166185, 11.5123143882139, 10.2967924742924, 
13.7513874043739, 13.2403954818698, 12.4196432226432, 12.4676109090624, 
12.1390647972695, 12.3013113392588, 12.4867673484914, 11.3693921877853, 
10.6359730348998, 10.0122721528039), dim = c(5L, 5L), dimnames = list(
    c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"), c("TCGA.2K.A9WE.01", 
    "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J6.01", 
    "TCGA.2Z.A9J7.01")))

> dput(hallmark.df[1:5,1:5])
structure(list(V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"
), V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"), V5 = c("ATF3", 
"GBE1", "IDI1", "KIF11", "JAG2"), V6 = c("NFKBIA", "PFKL", "FDFT1", 
"KIF23", "NOTCH1"), V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", 
"DLL1")), row.names = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_HYPOXIA", 
"HALLMARK_CHOLESTEROL_HOMEOSTASIS", "HALLMARK_MITOTIC_SPINDLE", 
"HALLMARK_WNT_BETA_CATENIN_SIGNALING"), class = "data.frame")


预期输出:
HALLMARK_TNFA_SIGNALING_VIA_NFKB
| | TCGA.2K.A9WE.01 | TCGA.2Z.A9J1.01 | TCGA.2Z.A9J3.01 | TCGA.2Z.A9J6.01 | TCGA.2Z.A9J7.01 |
| -- | -- | -- | -- | -- | -- |
| JUNB | 16.0169585624867 | 13.563719599839 | 11.3657998135948 | 13.7513874043739 | 12.3013113392588 |
| CD74 | 13.8584047354367 | 12.2201616898331 | 10.2967924742924 | 12.1390647972695 | 10.0122721528039 |
HALLMARK_WNT_BETA_CATENIN_SIGNALING
| | TCGA.2K.A9WE.01 | TCGA.2Z.A9J1.01 | TCGA.2Z.A9J3.01 | TCGA.2Z.A9J6.01 | TCGA.2Z.A9J7.01 |
| -- | -- | -- | -- | -- | -- |
| ALDOA | 14.3983080662428 | 13.6166993468069 | 11.8253392160132 | 13.2403954818698 | 12.4867673484914 |
| JUNB | 16.0169585624867 | 13.563719599839 | 11.3657998135948 | 13.7513874043739 | 12.3013113392588 |

# Desired final object: the two per-hallmark subsets shown above, collected in
# a single (unnamed) list.
df.list <- list(HALLMARK_TNFA_SIGNALING_VIA_NFKB, HALLMARK_WNT_BETA_CATENIN_SIGNALING)


相关问题:How to subset a dataframe based on matches to another dataframe?

5lhxktic

5lhxktic1#

我们可以将行名称转换为列,然后通过{tidyverse}很容易就可以完成:

# Example data: 5 x 5 numeric matrix. Row names are the keys that get looked
# up (JUNB, ALDOA, ...); column names are the TCGA.* labels.
# matrix() fills column by column, so each group below is one column.
all.deg <- matrix(
  c(
    # TCGA.2K.A9WE.01
    16.0169585624867, 14.3983080662428, 12.7844219145156,
    12.6674945373237, 13.8584047354367,
    # TCGA.2Z.A9J1.01
    13.563719599839, 13.6166993468069, 12.9748157402651,
    12.7386065050292, 12.2201616898331,
    # TCGA.2Z.A9J3.01
    11.3657998135948, 11.8253392160132, 12.1132082166185,
    11.5123143882139, 10.2967924742924,
    # TCGA.2Z.A9J6.01
    13.7513874043739, 13.2403954818698, 12.4196432226432,
    12.4676109090624, 12.1390647972695,
    # TCGA.2Z.A9J7.01
    12.3013113392588, 12.4867673484914, 11.3693921877853,
    10.6359730348998, 10.0122721528039
  ),
  nrow = 5L,
  ncol = 5L,
  dimnames = list(
    c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"),
    c(
      "TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01",
      "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01"
    )
  )
)
# Example data: one row per HALLMARK_* set, columns V3..V7 hold the member
# names of that set as character strings.
hallmark.df <- data.frame(
  V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"),
  V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"),
  V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
  V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
  V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1"),
  row.names = c(
    "HALLMARK_TNFA_SIGNALING_VIA_NFKB",
    "HALLMARK_HYPOXIA",
    "HALLMARK_CHOLESTEROL_HOMEOSTASIS",
    "HALLMARK_MITOTIC_SPINDLE",
    "HALLMARK_WNT_BETA_CATENIN_SIGNALING"
  ),
  stringsAsFactors = FALSE
)

library(tidyverse)
# Build one table per hallmark set, containing the all.deg rows whose row name
# occurs in that set. Evaluated in local() so the helper objects do not leak
# into the workspace; the resulting list is still auto-printed.
local({
  # Lookup table: all.deg with its row names moved into a "value" key column.
  expr_tbl <- as_tibble(all.deg, rownames = "value")

  # Long format: one line per (set, member) pair; the set name lives in "row".
  set_members <- pivot_longer(as_tibble(hallmark.df, rownames = "row"), -row)

  # Keep only members present in all.deg, then cut the table up per set.
  per_set <- split(inner_join(set_members, expr_tbl, by = "value"), ~row)

  # Drop the bookkeeping columns and turn the member names back into row names.
  map(per_set, function(tbl) {
    column_to_rownames(select(tbl, -row, -name), "value")
  })
})
#> $HALLMARK_HYPOXIA
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB        16.01696        13.56372         11.3658        13.75139
#>      TCGA.2Z.A9J7.01
#> JUNB        12.30131
#> 
#> $HALLMARK_TNFA_SIGNALING_VIA_NFKB
#>       TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB         16.01696        13.56372        11.36580        13.75139
#> ALDOA        14.39831        13.61670        11.82534        13.24040
#>       TCGA.2Z.A9J7.01
#> JUNB         12.30131
#> ALDOA        12.48677
#> 
#> $HALLMARK_WNT_BETA_CATENIN_SIGNALING
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74         13.8584        12.22016        10.29679        12.13906
#>      TCGA.2Z.A9J7.01
#> CD74        10.01227

字符串
创建于2023-07-18,使用reprex v2.0.2

lskq00tm

lskq00tm2#

这可能不是解决问题的“最佳”方法,但这里有一个潜在的解决方案:

library(tidyverse)
# Example data: 5 x 5 numeric matrix. Row names are the keys that get looked
# up (JUNB, ALDOA, ...); column names are the TCGA.* labels.
# matrix() fills column by column, so each group below is one column.
all.deg <- matrix(
  c(
    # TCGA.2K.A9WE.01
    16.0169585624867, 14.3983080662428, 12.7844219145156,
    12.6674945373237, 13.8584047354367,
    # TCGA.2Z.A9J1.01
    13.563719599839, 13.6166993468069, 12.9748157402651,
    12.7386065050292, 12.2201616898331,
    # TCGA.2Z.A9J3.01
    11.3657998135948, 11.8253392160132, 12.1132082166185,
    11.5123143882139, 10.2967924742924,
    # TCGA.2Z.A9J6.01
    13.7513874043739, 13.2403954818698, 12.4196432226432,
    12.4676109090624, 12.1390647972695,
    # TCGA.2Z.A9J7.01
    12.3013113392588, 12.4867673484914, 11.3693921877853,
    10.6359730348998, 10.0122721528039
  ),
  nrow = 5L,
  ncol = 5L,
  dimnames = list(
    c("JUNB", "ALDOA", "HLA.A", "THBD", "CD74"),
    c(
      "TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01",
      "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01"
    )
  )
)

# Example data: one row per HALLMARK_* set, columns V3..V7 hold the member
# names of that set as character strings.
hallmark.df <- data.frame(
  V3 = c("JUNB", "PGK1", "FDPS", "ARHGEF2", "CD74"),
  V4 = c("CXCL2", "PDK1", "CYP51A1", "CLASP1", "CTNNB1"),
  V5 = c("ATF3", "GBE1", "IDI1", "KIF11", "JAG2"),
  V6 = c("NFKBIA", "PFKL", "FDFT1", "KIF23", "NOTCH1"),
  V7 = c("ALDOA", "JUNB", "DHCR7", "ALS2", "DLL1"),
  row.names = c(
    "HALLMARK_TNFA_SIGNALING_VIA_NFKB",
    "HALLMARK_HYPOXIA",
    "HALLMARK_CHOLESTEROL_HOMEOSTASIS",
    "HALLMARK_MITOTIC_SPINDLE",
    "HALLMARK_WNT_BETA_CATENIN_SIGNALING"
  ),
  stringsAsFactors = FALSE
)

# For every gene set -- i.e. every ROW of hallmark.df -- collect the rows of
# all.deg whose row name occurs among that set's values.
# The loop must run over rows: indexing columns (hallmark.df[, i]) while
# labelling the result with rownames(hallmark.df)[i] only runs on a square
# table and attaches the wrong set name to each subset.
output <- vector("list", nrow(hallmark.df))
names(output) <- rownames(hallmark.df)
for (i in seq_len(nrow(hallmark.df))) {
  set_genes <- unlist(hallmark.df[i, ], use.names = FALSE)
  # drop = FALSE keeps a single (or empty) match as a matrix, so nrow() below
  # always returns an integer instead of NULL.
  output[[i]] <- all.deg[rownames(all.deg) %in% set_genes, , drop = FALSE]
}

# Remove gene sets without any matching row from the list
clean_list <- output[vapply(output, nrow, integer(1)) > 0L]

# export the matrices to the global env (clean_list itself is already the
# named list of per-set subsets, if a list is all that is needed)
list2env(clean_list, envir = .GlobalEnv)
#> <environment: R_GlobalEnv>

# Anchored pattern: ls() takes a regular expression, not a glob
ls(pattern = "^HALLMARK_")
#> [1] "HALLMARK_HYPOXIA"                    "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
#> [3] "HALLMARK_WNT_BETA_CATENIN_SIGNALING"

HALLMARK_TNFA_SIGNALING_VIA_NFKB
#>       TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> JUNB         16.01696        13.56372        11.36580        13.75139
#> ALDOA        14.39831        13.61670        11.82534        13.24040
#>       TCGA.2Z.A9J7.01
#> JUNB         12.30131
#> ALDOA        12.48677
HALLMARK_WNT_BETA_CATENIN_SIGNALING
#>      TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J6.01
#> CD74         13.8584        12.22016        10.29679        12.13906
#>      TCGA.2Z.A9J7.01
#> CD74        10.01227

字符串
创建于2023-07-18,使用reprex v2.0.2

编辑1

要只保留 hallmark.df 数据框中非 NA 列数多于 25 且少于 500 的行,您可以使用:

# Keep only the rows of hallmark.df that have more than 25 and fewer than 500
# non-NA cells. Both bounds have to hold at once, hence `&`: with `|` every
# row satisfies at least one of the two conditions and nothing is removed.
# NOTE(review): this counts NA cells only; if unused cells are stored as empty
# strings instead of NA they would have to be converted first -- confirm
# against how hallmark.df was read in.
hallmark.df %>%
  filter(rowSums(!is.na(.)) > 25 & rowSums(!is.na(.)) < 500)

相关问题