在R中按行名称的两个数据框架中列之间的系统斯皮尔曼相关性

nzk0hqpo  于 2023-10-13  发布在  其他
关注(0)|答案(1)|浏览(72)

在r中有两个data.frames

set.seed(12345)
  df1 <- data.frame(a=rnorm(20,0,0.4),
                      b=rnorm(20,0.3,0.8),
                      c=rnorm(20,-0.1,0.6),
                      d=rnorm(20,-0.23,0.3),
                      e=rnorm(20,0.2,0.4))
  library(purrr)
  df1 <- as.data.frame(map_df(df1, function(x) {x[sample(c(TRUE, NA), prob = c(0.8, 0.2), 
       size = length(x), replace = TRUE)]}))
  rownames(df1) <- sample(LETTERS, 20, replace=FALSE)
  df2 <- data.frame(one=rnorm(23,6,2),
                      two=rnorm(23,8,4),
                      three=rnorm(23,12,5),
                      four=rnorm(23,4,0.4),
                      five=rnorm(23,3,0.2))
  df2 <- as.data.frame(map_df(df2, function(x) {x[sample(c(TRUE, NA), prob = c(0.7, 0.3), 
       size = length(x), replace = TRUE)]}))
  rownames(df2) <- sample(LETTERS, 23, replace=FALSE)

如何使用匹配的行id系统地确定两个data.frames中每列之间的斯皮尔曼相关性和p值?因此,在df1中的列“a”和df2中的列“one”之间,在df1中的列“a”和df2中的列“two”之间,...,在df1中的列“e”和df2中的列“five”之间,创建一个新的数据.frame。
预期结果:

df3 <- data.frame(letter=c(rep("a", 5), rep("b", 2),"..."),
                      number=c("one","two","three","four","five","one","two", "..."),
                      Spearman.r=c(-0.6352, 0.0182, 0.5944, 0.3846, -0.6606, 0.1154, 0.2364, "..."),
                      p.value=c(0.0171, 0.9730, 0.0457, 0.2183, 0.0438, 0.7097, 0.4854, "..."))
  • 我的尝试(不成功):*

我试过了,但结果与预期不同。我不知道怎么解决这个问题!!!

library(dplyr)
 # Create empty data.frame for results
 df3 <- data.frame(letter = character(),
                   number = character(),
                   Spearman.r = numeric(),
                   p.value = numeric(),
                   stringsAsFactors = FALSE)
 # Loop through each column in df1 and df2
 for (col1 in colnames(df1)) {
   for (col2 in colnames(df2)) {
 # Check for missing values in both 'x' and 'y'
     valid_rows <- !is.na(df1[[col1]]) & !is.na(df2[[col2]])
     x <- df1[[col1]][valid_rows]
     y <- df2[[col2]][valid_rows]
 # Calculate Spearman correlation and p-value
     if (length(x) > 1 & length(y) > 1) {
       result <- cor.test(x, y, method = "spearman")
 # Append the results to df3
       df3 <- df3 %>%
         add_row(letter = col1,
                 number = col2,
                 Spearman.r = result$estimate,
                 p.value = result$p.value)
     }
   }
 }

我认为上面的问题是行名称被忽略/不匹配。我们该怎么解决这个问题???

mspsb9vt

mspsb9vt1#

我会把所需统计数据的元素名称放在一个向量中,Mapcor.test在相应的列上,rbind是结果-它更简洁。

sts <- c('estimate', 'p.value')
Map(cor.test, df1, df2[1:20, ]) |> lapply(`[`, sts) |>
  sapply(do.call, what=rbind) |> t() |> `colnames<-`(sts)
#      estimate    p.value
# a  0.31461249 0.21873250
# b  0.03575649 0.90341200
# c  0.33568792 0.41627099
# d  0.53666496 0.07202434
# e -0.34248509 0.27582473

注意,这两列应该有相同的长度,我相应地对df 2进行了子集化。

  • 数据:*
> dput(df1)
structure(list(a = c(0.234211527137542, NA, -0.0437213258724216, 
-0.181398869385105, 0.242354982336157, -0.727182387081492, 0.252039420427356, 
-0.110473642090086, -0.113663897577348, -0.367728800989651, -0.0464991225408008, 
0.726924817481687, 0.148251145703182, 0.208086583021983, NA, 
0.326759935808233, -0.354543008497285, -0.132631035977021, 0.448285060667822, 
0.119489479706917), b = c(0.923697539644259, 1.46462806598149, 
-0.215462743385042, -0.942509924183749, NA, 1.74407801504866, 
NA, 0.796303841038738, 0.789698794120679, 0.170151218465499, 
0.949498542843088, 2.05746683707803, 1.93935226992495, 1.60595651158437, 
0.503416954251244, 0.692950623418047, NA, NA, 1.71418708069837, 
0.320640838918247), c = c(NA, NA, NA, 0.462284324109745, 0.412671032198332, 
0.776437641862453, NA, 0.240441952054689, 0.249912592061411, 
-0.884079300078655, -0.424231644317263, NA, -0.0678458377855273, 
0.110997704333142, NA, NA, 0.31470276372493, 0.394277197223652, 
1.18703901228817, -1.50816638701802), d = c(NA, NA, NA, NA, -0.40606387831613, 
-0.77971319186282, 0.0364418297956053, 0.248046541706882, NA, 
-0.618701504088447, -0.213615327401187, -0.465394811957463, -0.54480584568688, 
NA, 0.190811614860229, 0.0527802552074576, 0.0178774861457288, 
-0.473462147057324, -0.0871255157739038, NA), e = c(0.458153229682431, 
0.617257420620576, 0.0782523549225354, 1.19084436678519, 0.588488269214978, 
0.946839673826688, 0.468816987605915, 0.0768186476650648, 0.414609486684349, 
0.52994802614067, -0.185560591605521, NA, 0.954778776810265, 
0.0432722512019271, NA, 0.474932840222817, NA, 1.06308792660257, 
-0.0399190255299342, -0.0778186773083004)), row.names = c("B", 
"S", "H", "R", "E", "Z", "K", "N", "D", "W", "Q", "X", "F", "J", 
"T", "U", "G", "A", "M", "Y"), class = "data.frame")
> dput(df2)
structure(list(one = c(5.51605196074149, 5.0365328649962, 4.01639427148655, 
5.43870172239114, 7.26603468836034, 3.52036331378428, 9.52862813798864, 
5.95264022369398, 6.39984096765487, 8.69438555528002, 6.0721469755994, 
7.64916226113347, NA, 6.96190031053536, NA, 6.80272997501121, 
6.43035434149409, 2.3685752941944, 4.17652115153425, 5.90191061829275, 
5.18922504663637, 8.26076359569371, NA), two = c(8.30567007619257, 
NA, 9.49648433374505, 7.3163837769897, NA, 10.1740884317625, 
5.97925598859351, 11.1471831576081, 9.20379760643377, 13.2408956269895, 
NA, 11.4034417504577, 6.22572812762124, 6.21290084956168, 8.0532201671046, 
2.2554171750917, NA, 8.97408706250136, 12.2334489333839, 11.3253952716601, 
NA, 1.03314762217058, NA), three = c(12.4855211233804, 11.6163315273091, 
16.9597533885087, NA, 10.592101558378, NA, NA, NA, 15.3014668907241, 
3.38898796427443, 1.32686975825689, 12.344727990685, NA, 0.54977910270623, 
11.2490485589584, 10.6560910360448, 20.9566602057366, 15.3613402084905, 
NA, 12.0609125306219, 19.670584321922, NA, 12.3921876637446), 
    four = c(3.68829557182812, NA, NA, NA, 3.81284465221339, 
    NA, 3.74330545689202, 4.25106872929897, 4.09933205072919, 
    3.71996967331032, 3.77303936595233, 3.89544242964422, 3.57444598568537, 
    NA, 4.30844149587023, 5.09896141689949, 3.96642609716967, 
    4.21742705370011, 4.30114448425416, NA, 4.40044793895629, 
    NA, 3.42629987702757), five = c(2.94693903674438, NA, 2.91699579425768, 
    2.90808486475521, NA, 2.76829217330492, 3.14217799968969, 
    3.25352035044765, 2.97136978884987, NA, 3.29657823686164, 
    2.96748221801475, NA, 3.09660797922777, NA, 2.86728525217144, 
    2.87307002115739, NA, 3.11537007577426, 2.57738392585485, 
    NA, 3.22942543805793, 3.00295872971124)), row.names = c("B", 
"J", "U", "X", "A", "P", "I", "F", "R", "Z", "G", "K", "H", "S", 
"C", "O", "N", "Y", "T", "L", "M", "W", "D"), class = "data.frame")

相关问题