在 R 中以 tidy(整洁)方式向量化 fisher.test

qoefvg9y  于 2023-04-03  发布在  其他
关注(0)|答案(4)|浏览(91)

我想vectorize fisher.test函数,这样我就可以将它并行应用于多个变量。
例如:构建玩具数据集

library(tidyverse)
library(broom)
# Toy dataset: one binary outcome and two discrete features, 20 rows.
# Fix the RNG seed so the p-values computed from this data are reproducible.
set.seed(42)
n <- 20
# NOTE(review): size = c(0, 1) is recycled across the n draws, so every other
# outcome is drawn with size 0 and is therefore always 0 — confirm this is
# intended rather than size = 1.
outcome <- rbinom(n, size = c(0, 1), prob = 0.7)
feature1 <- rbinom(n, size = 2, prob = 0.5) + 10
feature2 <- rbinom(n, size = 1, prob = 0.5) + 20
df <- tibble(outcome, feature1, feature2)

这是针对单个特征(feature1)的预期输出:

# Expected result for a single feature: cross-tabulate outcome x feature1 and
# run Fisher's exact test on the resulting counts.
df %>%
  count(outcome, feature1) %>%
  # values_fill keeps empty outcome/feature combinations as 0 instead of NA,
  # which fisher.test() does not accept.
  pivot_wider(names_from = feature1, values_from = n, values_fill = 0L) %>%
  # Drop the label column so that only the count columns form the contingency
  # table; otherwise the 0/1 outcome codes are treated as cell counts.
  select(-outcome) %>%
  fisher.test() %>%
  tidy()

我尝试了group_modify来将它并行应用于几个不同的变量,但它不起作用:

# Attempt (does not work): reshape to long form, then try to build one
# contingency table per feature inside group_modify().
# The lambda `~count(outcome, value)` never references the group data (`.x`),
# so `outcome` is passed as count()'s first argument instead of a data frame;
# this is consistent with the reported "no applicable method for 'count'"
# error on an integer vector.
# NOTE(review): the pivot_wider()/fisher.test() steps sit outside
# group_modify(), so they would run once on the combined result rather than
# once per variable — confirm before reusing this pattern.
df %>% 
  pivot_longer(c("feature1", "feature2"), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  group_modify(~count(outcome, value)) %>%
  pivot_wider(names_from = value, values_from = n) %>%
  fisher.test(.) %>%
  tidy()

我得到错误:no applicable method for 'count' applied to an object of class "c('integer', 'numeric')"
理想情况下,我的最终输出是:

# Desired final result: one row per feature with its Fisher exact test p-value.
# Feature names are quoted strings; bare `feature1` / `feature2` would be
# evaluated as the vectors created for the toy dataset rather than as labels.
all_pvalues <- tribble(
  ~variable,  ~p.value,
  "feature1", 0.805,
  "feature2", 0.582
)

先谢谢你了,

b4qexyjb

b4qexyjb1#

使用base R

# Base R: Fisher exact test of `outcome` against every other column of `df`,
# returned as a two-column data frame (feature name first, then p-value).
df[-1] |>
  lapply(function(feature) fisher.test(df$outcome, feature)$p.value) |>
  stack() |>
  subset(select = c(ind, values))
       ind    values
1 feature1 0.5939033
2 feature2 0.1576883
tmb3ates

tmb3ates2#

# Pick the feature columns by name prefix.
cols <- grep("^feature", names(df), value = TRUE)

# One Fisher exact test per feature against `outcome`. vapply() guarantees a
# plain numeric vector with exactly one p-value per selected column (and an
# empty vector when no column matches), avoiding the transpose/matrix
# round-trip and the bare external vector inside across().
tibble(
  feature = cols,
  pvalue = vapply(
    df[cols],
    function(x) fisher.test(df$outcome, x)$p.value,
    numeric(1),
    USE.NAMES = FALSE
  )
)

输出:

feature  pvalue
  <chr>     <dbl>
1 feature1  0.837
2 feature2  0.642
3ks5zfa0

3ks5zfa03#

我终于找到了解决办法:

# Per-feature Fisher exact test: reshape to long form, then build and test one
# contingency table per `variable` group.
df %>%
  pivot_longer(-outcome, names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  group_modify(function(feature_data, key) {
    # Building the table inside the group keeps only the levels of this
    # feature, so no zero-padded columns from other features are tested.
    feature_data %>%
      count(outcome, value) %>%
      # Missing outcome/value combinations are true zero counts, not NA.
      pivot_wider(names_from = value, values_from = n, values_fill = 0L) %>%
      # Only count columns may enter the contingency table.
      select(-outcome) %>%
      fisher.test() %>%
      tidy()
  }) %>%
  ungroup() %>%
  select(variable, p.value)
bq9c1y66

bq9c1y664#

使用 group_split 和 map_dfr 的一种方法:

library(dplyr)
library(broom)
library(tidyr)
library(purrr)
# Per-feature Fisher exact test via group_split() + map_dfr(): one contingency
# table (outcome x feature value) is built and tested for each feature.
df |>
  pivot_longer(
    c("feature1", "feature2"),
    names_to = "variable",
    values_to = "value"
  ) |>
  group_split(variable) |>
  # group_split() drops the group labels, so the list is named by hand.
  # NOTE(review): this assumes the groups come back in the order
  # feature1, feature2 — confirm if more features are added.
  set_names(c("feature1", "feature2")) |>
  map_dfr(
    \(feature_data) {
      feature_data |>
        count(outcome, value) |>
        # Missing outcome/value combinations are zero counts, not NA.
        pivot_wider(names_from = value, values_from = n, values_fill = 0L) |>
        # Drop the label column: otherwise its 0/1 codes are treated as cell
        # counts in the matrix handed to fisher.test().
        select(-outcome) |>
        fisher.test() |>
        tidy()
    },
    .id = "variable"
  ) |>
  select(variable, p.value)

输出:

# A tibble: 2 × 2
  variable p.value
  <chr>      <dbl>
1 feature1  0.0991
2 feature2  0.438

相关问题