R语言 总结每组显著上调和下调基因的数量

hgncfbus  于 2023-04-18  发布在  其他
关注(0)|答案(1)|浏览(234)

在差异表达值的数据框架中,对每组显著上调和下调的基因进行计数。显著性应通过FDR(假发现率=来自Benjamini的调整p值)和倍数变化来定义。结果应为每组上调和下调的曲线图。(甜蜜的奖励:在图中显示不同的Fc水平(例如:0.5,1,2,4,〉4).我的解决方案似乎太复杂了,一定有更简单的方法。

示例数据

# create dex df
gene_creator <- paste("gene",1:1000,sep="")
genes = sample(gene_creator,100)

dex_A <- data.frame(
  gene = genes,
  group = "group_A",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)

dex_B <- data.frame(
  gene = genes,
  group = "group_B",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)

dex_C <- data.frame(
  gene = genes,
  group = "group_C",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)

dex_D <- data.frame(
  gene = genes,
  group = "group_D",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)

dex_df <- rbind(dex_A, dex_B, dex_C, dex_D)

溶液

library("tidyverse")

# FC up
dex_up <- dex_df %>% 
  group_by(group) %>%
  filter(FDR <= 0.05) %>% 
  filter(logFC > 0.5 ) %>%
  summarise(n_up = n())

# Fc down
dex_down <- dex_df %>% 
  group_by(group) %>%
  filter(FDR <= 0.05) %>% 
  filter(logFC < 0.5 ) %>%
  summarise(n_down = n())

# format
dex_comb <- left_join(dex_up, dex_down, by = c("group"))
dex_comb$n_down <- dex_comb$n_down * -1
dex_comb_long <- dex_comb %>% pivot_longer(!group, names_to = "direction", values_to = "n")

# plot
dex_comb_long %>%
  ggplot(aes(x = group, y = n, fill = direction)) + 
  geom_bar(stat="identity", position="identity") +
  geom_text(aes(label=n, vjust = -sign(n))) +
  ggtitle("Dex numbers per group")
axr492tv

axr492tv1#

计算满足条件的次数的常用方法是sum()该条件:

dex_summary = dex_df %>%
  group_by(group) %>%
  summarize(
    n_up = sum(FDR <= 0.05 & logFC > 0.5),
    n_down = -sum(FDR <= 0.05 & logFC < 0.5)
  ) %>%
  pivot_longer(-group, names_to = "direction", values_to = "n")

# plot
dex_summary %>%
  ggplot(aes(x = group, y = n, fill = direction)) + 
  ## using geom_col() instead of geom_bar(stat = "identity")
  geom_col() +
  geom_text(aes(label=n, vjust = -sign(n))) +
  ## adding a little padding to the y scale for the numbers
  scale_y_continuous(expand = expansion(add = 0.5)) +
  ggtitle("Dex numbers per group")

以下是简化的示例数据:

set.seed(47)
gene_creator <- paste("gene",1:100,sep="")
genes = sample(gene_creator,8)

dex_A <- data.frame(
  gene = genes,
  group = "group_A",
  logFC = sample(c(-5:5), replace=T, size=8),
  FDR = sample(c(0.01,1), replace=T, size=8)
)

dex_B <- data.frame(
  gene = genes,
  group = "group_B",
  logFC = sample(c(-5:5), replace=T, size=8),
  FDR = sample(c(0.01,1), replace=T, size=8)
)

dex_df <- rbind(dex_A, dex_B)

我建议在图中包含logFC值:

## re-ran sample data with 20 samples per group
dex_df %>%
  filter(FDR <= 0.05 & abs(logFC) > 0.5) %>%
  count(group, logFC) %>%
  mutate(
    direction = sign(logFC),
    n_dir = n * sign(direction)
  ) %>%
  ggplot(aes(x = factor(logFC), y = n_dir, fill = factor(direction))) +
  geom_col() +
  facet_wrap(~group)

相关问题