R pivot_widder,带有多个标题

ttp71kqs  于 2023-01-18  发布在  其他
关注(0)|答案(1)|浏览(115)

我有一个数据集,看起来像这样:
| 投资回报率|集团|识别号|每面积检测数|
| - ------|- ------|- ------|- ------|
| 脑干|A类|京科002|0.0327285333827735美元|
| 小脑|A类|京科002|0.149208050073911|
| 脑干|A类|京科002|0.0336959892253705美元|
| 小脑|A类|京科002|无|
已截断:此处为完整数据集

structure(list(ROI = c("Brainstem", "Cerebellum", "Brainstem", 
"Cerebellum", "Brainstem", "Cerebellum", "Brainstem", "Cerebellum", 
"Brainstem", "Brainstem", "Brainstem", "Cerebellum", "Brainstem", 
"Brainstem", "Cerebellum", "Brainstem", "Brainstem", "Brainstem", 
"Cerebellum", "Cerebellum", "Brainstem", "Brainstem", "Brainstem", 
"Cerebellum", "Brainstem", "Brainstem", "Cerebellum", "Cerebellum", 
"Brainstem", "Brainstem", "Cerebellum", "Brainstem", "Brainstem", 
"Brainstem", "Cerebellum", "Cerebellum", "Brainstem", "Brainstem", 
"Brainstem", "Brainstem", "Cerebellum", "Brainstem", "Cerebellum"
), Group = c("A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "C", "C", "C", "C", "C", "C", "C", "C", 
"C", "C"), ID = c("JK002", "JK002", "JK002", "JK002", 
"JK003", "JK003", "JK003", "JK003", "JK003", "JK004", "JK004", 
"JK004", "JK004", "JK005", "JK005", "JK005", "JK005", "JK006", 
"JK006", "JK006", "JK006", "JK006", "JK007", "JK007", "JK007", 
"JK008", "JK008", "JK008", "JK008", "JK008", "JK011", "JK011", 
"JK011", "JK011", "JK009", "JK009", "JK009", "JK009", "JK009", 
"JK010", "JK010", "JK010", "JK010"), Detections_per_area = c(0.0327285333827735, 
0.149208050073911, 0.0336959892253705, 0, 0.615965559353422, 
0.117064703241855, 0.171329148144879, 0, 0.131086542762028, 0.143700717389906, 
0.0590510074394172, 0, 0.0610284572370045, 0.358989653774582, 
0.113758644699529, 0.289455536567144, 0, 0.0355596373594045, 
0, 0.0661718135522336, 0.243313220125351, 0.373564943266101, 
0.591264576854053, 0.322774099148268, 0.725656324981598, 0.46343627586687, 
0, 0.227796467592069, 0.0876146804249713, 0.297011825126973, 
0.163792893059129, 0.336883295806429, 1.08791432091601, 0.370788052321182, 
1.49652281121774, 1.18495456669418, 3.36132331547411, 0.951628515574163, 
1.63575637052095, 0.408866067869112, 0, 1.47956381894319, 0.368780820643061
)), row.names = c(NA, -43L), class = "data.frame")

我尝试使用R将其转换为Graphpad棱镜友好格式。这是预期的输出,对应于Detections_per_Area的值填充在(x)中。令人讨厌的是,棱镜要求所有组的大小相同,无论值的数量如何,因此下面为空。
| | A类|A类|A类|A类|A类|乙|乙|乙|乙|乙|C级|C级|C级|C级|C级|
| - ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|- ------|
| | 京科002|京科002|京科003|JK004|JK005|京科001|JK009|京科010|无效|无效|JK006|JK007|JK008|京科011|无效|
| 脑干|x|x|x|x|x|x|x|x|无效|无效|x|x|x|x|无效|
| 小脑|x||x|x|x|x|x|x|无效|无效|x|x|x|x|无效|
如果没有ID列,我可以使用以下代码生成预期的输出:

library(tidyr)
library(dplyr)

Count<-data_source %>% count(ROI,Group, sort = TRUE)
highest = as.numeric(Count[1,'n'])
range = seq(1, highest)

  data_wide <- data_source %>% arrange(Group) %>% 
    group_by(ROI,Group) %>% mutate(rn = row_number()) %>% complete(rn = range) %>% 
    pivot_wider(
      names_from = c(Group, rn),
      values_from = Detections_per_area
)
  
names(data_wide) = gsub(pattern = "_.*", replacement = "", x = names(data_wide))
names(data_wide) = gsub(pattern = "ROI", replacement = "", x = names(data_wide))

这将对最大的组进行计数,然后在透视之前生成与之对应的行号。
任何关于最好的方法的建议都是很好的,我想在名称中添加ID,然后删除后透视,但是没有任何运气。

8fsztsew

8fsztsew1#

正如我在评论中所说的,我对如何重用这些ID感到不舒服。从我所知道的来看,没有好的方法来确定“0.0327285333827735”(第1行)应与“0.149208050073911”关联(第3行)或“0”(在第4行)。下面的片段正确地处理了这个问题,假设顺序是有意义的。但是我真的很不舒服“脑干+A+ JK 003”有三行,而“Cerebellum+A+ JK 003”有两个,我强烈建议不要重用这些ID。
我假设这是可以修复的。这就是这个块所模拟的。一个ID不能在ROI-by-Group组合中重用。

data_source <-
  data_source |> 
  dplyr::group_by(ROI, Group, ID) |> 
  dplyr::slice_head() |> 
  dplyr::ungroup()

首先平衡数据集,使每个ROIxGroup具有相同的行数。使用“Null”占位符,如您在上面指定的。

# Determine structure (including the placeholders with "Null")
ds_skeleton <-
  data_source |> 
  dplyr::select(ROI, Group) |>
  dplyr::group_by(ROI, Group) |> 
  dplyr::mutate(
    rn = dplyr::row_number(Group),
  ) |> 
  dplyr::ungroup() |> 
  tidyr::complete(ROI, Group, rn)

# Order the observed data (with holes)
d2 <-
  data_source |> 
  dplyr::select(
    ROI, 
    Group, 
    ID, 
    y = Detections_per_area,    # Shorten to help with SO's narrow window.
  ) |>
  dplyr::group_by(ROI, Group) |> 
  dplyr::mutate(
    rn    = dplyr::row_number(Group),
  ) |> 
  dplyr::ungroup()

然后将结构与值连接。

ds_long <-
  ds_skeleton |> 
  dplyr::left_join(d2, by = c("ROI", "Group", "rn")) |> 
  dplyr::mutate(
    ID  = dplyr::coalesce(ID, "Null"),
    # tag = paste(Group, rn, ID, sep = "_"),
    tag = sprintf(
      "%s_%03i_%s", # Pad the rn to accommodate "001" to "999"
      Group, rn, ID
    ),
    y  = dplyr::if_else(ID == "Null", "Null", as.character(y)),
  ) |> 
  dplyr::select(
    ROI,
    tag,
    y,
  )

# Pivot the values to the wide format
ds_wide <- 
  ds_long |> 
  tidyr::pivot_wider(
    id_cols     = "ROI",
    names_from  = "tag",
    values_from = "y"
  )

最后,构造头部并将其堆叠在真实的数据之上(可能有一种更简洁的方法来处理dplyr::across())。

# Construct the two header rows
pattern <- "^([A-Z])_\\d{3}_(.+)$" # Extracting info from tag
ds_header <- 
  ds_wide |> 
  dplyr::slice(0) |> # Drop the rows b/c we want only the column header
  tibble::add_row(ROI = "header") |>
  tidyr::pivot_longer(
    cols      = -"ROI",
    names_to  = "tag",
    values_to = "header_1"
  ) |>
  dplyr::mutate(
    header_1 = sub(pattern, "\\1", tag),
    header_2 = sub(pattern, "\\2", tag),
  ) |> 
  dplyr::select(-ROI) |>
  tidyr::pivot_longer(
    cols = c("header_1", "header_2"), names_to = "ROI") |>
  tidyr::pivot_wider(names_from = "tag") |>
  dplyr::mutate(
    ROI = ""
  )

# Combine the headers with the data
ds_header  |> 
  dplyr::union_all(ds_wide)

要创建csv,请将col_names = FALSE传递给readr::write_csv()

相关问题