如何在R中将每个ID多行转换为每个ID单行?

eqzww0vc  于 2023-06-19  发布在  其他
关注(0)|答案(3)|浏览(119)

我有9个月的患者数据。数据集包括4列-患者ID、访视日期、收缩压(SBP)和舒张压(DBP)。患者ID对每个人都是唯一的。每名患者都曾多次到诊所就诊。每次患者访视时,均进行SBP和DBP计数。
样本数据集:

structure(list(PAT_ID = c("PAT134844", "PAT134845", "PAT122062", "PAT134846", "PAT110934", "PAT134844","PAT134845", "PAT134846", "PAT110934", "PAT134846", "PAT122062", "PAT134845", "PAT134844"), 
               VISIT.DATE = c("1/2/19", "1/2/19", "1/2/19", "1/3/19", "1/3/19", "1/4/19", "3/5/19", "4/5/19", "4/5/19", "6/7/19", "6/7/19", "8/8/19", "8/8/19"),
               BP_SYSTOLIC = c("108", "122", "126", "125", "167", "125", "111", "115", "120", "130", "125", "135", "110"),
               BP_DIASTOLIC = c("70", "82", "77", "85", "89", "71", "78", "75", "89", "64", "61", "68", "77")), class = "data.frame", row.names = c(NA, -13L))

我希望我的结果表是-
| 患者ID|日期1| SBP 1| DBP 1|日期2| SBP 2| DBP 2|日期3|所以..|
| - -----|- -----|- -----|- -----|- -----|- -----|- -----|- -----|- -----|
| CH1234| 1月1日|一百二十|八十|五月五日|一百一十五|六十|7月7日||
| CH1235| 1月2日|一百三十|九十|六月六日|一百一十|七十|8月8日||

tuwxkamq

tuwxkamq1#

下面的代码生成您要查找的列,假设dat是您的数据:

library(tidyverse)
dat |> 
  group_by(PAT_ID) |> 
  arrange(VISIT.DATE) |> 
  mutate(num_visit = paste0("Date: ", row_number())) |> 
  pivot_wider(names_from = num_visit,
              values_from = c(VISIT.DATE, BP_SYSTOLIC, BP_DIASTOLIC)) |> 
  ungroup()

其给出:

# A tibble: 5 × 10
# Groups:   PAT_ID [5]
  PAT_ID    `VISIT.DATE_Date: 1` `VISIT.DATE_Date: 2` `VISIT.DATE_Date: 3` `BP_SYSTOLIC_Date: 1` `BP_SYSTOLIC_Date: 2`
  <chr>     <chr>                <chr>                <chr>                <chr>                 <chr>                
1 PAT134844 1/2/19               1/4/19               8/8/19               108                   125                  
2 PAT134845 1/2/19               3/5/19               8/8/19               122                   111                  
3 PAT122062 1/2/19               6/7/19               NA                   126                   125                  
4 PAT134846 1/3/19               4/5/19               6/7/19               125                   115                  
5 PAT110934 1/3/19               4/5/19               NA                   167                   120                  
# ℹ 4 more variables: `BP_SYSTOLIC_Date: 3` <chr>, `BP_DIASTOLIC_Date: 1` <chr>, `BP_DIASTOLIC_Date: 2` <chr>,
#   `BP_DIASTOLIC_Date: 3` <chr>

如果需要,可以重新排序/重命名列。

kkbh8khc

kkbh8khc2#

使用aggregate对数据进行分组,unnest_wider来自tidyr获得宽格式,然后使用order获得所需排列的方法

library(dplyr)
library(tidyr)

res <- aggregate(. ~ PAT_ID, df, c) %>% 
  unnest_wider(-PAT_ID, names_sep="_")

cbind(res[1], res[-1][order(sub(".*_(\\d+)", "\\1", colnames(res)[-1]))])
     PAT_ID VISIT.DATE_1 BP_SYSTOLIC_1 BP_DIASTOLIC_1 VISIT.DATE_2
1 PAT110934       1/3/19           167             89       4/5/19
2 PAT122062       1/2/19           126             77       6/7/19
3 PAT134844       1/2/19           108             70       1/4/19
4 PAT134845       1/2/19           122             82       3/5/19
5 PAT134846       1/3/19           125             85       4/5/19
  BP_SYSTOLIC_2 BP_DIASTOLIC_2 VISIT.DATE_3 BP_SYSTOLIC_3 BP_DIASTOLIC_3
1           120             89         <NA>          <NA>           <NA>
2           125             61         <NA>          <NA>           <NA>
3           125             71       8/8/19           110             77
4           111             78       8/8/19           135             68
5           115             75       6/7/19           130             64
tkqqtvp1

tkqqtvp13#

这接近于How to reshape data from long to wide formatReshape multiple value columns to wide format的复制,尽管两个答案都没有多列names_from=

dplyr

这是从长到宽的重塑。

library(dplyr)
library(tidyr)
quux %>%
  group_by(PAT_ID) %>%
  mutate(rn = row_number()) %>%
  ungroup() %>%
  pivot_wider(id_cols = PAT_ID, names_from = "rn",
              values_from = c("VISIT.DATE", "BP_SYSTOLIC", "BP_DIASTOLIC"), 
              names_sep = "_") %>%
  rename_with(.cols = matches("[0-9]"),
    .fn = function(z) case_when(
      grepl("VISIT", z) ~ gsub(".*_", "Date_", z), 
      grepl("SYSTOLIC", z) ~ gsub(".*_", "SBP_", z), 
      grepl("DIASTOLIC", z) ~ gsub(".*_", "DBP_", z))
  )
# # A tibble: 5 × 10
#   PAT_ID    Date_1 Date_2 Date_3 SBP_1 SBP_2 SBP_3 DBP_1 DBP_2 DBP_3
#   <chr>     <chr>  <chr>  <chr>  <chr> <chr> <chr> <chr> <chr> <chr>
# 1 PAT134844 1/2/19 1/4/19 8/8/19 108   125   110   70    71    77   
# 2 PAT134845 1/2/19 3/5/19 8/8/19 122   111   135   82    78    68   
# 3 PAT122062 1/2/19 6/7/19 NA     126   125   NA    77    61    NA   
# 4 PAT134846 1/3/19 4/5/19 6/7/19 125   115   130   85    75    64   
# 5 PAT110934 1/3/19 4/5/19 NA     167   120   NA    89    89    NA

data.table

library(data.table)
dcast(as.data.table(quux)[, n := seq(.N), PAT_ID], PAT_ID ~ n, value.var = c("VISIT.DATE", "BP_SYSTOLIC", "BP_DIASTOLIC"))
#       PAT_ID VISIT.DATE_1 VISIT.DATE_2 VISIT.DATE_3 BP_SYSTOLIC_1 BP_SYSTOLIC_2 BP_SYSTOLIC_3 BP_DIASTOLIC_1
#       <char>       <char>       <char>       <char>        <char>        <char>        <char>         <char>
# 1: PAT110934       1/3/19       4/5/19         <NA>           167           120          <NA>             89
# 2: PAT122062       1/2/19       6/7/19         <NA>           126           125          <NA>             77
# 3: PAT134844       1/2/19       1/4/19       8/8/19           108           125           110             70
# 4: PAT134845       1/2/19       3/5/19       8/8/19           122           111           135             82
# 5: PAT134846       1/3/19       4/5/19       6/7/19           125           115           130             85
# 2 variables not shown: [BP_DIASTOLIC_2 <char>, BP_DIASTOLIC_3 <char>]

你可以像上面那样重命名。
数据

quux <- structure(list(PAT_ID = c("PAT134844", "PAT134845", "PAT122062", "PAT134846", "PAT110934", "PAT134844", "PAT134845", "PAT134846", "PAT110934", "PAT134846", "PAT122062", "PAT134845", "PAT134844"), VISIT.DATE = c("1/2/19", "1/2/19", "1/2/19", "1/3/19", "1/3/19", "1/4/19", "3/5/19", "4/5/19", "4/5/19", "6/7/19", "6/7/19", "8/8/19", "8/8/19"), BP_SYSTOLIC = c("108", "122", "126", "125", "167", "125", "111", "115", "120", "130", "125", "135", "110"), BP_DIASTOLIC = c("70", "82", "77", "85", "89", "71",  "78", "75", "89", "64", "61", "68", "77")), class = "data.frame", row.names = c(NA, -13L))

相关问题