R语言 我如何将这个平均值列表编译成一个表?

8xiog9wr  于 2023-06-27  发布在  其他
关注(0)|答案(1)|浏览(95)

我需要计算几年数据的平均住院时间,取决于患者人口统计学(CPL状态和人口普查或出院状态)。我已经计算了这些的平均值,包括偶然事件,并将它们分配为值,以便我记住它们是什么。我感兴趣的是将值列表编译成一个表,这样我就可以将整个内容导出到Excel中。
这是我的资料

#2014 average
#census#
y2014_cen_all <- mean(inp[(inp$year == 2014),]$los_years)
y2014_cen_cpl <- mean(inp[(inp$year == 2014 & inp$cpl_stat == "cpl"),]$los_years)
y2014_cen_non <- mean(inp[(inp$year == 2014 & inp$cpl_stat == "non"),]$los_years)

#discharge
y2014_dis_all <- mean(inp[(inp$year == 2014 & inp$census_discharge == "disch"),]$los_years)
y2014_dis_all_count <- nrow(inp[inp$year == 2014 & inp$census_discharge == "disch",])  

y2014_dis_cpl <- mean(inp[(inp$year == 2014 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch"),]$los_years)
y2014_dis_cpl_count <- nrow(inp[inp$year == 2014 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch",])  

y2014_dis_non <- mean(inp[(inp$year == 2014 & inp$cpl_stat == "non" & inp$census_discharge == "disch"),]$los_years)
y2014_dis_non_count <- nrow(inp[inp$year == 2014 & inp$cpl_stat == "non" & inp$census_discharge == "disch",])  

#2015 average
#census#
y2015_cen_all <- mean(inp[(inp$year == 2015),]$los_years)
y2015_cen_cpl <- mean(inp[(inp$year == 2015 & inp$cpl_stat == "cpl"),]$los_years)
y2015_cen_non <- mean(inp[(inp$year == 2015 & inp$cpl_stat == "non"),]$los_years)

#discharge
y2015_dis_all <- mean(     inp[(inp$year == 2015 & inp$census_discharge == "disch"),]$los_years)
y2015_dis_all_count <- nrow(inp[inp$year == 2015 & inp$census_discharge == "disch",])  

y2015_dis_cpl <- mean(     inp[(inp$year == 2015 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch"),]$los_years)
y2015_dis_cpl_count <- nrow(inp[inp$year == 2015 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch",])  

y2015_dis_non <- mean(     inp[(inp$year == 2015 & inp$cpl_stat == "non" & inp$census_discharge == "disch"),]$los_years)
y2015_dis_non_count <- nrow(inp[inp$year == 2015 & inp$cpl_stat == "non" & inp$census_discharge == "disch",])  

#2016 average
#census#
y2016_cen_all <- mean(inp[(inp$year == 2016),]$los_years)
y2016_cen_cpl <- mean(inp[(inp$year == 2016 & inp$cpl_stat == "cpl"),]$los_years)
y2016_cen_non <- mean(inp[(inp$year == 2016 & inp$cpl_stat == "non"),]$los_years)

#discharge
y2016_dis_all <- mean(     inp[(inp$year == 2016 & inp$census_discharge == "disch"),]$los_years)
y2016_dis_all_count <- nrow(inp[inp$year == 2016 & inp$census_discharge == "disch",])  

y2016_dis_cpl <- mean(     inp[(inp$year == 2016 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch"),]$los_years)
y2016_dis_cpl_count <- nrow(inp[inp$year == 2016 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch",])  

y2016_dis_non <- mean(     inp[(inp$year == 2016 & inp$cpl_stat == "non" & inp$census_discharge == "disch"),]$los_years)
y2016_dis_non_count <- nrow(inp[inp$year == 2016 & inp$cpl_stat == "non" & inp$census_discharge == "disch",])  

#2017 average
#census#
y2017_cen_all <- mean(inp[(inp$year == 2017),]$los_years)
y2017_cen_cpl <- mean(inp[(inp$year == 2017 & inp$cpl_stat == "cpl"),]$los_years)
y2017_cen_non <- mean(inp[(inp$year == 2017 & inp$cpl_stat == "non"),]$los_years)

#discharge
y2017_dis_all <- mean(     inp[(inp$year == 2017 & inp$census_discharge == "disch"),]$los_years)
y2017_dis_all_count <- nrow(inp[inp$year == 2017 & inp$census_discharge == "disch",])  

y2017_dis_cpl <- mean(     inp[(inp$year == 2017 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch"),]$los_years)
y2017_dis_cpl_count <- nrow(inp[inp$year == 2017 & inp$cpl_stat == "cpl" & inp$census_discharge == "disch",])  

y2017_dis_non <- mean(     inp[(inp$year == 2017 & inp$cpl_stat == "non" & inp$census_discharge == "disch"),]$los_years)
y2017_dis_non_count <- nrow(inp[inp$year == 2017 & inp$cpl_stat == "non" & inp$census_discharge == "disch",])

它们都储存在环境中。
我如何创建一个看起来像这样的表:
| 年份|总平均值|CPL平均值|总计,不包括CPL平均值|
| - -----|- -----|- -----|- -----|
| 二○一四年|平均|平均|平均|
| 2015年|平均|平均|平均|
| 2016年|平均|平均|平均|
| 2017年|平均|平均|平均|

编辑

我只使用一个数据集,这里是一个例子:

下面是运行上述代码时的环境截图:

以下是dput(head(inp, n = 10))的输出,删除了敏感信息:

#> dput(head(inp, n = 10))
structure(list(unit = c("065:Ward 65 Elder Rehab Medical Unit", 
"065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit", 
"065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit", 
"065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit", 
"065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit", 
"065:Ward 65 Elder Rehab Medical Unit"), 
    admission_date = structure(c(1329091200, 1349222400, 1253059200, 
    1403568000, 1419292800, 1194393600, 1265241600, 1021420800, 
    777859200, 1357603200), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), discharge_date = c(".", ".", ".", ".", ".", ".", 
    ".", ".", ".", "."), discharge_month = c(".", ".", ".", ".", 
    ".", ".", ".", ".", ".", "."), disposition = c(".", ".", 
    ".", ".", ".", ".", ".", ".", ".", "."), los_days = c(1053, 
    820, 1933, 191, 9, 2612, 1792, 4614, 7433, 723), los_years = c(2.88493150684932, 
    2.24657534246575, 5.2958904109589, 0.523287671232877, 0.024657534246575, 
    7.15616438356164, 4.90958904109589, 12.641095890411, 20.3643835616438, 
    1.98082191780822), gender = c("Male", "Female", "Male", "Male", 
    "Female", "Male", "Male", "Female", "Female", "Male"), legal_status = c(21, 
    32, 32, 32, 20, 21, 32, 32, 32, 32), cpl_stat = c("non", 
    "non", "non", "non", "non", "non", "non", "non", "non", "non"
    ), prin_diag = c("Schizophrenia, Undifferentiated Type", 
    "Schizophrenia", "Schizophrenia", "Schizoaffective Disorder", 
    "Schizoaffective disorder, Bipolar type", "Schizoaffective disorder, Bipolar type", 
    "Schizophrenia, Paranoid Type", "Schizophrenia, Undifferentiated Type", 
    "Schizoaffective disorder, Bipolar type", "Schizophrenia, Paranoid Type"
    ), year = c(2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 
    2014, 2014), census_discharge = c("cen", "cen", "cen", "cen", 
    "cen", "cen", "cen", "cen", "cen", "cen"), inp_res = c("inp", 
    "inp", "inp", "inp", "inp", "inp", "inp", "inp", "inp", "inp"
    )), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))
xytpbqjk

xytpbqjk1#

这里最好的办法是保留原始数据集(inp),并通过对inp进行分组和整形来实现摘要。

“整洁”表

开始加载您的资源...

# Load the necessary packages...
library(dplyr)  # ...for manipulating data.

# ...
# Get your dataset: 'inp'
# ...

...然后应用此工作流:

out <- inp %>%
  # Declutter the data.
  select(year, cpl_stat, census_discharge, los_years
    
  # Group by year...
  ) %>% group_by(year
    
  # ...and add columns with yearly aggregates (listed for every row).
  ) %>% mutate(
    avg_year = mean(los_years),
    n_year = n()
    
  # Group further by census/discharge ("cendis") status...
  ) %>% group_by(census_discharge
    
  # ...and add columns with yearly aggregates by that status (for every row).
  ) %>% mutate(
    avg_year_cendis = mean(los_years),
    n_year_cendis = n()
    
  # Group further by CPL status...
  ) %>% group_by(cpl_stat
    
  # ...and summarize with yearly aggregates for combinations of statuses.
  ) %>% summarize(
    avg_year_cendis_cpl = mean(los_years),
    n_year_cendis_cpl = n(),
    
    # Preserve the earlier groupings...
    year = first(year),
    census_discharge = first(census_discharge),
    
    # ...and their aggregates.
    avg_year = first(avg_year),
    n_year = first(n_year),
    avg_year_cendis = first(avg_year_cendis),
    n_year_cendis = first(n_year_cendis)
    
  # Reorganize the result in order of grouping.
  ) %>% select(
    year, census_discharge, cpl_stat,
    avg_year, n_year,
    avg_year_cendis, n_year_cendis,
    avg_year_cendis_cpl, n_year_cendis_cpl
  )

给定一个inp ut,就像你的例子一样

inp <- structure(
  list(
    unit = c(
      "065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit",
      "065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit",
      "065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit",
      "065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit",
      "065:Ward 65 Elder Rehab Medical Unit", "065:Ward 65 Elder Rehab Medical Unit"
    ),
    admission_date = structure(
      c(
        1329091200, 1349222400, 1253059200, 1403568000, 1419292800,
        1194393600, 1265241600, 1021420800,  777859200, 1357603200
      ),
      tzone = "UTC",
      class = c("POSIXct", "POSIXt")
    ),
    discharge_date  = c(".", ".", ".", ".", ".", ".", ".", ".", ".", "."),
    discharge_month = c(".", ".", ".", ".", ".", ".", ".", ".", ".", "."),
    disposition     = c(".", ".", ".", ".", ".", ".", ".", ".", ".", "."),
    los_days = c(1053, 820, 1933, 191, 9, 2612, 1792, 4614, 7433, 723),
    los_years = c(
      2.88493150684932, 2.24657534246575,  5.2958904109589, 0.523287671232877, 0.024657534246575,
      7.15616438356164, 4.90958904109589, 12.641095890411, 20.3643835616438,   1.98082191780822
    ),
    gender = c("Male", "Female", "Male", "Male", "Female", "Male", "Male", "Female", "Female", "Male"),
    legal_status = c(21, 32, 32, 32, 20, 21, 32, 32, 32, 32),
    cpl_stat = c("non", "non", "non", "non", "non", "non", "non", "non", "non", "non"),
    prin_diag = c(
      "Schizophrenia, Undifferentiated Type",   "Schizophrenia",
      "Schizophrenia",                          "Schizoaffective Disorder",
      "Schizoaffective disorder, Bipolar type", "Schizoaffective disorder, Bipolar type",
      "Schizophrenia, Paranoid Type",           "Schizophrenia, Undifferentiated Type",
      "Schizoaffective disorder, Bipolar type", "Schizophrenia, Paranoid Type"
    ),
    year = c(2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014),
    census_discharge = c("cen", "cen", "cen", "cen", "cen", "cen", "cen", "cen", "cen", "cen"),
    inp_res = c("inp", "inp", "inp", "inp", "inp", "inp", "inp", "inp", "inp", "inp")
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)

这将产生一个整洁的out put,其中**yearcensus_dischargecpl_stat的每个组合都有一行**。

# A tibble: 1 × 9
   year census_discharge cpl_stat avg_year n_year avg_year_cendis n_year_cendis avg_year_cendis_cpl n_year_cendis_cpl
  <dbl> <chr>            <chr>       <dbl>  <int>           <dbl>         <int>               <dbl>             <int>
1  2014 cen              non          5.80     10            5.80            10                5.80                10
    • 为分组的每个“级别”列出度量(avg_*n_*)。**例如,考虑avg_*度量:
  • avg_year冗余地表现为循环year的平均值;然后
  • avg_year_cendis冗余地出现为yearcensus_discharge状态的重复组合的平均值;最后
  • avg_year_cendis_cpl只出现一次,作为yearcensus_discharge状态和cpl_stat us的 * 唯一 * 组合的平均值。

注意事项

完全巧合的是,度量值(avg_*n_*)在所有级别的分组(_year_cendis_cpl)中都是相同的(5.8010)。这是因为样本数据(inp)只有一个分组的数据:2014是唯一的year"cen"是唯一的census_discharge"non"是唯一的cpl_stat。* * 对于 * 完整 * inp数据集,这些度量值会有所不同,因为它们应该有所不同。**

“宽”表

要实现所需的形式,即每个组合都有一个 * 列 *(而不是一个 * 行 *),必须应用进一步的步骤。首先加载附加资源

# Load the necessary packages...
library(tidyr)    # ...for reshaping data...
library(stringr)  # ...and for manipulating strings.

...然后应用此工作流来调整数据。

out %>%
  # Prefix columns with '.' to avoid clashes.
  rename_with(~paste0(".", .)
  
  # Pivot to get a measure column for every CPL status.
  ) %>% pivot_wider(
    # Prefix the columns with the CPL status...
    names_from = .cpl_stat,
    # ...and append the aggregate's label: '*' from '*_year_cendis_cpl'.
    names_glue = "{.cpl_stat}_{
      stringr::str_extract(.value,
        pattern = '^\\\\.?(.*?)(?:_year_cendis_cpl|$)',
        group = 1
      )
    }",
    
    # Target all aggregates of the form '*_year_cendis_cpl'.
    values_from = ends_with("_year_cendis_cpl"),
    
    # Represent any missing values as "blanks" (NAs).
    values_fill = NA
    
  # Pivot to get a measure column for every census/discharge status.
  ) %>% pivot_wider(
    # Prefix the columns with the census/discharge status...
    names_from = .census_discharge,
    # ...and append either the new measure's name or the aggregate's label.
    names_glue = "{
      # Get only the first three letters: 'cen' and 'dis'
      stringr::str_sub(.census_discharge, end = 3)
    }{
      # Label with '_all' where applicable.
      ifelse(str_starts(.value, '\\\\.'), '_all', '')
    }_{
      stringr::str_extract(.value,
        pattern = '^\\\\.?(.*?)(?:_year_cendis|$)',
        group = 1
      )
    }",
    
    # Target all new measures and all aggregates of the form '*_year_cendis'.
    values_from = !starts_with(".") | ends_with("_year_cendis"),
    
    # Represent any missing values as "blanks" (NAs).
    values_fill = NA
    
  # Clean up the names of columns left unpivoted.
  ) %>% rename_with(.cols = starts_with("."), ~paste0(
    # Prepend "total_" where appropriate.
    ifelse(str_ends(., "_year"), "total_", ""),
    str_extract(.,
      # Extract any aggregate's label: '*' from '.*_year'.
      pattern = '^\\.?(.*?)(?:_year|$)',
      group = 1
    )
    
  # Sort the measures alphabetically.
  )) %>% select(
    year, starts_with("total"), sort(names(.))
  )

如前所述,给定一个inp,这应该会产生以下输出:

# A tibble: 1 × 7
   year total_avg total_n cen_all_avg cen_all_n cen_non_avg cen_non_n
  <dbl>     <dbl>   <int>       <dbl>     <int>       <dbl>     <int>
1  2014      5.80      10        5.80        10        5.80        10
    • 请注意,census_discharge status和cpl_stat us的每个组合都显示一个单独的度量值列。**例如,对于非CPL的人口普查患者,*_avg标记为cen_non_avg*_all_avg还用于标记所有CPL状态的总数。

最后,请注意total_*列如何描述全年所有状态的度量。

注意事项

    • 只要正确标记每个度量值,无论引入多少度量值,这都应该有效。**如果要通过sd()引入标准差,可以在第一个工作流中将sd_year = sd(los_years)添加到avg_year = mean(los_years)的正下方;对于sd_year_cendissd_year_cendis_cpl等等。
    • 现在,当数据填充了更广泛的状态选择时,您可以期望结果扩展“更广泛”:您的 * full * inp数据集。**使用完整的数据集,我们希望看到更多列,如dis_cpl_avg
# A tibble: n × 15
   year total_avg total_n cen_all_avg cen_all_n cen_cpl_avg cen_cpl_n cen_non_avg cen_non_n dis_all_avg dis_all_n dis_cpl_avg dis_cpl_n dis_non_avg dis_non_n
  <dbl>     <dbl>   <int>       <dbl>     <int>       <dbl>     <int>       <dbl>     <int>       <dbl>     <int>       <dbl>     <int>       <dbl>     <int>

显然,你可以把它简化为平均值

# A tibble: n × 8
   year total_avg cen_all_avg cen_cpl_avg cen_non_avg dis_all_avg dis_cpl_avg dis_non_avg
  <dbl>     <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>

通过简单地从第一个工作流中省略n_* = ...行。你也可以在最后做同样的事情,比如... %>% select(!ends_with("_n"))。* * 我建议只在最后排除度量值,这样您就不会丢失任何信息:你可以选择你想要的东西**

相关问题