R：对所有值使用group_by

628mspwn 于 2023-06-03 发布在其他

关注(0)|答案(5)|浏览(144)

我正在使用R编程语言。
我有以下数据集：

library(dplyr)

# Example data: 100 rows with a character `ethnicity` column ("a", "b", "c")
# and an integer `number_of_degrees` column (1L, 2L, 3L), rebuilt from dput().
df <- structure(list(ethnicity = c("c", "c", "c", "b", "c", "b", "b", 
"b", "c", "a", "b", "b", "a", "b", "c", "a", "c", "c", "a", "a", 
"a", "a", "c", "b", "c", "b", "a", "b", "c", "b", "a", "c", "c", 
"a", "c", "b", "a", "c", "a", "a", "b", "c", "c", "a", "c", "a", 
"c", "b", "a", "b", "a", "a", "c", "a", "b", "a", "a", "c", "a", 
"b", "a", "c", "a", "c", "b", "c", "b", "b", "c", "b", "b", "c", 
"c", "a", "b", "b", "a", "b", "a", "a", "b", "c", "c", "a", "b", 
"a", "b", "a", "c", "c", "b", "c", "a", "b", "b", "c", "b", "a", 
"c", "c"), number_of_degrees = c(3L, 2L, 2L, 3L, 1L, 1L, 3L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 
3L, 1L, 3L, 3L, 3L, 1L, 3L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 2L, 1L, 
2L, 1L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 2L, 2L, 1L, 3L, 2L, 1L, 3L, 
3L, 3L, 1L, 2L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 2L, 3L, 3L, 1L, 
3L, 2L, 1L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 3L, 
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 3L, 3L, 2L, 1L, 2L)), class = "data.frame", row.names = c(NA, 
-100L))

# For each number_of_degrees group, the share (in percent) of rows that
# belong to each of the three hard-coded ethnicity levels.
df %>%
    group_by(number_of_degrees) %>%
    summarise(
        percent_a = 100 * mean(ethnicity == "a"),
        percent_b = 100 * mean(ethnicity == "b"),
        percent_c = 100 * mean(ethnicity == "c")
    )

这将产生以下输出：

# A tibble: 3 x 4
  number_of_degrees percent_a percent_b percent_c
              <int>     <dbl>     <dbl>     <dbl>
1                 1      33.3      36.7      30  
2                 2      31.6      21.1      47.4
3                 3      34.4      40.6      25

**我的问题：**是否有一种更“紧凑”的方式来编写此代码，这样我就不必手动编写“percent_a”、“percent_b”等？这样写起来会更快，并且能自动适用于 ethnicity 的所有取值。

来源：https://stackoverflow.com/questions/76392631/r-using-group-by-for-all-values

5条答案

按热度按时间

llycmphe1#

下面是一行代码，用于使用这些数据生成一个表。未使用任何包。

# Row percentages: ethnicity shares within each number_of_degrees (base R only).
100 * proportions(
  table(df[c("number_of_degrees", "ethnicity")]),
  margin = 1
)

给出：

ethnicity
number_of_degrees        a        b        c
                1 33.33333 36.66667 30.00000
                2 31.57895 21.05263 47.36842
                3 34.37500 40.62500 25.00000

您可以考虑添加边际合计（margins），以清楚地表明每一行之和为100。

# Same row-percentage table, with a Sum column appended along dimension 2.
addmargins(
  100 * proportions(
    table(df[c("number_of_degrees", "ethnicity")]),
    margin = 1
  ),
  margin = 2
)

给出：

ethnicity
number_of_degrees         a         b         c       Sum
                1  33.33333  36.66667  30.00000 100.00000
                2  31.57895  21.05263  47.36842 100.00000
                3  34.37500  40.62500  25.00000 100.00000

赞(0）回复(0）举报 2023-06-03

yvgpqqbh2#

也许您可以尝试这个 base R 方案（列名可能与期望的输出略有不同）

# Console transcript: for each number_of_degrees, tabulate the remaining
# column (ethnicity) and convert the counts to proportions (0-1, not percent).
> aggregate(. ~ number_of_degrees, df, \(x) proportions(table(x)))
  number_of_degrees ethnicity.a ethnicity.b ethnicity.c
1                 1   0.3333333   0.3666667   0.3000000
2                 2   0.3157895   0.2105263   0.4736842
3                 3   0.3437500   0.4062500   0.2500000

或

# Proportions of the ethnicity x number_of_degrees table taken over its
# second dimension, then reshaped wide: one Freq.* column per ethnicity.
reshape(
    data = as.data.frame(proportions(table(df), margin = 2)),
    idvar = "number_of_degrees",
    timevar = "ethnicity",
    direction = "wide"
)

它给出了

number_of_degrees    Freq.a    Freq.b    Freq.c
1                 1 0.3333333 0.3666667 0.3000000
4                 2 0.3157895 0.2105263 0.4736842
7                 3 0.3437500 0.4062500 0.2500000

或者，使用dplyr的不太紧凑的选项（抱歉，我对tidyverse的了解有限）

# Row-wise proportions of ethnicity per number_of_degrees, returned as a
# data frame with number_of_degrees restored as an integer column.
table(rev(df)) %>%
    proportions(1) %>%
    as.data.frame.matrix() %>%
    # rownames_to_column() is exported by tibble, not dplyr, so it is
    # namespace-qualified to work when only dplyr is attached.
    tibble::rownames_to_column(var = "number_of_degrees") %>%
    mutate(number_of_degrees = as.integer(number_of_degrees))

它给出了

number_of_degrees         a         b         c
1                 1 0.3333333 0.3666667 0.3000000
2                 2 0.3157895 0.2105263 0.4736842
3                 3 0.3437500 0.4062500 0.2500000

赞(0）回复(0）举报 2023-06-03

odopli943#

这是一个选项：

# Percentage of each ethnicity within every number_of_degrees group:
# one row per group and one percent_<ethnicity> column per ethnicity value.
df |>
  summarise(n = n(), .by = c(number_of_degrees, ethnicity)) |>
  # Scale to 0-100 so the values agree with the percent_ column prefix.
  mutate(pct = 100 * n / sum(n), .by = number_of_degrees) |>
  select(-n) |>
  # pivot_wider() comes from tidyr, which this snippet does not attach,
  # so it is namespace-qualified.
  tidyr::pivot_wider(
    names_from = ethnicity,
    values_from = pct,
    names_prefix = "percent_",
    names_sort = TRUE
  )

赞(0）回复(0）举报 2023-06-03

bvn4nwqk4#

这里有一种方法：它并不更紧凑，但不需要硬编码 ethnicity 的取值。

suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
})

# Count each (ethnicity, number_of_degrees) pair, convert the counts to
# within-group percentages, then spread ethnicities into percent_* columns.
# The result stays grouped by number_of_degrees.
df %>%
  count(ethnicity, number_of_degrees) %>%
  group_by(number_of_degrees) %>%
  mutate(n = 100 * prop.table(n)) %>%
  pivot_wider(
    # Passed by name: positional id_cols is deprecated since tidyr 1.3.0.
    id_cols = number_of_degrees,
    names_from = ethnicity,
    names_glue = "percent_{ethnicity}",
    values_from = n
  )
#> # A tibble: 3 × 4
#> # Groups:   number_of_degrees [3]
#>   number_of_degrees percent_a percent_b percent_c
#>               <int>     <dbl>     <dbl>     <dbl>
#> 1                 1      33.3      36.7      30  
#> 2                 2      31.6      21.1      47.4
#> 3                 3      34.4      40.6      25

创建于2023-06-02使用reprex v2.0.2

赞(0）回复(0）举报 2023-06-03

gojuced75#

使用 mosaic::percs 计算百分比。也可以使用 counts(..., format = 'percent')。

# Percentages of ethnicity within each number_of_degrees via mosaic::percs();
# the trailing [-1] drops the first element (column) of the returned object.
# NOTE(review): what that dropped first column holds is not shown here — confirm.
mosaic::percs(ethnicity~number_of_degrees, df)[-1]

  number_of_degrees   perc_a   perc_b   perc_c
1                 1 33.33333 36.66667 30.00000
2                 2 31.57895 21.05263 47.36842
3                 3 34.37500 40.62500 25.00000

赞(0）回复(0）举报 2023-06-03

我来回答

R：对所有值使用group_by

5条答案

相关问题

热门标签

最新问答