使用R中的for循环生成多个分层频率表

3qpi33ja  于 2023-01-15  发布在  其他
关注(0)|答案(2)|浏览(142)

我正在尝试生成多个频率表,这些频率表由多个独立变量分层,我可以让它对一个变量和一个分层变量工作,但是我的for循环坏了。

library(tidyverse)
# Simulated survey data: 1000 respondents answering six 7-point items
# (var1..var6) plus three categorical stratification variables.
# No seed is set, so the values differ on every run.
df <- data.frame(
  # Six independent draws from 1:7, named var1..var6 (drawn in that order).
  setNames(
    replicate(6, sample(1:7, 1000, replace = TRUE), simplify = FALSE),
    paste0("var", 1:6)
  ),
  strat1 = sample(c("A", "B", "C"), 1000, replace = TRUE),
  strat2 = sample(c("X", "Y"), 1000, replace = TRUE),
  strat3 = sample(c("True", "False"), 1000, replace = TRUE)
)

这个例子适用于一个变量和一个分层变量。我想把这段代码转换成一个for循环:

# Frequency table for var1: count per response category and each category's
# share of all responses, in percent.
temp_df <- df %>% count(var1)
# Scale by 100 (not 10) so the column really holds percentages.
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100

# Add one raw-count column per strat1 category (A, B, C) by counting
# var1 x strat1 combinations and spreading strat1 across columns.
strat_df <- temp_df %>%
  left_join(
    df %>%
      count(var1, strat1) %>%
      pivot_wider(names_from = strat1, values_from = n),
    by = "var1"
  )

# For each strat1 category, add "<category>_pct": the share of that category
# within the response category (stratum count / total count of the row).
for (k in c("A", "B", "C")) {
  strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
}

我想要同样的输出,但是添加了其他两个分层变量的count和_pct列。
我试过使用下面的for循环,但是它只给每个变量一行,并且只为每个strat变量生成两列,而我期望的输出应当为分层变量中的每个类别各提供一个原始计数列和一个列百分比列。共有3个strat变量,其中两个各有两个类别,一个有三个类别(合计7个类别),因此我期望的输出将具有17列,包括“v#”、“n”和“percent”列。

# NOTE(review): this is the non-working attempt the question asks about; the
# code is left as posted and the comments below mark where it goes wrong.

# Create a list of the variables of interest
variables <- c("var1", "var2", "var3", "var4", "var5", "var6")

# Create a list of the stratification variables
strats <- c("strat1", "strat2", "strat3")

# Create a loop that runs through each variable
for(i in variables){

# Create a frequency table for the current variable
# NOTE(review): `i` is a character string, so `!! i` injects the string itself
# rather than a reference to the column; count() then appears to tally a
# single constant value, which matches the reported "one row per variable".
temp_df <- df %>% count(!! i)

# Add a column for the percent of responses within each response category
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100

# Add a column for the raw count for each category of the stratification variables
# NOTE(review): temp_df no longer contains the strat columns, and n() is the
# number of rows in the current group of temp_df. This creates one column
# named after each strat variable, not one count column per category.
for(j in strats){
  temp_df <- temp_df %>% group_by(!!i) %>% mutate( !!j := n() )
}

# Add a column for the percent of the stratification variable category within the response category
# Produces "<strat>_pct" from the columns created above, i.e. one extra column
# per strat variable (two columns per strat variable in total).
for(j in strats){
  temp_df[paste0(j, "_pct")] <- (temp_df[[j]] / temp_df$n) * 100
}
# Store the result under the name "<variable>_df" in the current environment.
assign(paste0(i,"_df"), temp_df)
}

我希望输出看起来像这样:

4nkexdtk

4nkexdtk1#

更新:
想出了一个输出我所需要的解决方案:

# Build one stratified frequency table per survey variable and store it as
# "<variable>_df" (e.g. var1_df) in the current environment.
for (i in variables) {
  # `i` is a column name held as a string; convert it to a symbol so it can be
  # unquoted with `!!` inside dplyr verbs.
  j <- sym(i)

  # Overall count per response category and its share of all responses.
  temp_df <- df %>% count(!!j)
  # Scale by 100 (not 10) so the column really holds percentages.
  temp_df$percent <- temp_df$n / sum(temp_df$n) * 100

  # Append one raw-count column per category of each stratification variable.
  strat_df <- temp_df %>%
    left_join(
      df %>%
        count(!!j, strat1) %>%
        pivot_wider(names_from = strat1, values_from = n),
      by = i
    ) %>%
    left_join(
      df %>%
        count(!!j, strat2) %>%
        pivot_wider(names_from = strat2, values_from = n),
      by = i
    ) %>%
    left_join(
      df %>%
        count(!!j, strat3) %>%
        pivot_wider(names_from = strat3, values_from = n),
      by = i
    )

  # For every stratum category add "<category>_pct": that category's share
  # within the response category (stratum count / total count of the row).
  for (k in c("A", "B", "C", "X", "Y", "True", "False")) {
    strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
  }

  assign(paste0(i, "_df"), strat_df)
}
n7taea2i

n7taea2i2#

由于循环变量是字符串,需要先将其转换为symbol再用`!!`取消引用(unquote),或者改用`across(all_of())`来按名称选择列:

for (i in variables) {
  # Overall frequency table for the current variable; `i` is a column name
  # held as a string, so it is selected with across(all_of()).
  temp_df <- count(df, across(all_of(i)))

  # Share of all responses falling in each response category, in percent.
  temp_df$percent <- temp_df$n / sum(temp_df$n) * 100

  # For each stratification variable in turn, count the variable x stratum
  # combinations, spread the strata across columns, and join the resulting
  # raw-count columns onto the frequency table.
  strat_df <- temp_df
  for (s in c("strat1", "strat2", "strat3")) {
    wide_counts <- df %>%
      group_by(across(all_of(c(i, s)))) %>%
      count(across(all_of(i))) %>%
      pivot_wider(names_from = all_of(s), values_from = n)
    strat_df <- left_join(strat_df, wide_counts, by = i)
  }

  # Percent of each stratum category within the response category.
  for (j in c("A", "B", "C", "X", "Y", "True", "False")) {
    strat_df[paste0(j, "_pct")] <- (strat_df[[j]] / temp_df$n) * 100
  }

  # Store the table as "<variable>_df" in the current environment.
  assign(paste0(i, "_df"), strat_df)
}
  • 输出
> var1_df
  var1   n percent  A  B  C  X  Y False True    A_pct    B_pct    C_pct    X_pct    Y_pct True_pct False_pct
1    1 121    12.1 36 42 43 59 62    63   58 29.75207 34.71074 35.53719 48.76033 51.23967 47.93388  52.06612
2    2 144    14.4 51 42 51 84 60    69   75 35.41667 29.16667 35.41667 58.33333 41.66667 52.08333  47.91667
3    3 147    14.7 41 39 67 60 87    73   74 27.89116 26.53061 45.57823 40.81633 59.18367 50.34014  49.65986
4    4 146    14.6 52 45 49 74 72    79   67 35.61644 30.82192 33.56164 50.68493 49.31507 45.89041  54.10959
5    5 165    16.5 51 57 57 86 79    76   89 30.90909 34.54545 34.54545 52.12121 47.87879 53.93939  46.06061
6    6 133    13.3 48 51 34 64 69    68   65 36.09023 38.34586 25.56391 48.12030 51.87970 48.87218  51.12782
7    7 144    14.4 53 44 47 67 77    73   71 36.80556 30.55556 32.63889 46.52778 53.47222 49.30556  50.69444
> var2_df
  var2   n percent  A  B  C  X  Y False True    A_pct    B_pct    C_pct    X_pct    Y_pct True_pct False_pct
1    1 152    15.2 51 53 48 79 73    70   82 33.55263 34.86842 31.57895 51.97368 48.02632 53.94737  46.05263
2    2 147    14.7 49 46 52 73 74    55   92 33.33333 31.29252 35.37415 49.65986 50.34014 62.58503  37.41497
3    3 142    14.2 46 45 51 72 70    79   63 32.39437 31.69014 35.91549 50.70423 49.29577 44.36620  55.63380
4    4 147    14.7 50 48 49 74 73    72   75 34.01361 32.65306 33.33333 50.34014 49.65986 51.02041  48.97959
5    5 128    12.8 45 43 40 59 69    72   56 35.15625 33.59375 31.25000 46.09375 53.90625 43.75000  56.25000
6    6 152    15.2 37 52 63 74 78    83   69 24.34211 34.21053 41.44737 48.68421 51.31579 45.39474  54.60526
7    7 132    13.2 54 33 45 63 69    70   62 40.90909 25.00000 34.09091 47.72727 52.27273 46.96970  53.03030

相关问题