用其他分组行R的平均值替换NA

s4chpxco  于 2023-09-27  发布在  其他
关注(0)|答案(3)|浏览(118)

我需要用相同的citytimetr的平均值替换没有任何数据的NA值。
我给予一个可重复的例子:

city <- c(rep("PE", 12), rep("BN", 12), rep("CA", 12))  
time <- c(rep("T0",6 ), rep("T1", 6), rep("T0",6 ), rep("T1", 6), rep("T0",6), rep("T1", 6))
tr <- c(rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3),rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3))
values<- c(20, 15,32, 16,42,56, rep(NA, 6), 23, 10,14, 41,34,56, rep(NA, 6), 23, 11,14, 21,35,46, rep(NA, 6))
df <- data.frame(city, time, tr, values)

输出应该是在PE中T1-CN和T1-TN,我想分别用T0-CN和T1-CN的平均值替换NA,
我已经在这篇文章中检查了解决方案:Replace NA with grouped means in R?但这不是我真正需要的。我怎么能这么做呢?非常感谢你的帮助!

vshtjzan

vshtjzan1#

考虑ifelse条件中的ave,并使用within提供上下文,其中可以在没有 Dataframe $限定符的情况下引用列:

df <- within(
  df, {
    values <- ifelse(
      is.na(values), 
      ave(values, city, time, tr, FUN=\(x) mean(x, na.rm=TRUE)),
      values
    )
  }
)
gab6jxml

gab6jxml2#

这是你想要的吗?

library(dplyr)
df %>% 
  group_by(city, tr) %>% 
  mutate(city_tr_mean = mean(values, na.rm = T),
         new_val = coalesce(values, city_tr_mean),
         city_tr_mean = NULL)
    
# A tibble: 36 × 5
# Groups:   city, tr [6]
   city  time  tr    values new_val
   <chr> <chr> <chr>  <dbl>   <dbl>
 1 PE    T0    CN        20    20  
 2 PE    T0    CN        15    15  
 3 PE    T0    CN        32    32  
 4 PE    T0    TN        16    16  
 5 PE    T0    TN        42    42  
 6 PE    T0    TN        56    56  
 7 PE    T1    CN        NA    22.3
 8 PE    T1    CN        NA    22.3
 9 PE    T1    CN        NA    22.3
10 PE    T1    TN        NA    38  
11 PE    T1    TN        NA    38  
12 PE    T1    TN        NA    38  
13 BN    T0    CN        23    23  
14 BN    T0    CN        10    10  
15 BN    T0    CN        14    14  
16 BN    T0    TN        41    41  
17 BN    T0    TN        34    34  
18 BN    T0    TN        56    56  
19 BN    T1    CN        NA    15.7
20 BN    T1    CN        NA    15.7
21 BN    T1    CN        NA    15.7
22 BN    T1    TN        NA    43.7
23 BN    T1    TN        NA    43.7
24 BN    T1    TN        NA    43.7
25 CA    T0    CN        23    23  
26 CA    T0    CN        11    11  
27 CA    T0    CN        14    14  
28 CA    T0    TN        21    21  
29 CA    T0    TN        35    35  
30 CA    T0    TN        46    46  
31 CA    T1    CN        NA    16  
32 CA    T1    CN        NA    16  
33 CA    T1    CN        NA    16  
34 CA    T1    TN        NA    34  
35 CA    T1    TN        NA    34  
36 CA    T1    TN        NA    34
a0zr77ik

a0zr77ik3#

下面是一个tidyverse解决方案。
按城市和时间分组计算平均值,用以前的非缺失值填写NA,只保留新列。使用此结果绑定原始数据框。然后,每当值为NA时,用平均值替换(mutate)值。

city <- c(rep("PE", 12), rep("BN", 12), rep("CA", 12))
time <- c(rep("T0",6 ), rep("T1", 6), rep("T0",6 ), rep("T1", 6), rep("T0",6), rep("T1", 6))
tr <- c(rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3),rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3), rep("CN", 3), rep("TN", 3))
values<- c(20, 15,32, 16,42,56, rep(NA, 6), 23, 10,14, 41,34,56, rep(NA, 6), 23, 11,14, 21,35,46, rep(NA, 6))
df <- data.frame(city, time, tr, values)

suppressPackageStartupMessages({
  library(tidyverse)
})

df %>%
  bind_cols(
    df %>%
      group_by(city, time) %>%
      mutate(Mean = mean(values)) %>%
      ungroup() %>%
      select(Mean) %>%
      fill(Mean)
  ) %>%
  mutate(values = if_else(is.na(values), Mean, values)) %>%
  select(-Mean)
#>    city time tr   values
#> 1    PE   T0 CN 20.00000
#> 2    PE   T0 CN 15.00000
#> 3    PE   T0 CN 32.00000
#> 4    PE   T0 TN 16.00000
#> 5    PE   T0 TN 42.00000
#> 6    PE   T0 TN 56.00000
#> 7    PE   T1 CN 30.16667
#> 8    PE   T1 CN 30.16667
#> 9    PE   T1 CN 30.16667
#> 10   PE   T1 TN 30.16667
#> 11   PE   T1 TN 30.16667
#> 12   PE   T1 TN 30.16667
#> 13   BN   T0 CN 23.00000
#> 14   BN   T0 CN 10.00000
#> 15   BN   T0 CN 14.00000
#> 16   BN   T0 TN 41.00000
#> 17   BN   T0 TN 34.00000
#> 18   BN   T0 TN 56.00000
#> 19   BN   T1 CN 29.66667
#> 20   BN   T1 CN 29.66667
#> 21   BN   T1 CN 29.66667
#> 22   BN   T1 TN 29.66667
#> 23   BN   T1 TN 29.66667
#> 24   BN   T1 TN 29.66667
#> 25   CA   T0 CN 23.00000
#> 26   CA   T0 CN 11.00000
#> 27   CA   T0 CN 14.00000
#> 28   CA   T0 TN 21.00000
#> 29   CA   T0 TN 35.00000
#> 30   CA   T0 TN 46.00000
#> 31   CA   T1 CN 25.00000
#> 32   CA   T1 CN 25.00000
#> 33   CA   T1 CN 25.00000
#> 34   CA   T1 TN 25.00000
#> 35   CA   T1 TN 25.00000
#> 36   CA   T1 TN 25.00000

创建于2023-09-17带有reprex v2.0.2

相关问题