R: compute rolling means of parameters using different windows at once

Asked by xkftehaa on 2023-02-06

Suppose I have a data frame like the one below:

          ID ParameterID Time                      value group_end_time      group_start_time
       <dbl>       <dbl> <dttm>                    <dbl> <dttm>              <dttm>             
 1         1           1 2022-01-01 10:05:00           1 2022-01-01 10:20:00 2022-01-01 10:05:00
 2         1           2 2022-01-01 10:05:00           1 2022-01-01 10:20:00 2022-01-01 09:50:00
 3         1           1 2022-01-01 10:10:00           2 2022-01-01 10:20:00 2022-01-01 10:05:00
 4         1           2 2022-01-01 10:10:00           2 2022-01-01 10:20:00 2022-01-01 09:50:00
 5         1           1 2022-01-01 10:15:00           3 2022-01-01 10:20:00 2022-01-01 10:05:00
 6         1           1 2022-01-01 10:20:00           4 2022-01-01 10:20:00 2022-01-01 10:05:00
 7         1           1 2022-01-01 10:25:00           5 2022-01-01 10:35:00 2022-01-01 10:20:00
 8         1           1 2022-01-01 10:30:00           6 2022-01-01 10:35:00 2022-01-01 10:20:00
 9         1           2 2022-01-01 10:30:00           3 2022-01-01 10:35:00 2022-01-01 10:05:00
10         1           2 2022-01-01 11:36:00           4 2022-01-01 11:50:00 2022-01-01 11:20:00

Now I want to compute a mean per ParameterID with the following meaning: for each group_end_time of a parameter, the mean of value should be taken over all observations of that ParameterID with Time >= group_start_time & Time < group_end_time. My working approach is to introduce a custom function inside summarise:

library(dplyr)

# For one (ID, ParameterID, group_end_time) group: take all observations of that
# parameter with Time in [group_start_time, group_end_time) and average their value.
aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
  full_data %>%
    filter(ID == id[[1]] & ParameterID == par_id[[1]] &
             Time < end_time[[1]] & Time >= start_time[[1]]) %>%
    group_by(ID, ParameterID) %>%
    summarise(mean = mean(value, na.rm = TRUE))
}

ret <- df %>%
  group_by(ID, ParameterID, group_end_time) %>%
  summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .))

The output looks like this:

ret
# A tibble: 5 × 4
# Groups:   ID, ParameterID [2]
        ID   ParameterID group_end_time       mean
      <dbl>       <dbl> <dttm>              <dbl>
1         1           1 2022-01-01 10:20:00   2  
2         1           2 2022-01-01 10:20:00   1.5
3         1           1 2022-01-01 10:35:00   5  
4         1           2 2022-01-01 10:35:00   2  
5         1           2 2022-01-01 11:50:00   4

Although this works, it is very slow on large data sets, so my approach is not practical. Do you have any ideas to speed it up?

e0bqpujr #1

Using df and SQL, the benchmark below shows it running about 20x faster. That may or may not carry over to larger data sets, but you can give it a try. Just copy and paste the Note at the end into a fresh R session, then copy and paste the code below.

library(dplyr)
library(microbenchmark)
library(sqldf)

aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
  ret <- full_data %>%
    filter(ID == id[[1]] & ParameterID == par_id[[1]] &
             Time < end_time[[1]] & Time >= start_time[[1]]) %>%
    group_by(ID, ParameterID) %>%
    summarise(mean = mean(value, na.rm = TRUE)
    )
  return(ret)
}

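# Self-join with a non-equi condition: each row of A picks up all rows of B that share
# its ID and ParameterID and whose Time lies in [A.group_start_time, A.group_end_time);
# avg(B.value) is then taken per (ID, ParameterID, group_end_time).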
SQL <- "select A.ID, A.ParameterID, A.[group_end_time], avg(B.value) mean
  from df A
  left join df B on A.ID = B.ID and 
     A.ParameterID = B.ParameterID and 
     B.Time >= A.[group_start_time] and B.Time < A.[group_end_time]
  group by A.ID, A.ParameterID, A.[group_end_time]
  order by A.[group_end_time], A.ParameterID"

microbenchmark(times = 10,
  sql = sqldf(SQL), 
  orig = df %>%
  group_by(ID, ParameterID, group_end_time) %>%
  summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .))
)
## Unit: milliseconds
##  expr       min        lq      mean     median        uq       max neval cld
##   sql   34.9131   35.3101   59.1878   36.72465   41.7658  199.0033    10  a 
##  orig 1237.8522 1249.5308 1328.3235 1293.59445 1352.1843 1665.4883    10   b
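
For reference (not part of the benchmark above), the same non-equi join can also be written directly in dplyr, assuming version 1.1.0 or later, which added inequality conditions in join_by(). This is only a sketch of the idea and is not benchmarked here:

library(dplyr)   # needs dplyr >= 1.1.0 for inequality conditions in join_by()

intervals <- df %>%
  distinct(ID, ParameterID, group_start_time, group_end_time)

intervals %>%
  left_join(
    df %>% select(ID, ParameterID, Time, value),
    by = join_by(ID, ParameterID,
                 group_start_time <= Time,   # left-hand names refer to intervals
                 group_end_time > Time),     # right-hand names refer to df
    relationship = "many-to-many"            # windows may overlap, so allow it
  ) %>%
  group_by(ID, ParameterID, group_end_time) %>%
  summarise(mean = mean(value, na.rm = TRUE), .groups = "drop")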

Note

df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    ParameterID = c(1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L), 
    Time = structure(c(1641049500, 1641049500, 1641049800, 1641049800, 
    1641050100, 1641050400, 1641050700, 1641051000, 1641051000, 
    1641054960), class = c("POSIXct", "POSIXt"), tzone = ""), 
    value = c(1L, 1L, 2L, 2L, 3L, 4L, 5L, 6L, 3L, 4L), group_end_time = structure(c(1641050400, 
    1641050400, 1641050400, 1641050400, 1641050400, 1641050400, 
    1641051300, 1641051300, 1641051300, 1641055800), class = c("POSIXct", 
    "POSIXt"), tzone = ""), group_start_time = structure(c(1641049500, 
    1641048600, 1641049500, 1641048600, 1641049500, 1641049500, 
    1641050400, 1641050400, 1641049500, 1641054000), class = c("POSIXct", 
    "POSIXt"), tzone = "")), row.names = c("1", "2", "3", "4", 
"5", "6", "7", "8", "9", "10"), class = "data.frame")

xhv8bpkk #2

We can do this with apply from base R.

library(dplyr)

# One row per distinct evaluation window of each (ID, ParameterID)
intervals_times <- df |>
  select(ID, ParameterID, group_start_time, group_end_time) |>
  dplyr::distinct()

# a is one row of intervals_times (coerced to a character vector by apply),
# x is the full data frame; return the mean value inside that window.
foo <- function(a, x) {
  y <- x[with(x,
              ID == a[1] &
              ParameterID == a[2] &
              Time >= a[3] &
              Time < a[4]), ]
  mean(y$value)
}

bind_cols(intervals_times, mean = apply(intervals_times, 1, foo, x = df))
#>   ID ParameterID group_start_time   group_end_time mean
#> 1  1           1 01/01/2022 10:05 01/01/2022 10:20  2.0
#> 2  1           1 01/01/2022 10:20 01/01/2022 10:35  5.0
#> 3  1           2 01/01/2022 09:50 01/01/2022 10:20  1.5
#> 4  1           2 01/01/2022 10:05 01/01/2022 10:35  2.0
#> 5  1           2 01/01/2022 11:20 01/01/2022 11:50  4.0
Data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    ParameterID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), 
    Time = c("01/01/2022 10:05", "01/01/2022 10:10", "01/01/2022 10:15", 
    "01/01/2022 10:20", "01/01/2022 10:25", "01/01/2022 10:30", 
    "01/01/2022 10:05", "01/01/2022 10:10", "01/01/2022 10:30", 
    "01/01/2022 11:36"), value = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 
    2L, 3L, 4L), group_start_time = c("01/01/2022 10:05", "01/01/2022 10:05", 
    "01/01/2022 10:05", "01/01/2022 10:05", "01/01/2022 10:20", 
    "01/01/2022 10:20", "01/01/2022 09:50", "01/01/2022 09:50", 
    "01/01/2022 10:05", "01/01/2022 11:20"), group_end_time = c("01/01/2022 10:20", 
    "01/01/2022 10:20", "01/01/2022 10:20", "01/01/2022 10:20", 
    "01/01/2022 10:35", "01/01/2022 10:35", "01/01/2022 10:20", 
    "01/01/2022 10:20", "01/01/2022 10:35", "01/01/2022 11:50"
    )), class = "data.frame", row.names = c(NA, -10L))

Comparing the original function with the apply approach:

aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
  ret <- full_data %>%
    filter(ID == id[[1]] & ParameterID == par_id[[1]] &
             Time < end_time[[1]] & Time >= start_time[[1]]) %>%
    group_by(ID, ParameterID) %>%
    summarise(mean = mean(value, na.rm = TRUE)
    )
  return(ret)
}
microbenchmark::microbenchmark(times = 10,
               op = df %>%
                 group_by(ID, ParameterID, group_end_time) %>%
                 summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .)),
               apply = df |> 
                 select(ID, ParameterID, group_start_time, group_end_time) |> 
                 dplyr::distinct() |> 
                 bind_cols(mean = apply(intervals_times, 1, foo, x = df ))
)
#> Unit: milliseconds
#>   expr      min       lq      mean    median      uq      max neval cld
#>     op 391.2939 392.4306 397.69170 393.18675 395.001 438.0414    10   b
#>  apply   4.1947   4.4113   4.85377   4.61445   4.792   7.4095    10  a
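
One caveat (my own note, not from the original answer): apply() coerces intervals_times to a character matrix, and the data above stores the timestamps as character strings in dd/mm/yyyy HH:MM format, so the >= / < comparisons in foo compare strings. That happens to work here because every row shares the same date, but it would break across dates. An assumed, unbenchmarked variant of the same idea parses the timestamps to POSIXct first and loops over row indices:

# Parse timestamps so the window comparisons are chronological
df2 <- df |>
  dplyr::mutate(dplyr::across(c(Time, group_start_time, group_end_time),
                              \(x) as.POSIXct(x, format = "%d/%m/%Y %H:%M")))

intervals2 <- dplyr::distinct(df2, ID, ParameterID, group_start_time, group_end_time)

# For each window, average the matching values; vapply keeps the result numeric
intervals2$mean <- vapply(seq_len(nrow(intervals2)), function(i) {
  iv <- intervals2[i, ]
  keep <- df2$ID == iv$ID & df2$ParameterID == iv$ParameterID &
    df2$Time >= iv$group_start_time & df2$Time < iv$group_end_time
  mean(df2$value[keep], na.rm = TRUE)
}, numeric(1))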
