使用group_by和summary,同时仍考虑整个框架上的条件

ecfdbz9o  于 2024-01-03  发布在  其他
关注(0)|答案(1)|浏览(120)

对于每个月(日期_M)在一个框架,我需要得到的总和金额,满足一个双重条件。我需要组_由扇区列,然后总和金额,如果满足以下条件:DATE_M上级或等于DATE_1中的值,并且DATE_M上级或等于DATE_中的值2.这不仅是当月的,而且是整个DF历史上的未付金额。
这里有一个小的可重复的例子:

df <- structure(list(
  DATE_M = structure(c(16374, 16374, 16313, 16313, 
                       16374, 16374, 16343, 16343, 16343, 16343, 16374, 16374, 16374, 
                       16374, 16343, 16343, 16313, 16343, 16343, 16374, 16374, 16313, 
                       16313, 16374, 16313, 16374, 16343, 16313, 16343, 16343, 16374, 
                       16374, 16313, 16313, 16374, 16374, 16343, 16343, 16374, 16313, 
                       16313, 16374, 16313, 16374, 16343, 16374, 16374, 16343, 16343, 
                       16343, 16343, 16343, 16313, 16313, 16313, 16343, 16313, 16374, 
                       16374, 16343, 16313, 16313, 16374, 16313, 16343, 16343, 16374, 
                       16374, 16313, 16343, 16374, 16343, 16343, 16313, 16313, 16343, 
                       16374, 16374, 16374, 16374, 16313, 16374, 16313, 16374, 16374, 
                       16374, 16374, 16313, 16374, 16343, 16313, 16374, 16343, 16374, 
                       16374, 16374, 16343, 16343, 16374, 16343, 16374, 16374, 16374, 
                       16343, 16313, 16343, 16374, 16313, 16374, 16374, 16313, 16343, 
                       16313, 16313, 16374, 16343, 16313, 16313, 16374, 16343, 16313, 
                       16374, 16374, 16343, 16313, 16374, 16343, 16374), class = "Date"), 
  SECTOR = c("AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "BBB", "AAA", "BBB", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "BBB", "AAA", 
             "AAA", "AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "BBB", 
             "AAA", "AAA", "AAA", "BBB", "BBB", "AAA", "BBB", 
             "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "BBB", 
             "AAA", "AAA", "AAA", "AAA", "BBB", "AAA", "AAA", "AAA", 
             "AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "BBB", 
             "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", 
             "AAA", "AAA", "AAA", "AAA", "BBB"), 
  AMOUNT = c(2000, 2500, 
             900, 2500, 900, 2300, 1500, 1500, 5000, 9000, 1050, 2500, 
             3000, 3300, 3000, 2000, 2000, 5000, 2000, 1500, 1000, 600, 
             6000, 200, 1000, 2000, 2000, 1000, 3000, 500, 3300, 3000, 
             500, 4000, 6200, 1000, 3000, 1000, 7000, 1500, 1500, 2000, 
             2000, 5000, 1000, 2000, 1000, 1800, 4000, 2000, 800, 1500, 
             20000, 1500, 500, 23500, 1000, 1000, 2500, 500, 5000, 1000, 
             820, 2500, 500, 40, 3000, 550, 10000, 1000, 3500, 1500, 1000, 
             17000, 2500, 7000, 1200, 1000, 7000, 8500, 10000, 5000, 5500, 
             3000, 500, 6600, 1000, 1000, 1500, 800, 2500, 500, 1000, 
             3000, 4000, 600, 2000, 1800, 1100, 500, 1300, 1000, 1000, 
             250, 3850, 3000, 50, 2000, 3400, 500, 1500, 1700, 1600, 5000, 
             1500, 400, 1500, 1000, 2000, 2500, 8000, 700, 3000, 1200, 
             2200, 2000, 1900, 10000), 
  DATE_1 = structure(c(16371, 16353, 
                       16287, 16294, 16374, 16365, 16329, 16321, 16321, 16324, 16367, 
                       16360, 16373, 16353, 16343, 16332, 16287, 16316, 16328, 16345, 
                       16367, 16302, 16311, 16352, 16303, 16351, 16321, 16300, 16331, 
                       16337, 16370, 16363, 16310, 16300, 16367, 16352, 16321, 16317, 
                       16363, 16308, 16302, 16367, 16293, 16346, 16332, 16359, 16371, 
                       16335, 16339, 16324, 16323, 16331, 16303, 16293, 16286, 16316, 
                       16286, 16356, 16372, 16335, 16287, 16288, 16371, 16300, 16332, 
                       16332, 16359, 16350, 16303, 16338, 16370, 16342, 16338, 16303, 
                       16311, 16328, 16344, 16352, 16349, 16356, 16303, 16346, 16289, 
                       16372, 16349, 16344, 16366, 16311, 16359, 16336, 16294, 16356, 
                       16338, 16358, 16367, 16359, 16338, 16325, 16365, 16339, 16359, 
                       16370, 16350, 16321, 16286, 16323, 16374, 16311, 16345, 16356, 
                       16290, 16322, 16310, 16300, 16352, 16317, 16289, 16301, 16373, 
                       16317, 16294, 16366, 16344, 16325, 16303, 16344, 16329, 16356), class = c("IDate", "Date")), 
  DATE_2 = structure(c(16463, 
                       16401, 16318, 16386, 16434, 16489, 16420, 16412, 16412, 16505, 
                       16461, 16391, 16405, 16444, 16434, 16423, 16343, 16344, 16363, 
                       16402, 16549, 16575, 16358, 16717, 16304, 16384, 16681, 16392, 
                       16364, 16398, 16462, 16394, 16402, 16392, 16398, 16359, 16351, 
                       16408, 16450, 16335, 16335, 16374, 16384, 16377, 16363, 16391, 
                       16402, 16517, 16346, 16357, 16414, 16363, 16395, 16318, 16349, 
                       16317, 16568, 16448, 16464, 16426, 16419, 16380, 16463, 16335, 
                       16513, 16696, 16451, 16562, 16394, 16519, 16401, 16372, 16345, 
                       16395, 16343, 16363, 16435, 16444, 16374, 16448, 16335, 16407, 
                       16335, 16402, 16380, 16435, 16489, 16343, 16402, 16366, 16325, 
                       16479, 16399, 16419, 16398, 16451, 16343, 16416, 16458, 16400, 
                       16451, 16401, 16381, 16412, 16317, 16414, 16402, 16343, 16349, 
                       16538, 16384, 16323, 16342, 16331, 16444, 16409, 16416, 16393, 
                       16434, 16349, 16386, 16398, 16435, 16356, 16335, 16374, 16359, 
                       16359), class = c("IDate", "Date"))), 
  row.names = c(NA, -128L), 
  class = c("data.table", "data.frame"))

字符串
这很容易用for循环来实现:

test_1 = tibble(SECTOR=NULL, SUM=NULL, DATE_M=as.Date(NULL))

for(DD in df$DATE_M %>% as.character %>% unique){

  tmp <- df %>% 
    group_by(SECTOR) %>% 
    filter(DATE_1 <= DD & DATE_2 >= DD) %>% 
    summarize(SUM = sum(AMOUNT)) %>% 
    ungroup %>% 
    mutate(DATE_M=as.Date(DD))

  test_1 <- bind_rows(test_1, tmp)
}


我相信这是正确的结果:

> test_1 %>% arrange(DATE_M)
# A tibble: 6 x 3
  DATE_M     SECTOR    SUM
  <date>     <chr>   <dbl>
1 2014-08-31 AAA     66150
2 2014-08-31 BBB     63000
3 2014-09-30 AAA    106290
4 2014-09-30 BBB     60800
5 2014-10-31 AAA    170860
6 2014-10-31 BBB     63500


我试着用dfur代替,主要是为了速度。真实的df是数百万行长,有更多的条件和for循环需要永远。
我尝试了一个非常直观的解决方案,但它给出了错误的结果。因为我们使用group_by(DATE_M),dqr首先应用一种过滤器,并忽略当前值不在DATE_M中的所有行。就像我们在前面的代码中通过DD上的过滤器开始一样。这是我尝试的:

test_2 <- df %>% 
  group_by(DATE_M,SECTOR) %>% 
  summarize(SUM = sum(AMOUNT[which(DATE_1 <= DATE_M & 
                                     DATE_2 >= DATE_M)]))


结果是(我认为)false,具有较低的值,因为它错过了一些行:

> test_2 %>% arrange(DATE_M)
# A tibble: 6 x 3
# Groups:   DATE_M [3]
  DATE_M     SECTOR    SUM
  <date>     <chr>   <dbl>
1 2014-08-31 AAA     66150
2 2014-08-31 BBB     63000
3 2014-09-30 AAA     62690
4 2014-09-30 BBB     13800
5 2014-10-31 AAA    113170
6 2014-10-31 BBB      7500


有没有人可以帮助我在不使用(慢)for循环的情况下实现这一点?
希望这个问题能有意义。谢谢。

ehxuflar

ehxuflar1#

group_by(DATE_M)方法似乎确实没有达到你想要的效果,因为DATE_M列似乎本质上独立于其他列?比如,如果开始和结束日期也满足DATE_M的包含性,而DATE_M不出现在同一行中,你仍然希望它们与其他符合该条件的行相加?
有几种可能的方法可以做到这一点-我将在这里比较四个速度:

  • for循环(for_way()
  • purrr::map更整洁,但可能不会更快(map_way()
  • expand_grid以获取所有每行比较的完整架构(super_long_way()
  • data.table作为大 Dataframe 的更快替代方案(dt_way()

我已经在一个模拟的100,000行数据集上完成了这一操作,将DATE_M拆分为一组单独的月份进行测试(每个月份中的一些步骤可能不是严格必要的,但已确保所有四个结果都是相同的):

library(tidyverse)

df <- tibble(
  SECTOR = sample(c("AAA", "BBB"), 100000, replace = TRUE),
  AMOUNT = sample(1000:9000, 100000, replace = TRUE),
  DATE_1 = sample(ymd("2014-01-01"):ymd("2014-12-31"), 100000, replace = TRUE) |> as.Date(),
  DATE_2 = DATE_1 + sample(20:60, 100000, replace = TRUE)
)

DATE_M <- ymd("2014-01-01") + months(0:11)

for_way <-
  function() {
    test_1 = tibble(SECTOR = NULL,
                    SUM = NULL,
                    DATE_M = as.Date(NULL))
    
    
    for (DD in DATE_M) {
      tmp <- df %>%
        group_by(SECTOR) %>%
        filter(DATE_1 <= DD & DATE_2 >= DD) %>%
        summarize(SUM = sum(AMOUNT)) %>%
        mutate(DATE_M = as.Date(DD))
      
      test_1 <- bind_rows(test_1, tmp)
    }
    
    test_1
  }

map_way <- function() {
  DATE_M %>% 
    map(\(DD) {
      df %>%
        group_by(SECTOR) %>%
        filter(DATE_1 <= DD & DATE_2 >= DD) %>%
        summarize(SUM = sum(AMOUNT)) %>%
        mutate(DATE_M = as.Date(DD))
    }) |> 
    reduce(bind_rows) %>%
    select(DATE_M, SECTOR, SUM)
  
}

super_long_way <- function() {
  
  tibble(DATE_M) |>
    expand_grid(df) |>
    group_by(DATE_M, SECTOR) %>%
    summarize(SUM = sum(AMOUNT[which(DATE_1 <= DATE_M &
                                       DATE_2 >= DATE_M)]), .groups = "drop")
  
}

library(data.table)

dt <- data.table(df)

dt_way <- function() {
  
  lapply(DATE_M, \(DD){
    dt[dt[, .I[DATE_1 <= DD & DATE_2 >= DD], by = .(SECTOR)]$V1, 
           .(SUM = sum(AMOUNT)), keyby = .(SECTOR)][, `:=`(DATE_M = as.Date(..DD))]
  }) %>%
    rbindlist() %>%
    as_tibble() %>%
    select(DATE_M, SECTOR, SUM)
  
}


# All give the same results:
# for_way()
# map_way()
# super_long_way()
# dt_way()

library(microbenchmark)

microbenchmark(for_way(), map_way(), super_long_way(), dt_way(), times = 100, check = "equivalent")
#> Unit: milliseconds
#>              expr      min       lq      mean   median       uq      max neval
#>         for_way() 160.3486 178.0511 202.83757 192.5698 214.9194 313.6742   100
#>         map_way() 165.5709 181.3580 206.39011 194.2887 225.6654 314.0634   100
#>  super_long_way()  74.3093  94.2489 138.58590 152.7153 164.2177 234.6803   100
#>          dt_way()  64.3447  74.6397  87.02955  81.6293  92.8857 163.1749   100
#>  cld
#>  a  
#>  a  
#>   b 
#>    c

字符串
看起来super_long_way()dt_way()是最快的,data.table作为一种使用大数据的方便方法胜出。
有用的提示:我对data.table的语法不是很熟悉,但我倾向于使用dtplyr来构建它。返回的Call部分为您提供了tidyverse堆栈的data.table翻译:

library(dtplyr)

DD <- DATE_M[1]

df |> 
  lazy_dt() |> 
  group_by(SECTOR) %>%
  filter(DATE_1 <= DD & DATE_2 >= DD) %>%
  summarize(SUM = sum(AMOUNT)) %>%
  mutate(DATE_M = as.Date(DD))
#> Source: local data table [2 x 3]
#> Call:   `_DT1`[`_DT1`[, .I[DATE_1 <= DD & DATE_2 >= DD], by = .(SECTOR)]$V1, 
#>     .(SUM = sum(AMOUNT)), keyby = .(SECTOR)][, `:=`(DATE_M = as.Date(..DD))]
#> 
#>   SECTOR    SUM DATE_M    
#>   <chr>   <int> <date>    
#> 1 AAA    584957 2014-01-01
#> 2 BBB    581739 2014-01-01
#> 
#> # Use as.data.table()/as.data.frame()/as_tibble() to access results

相关问题