对于每个月(日期_M)在一个框架,我需要得到的总和金额,满足一个双重条件。我需要组_由扇区列,然后总和金额,如果满足以下条件:DATE_M上级或等于DATE_1中的值,并且DATE_M上级或等于DATE_中的值2.这不仅是当月的,而且是整个DF历史上的未付金额。
这里有一个小的可重复的例子:
df <- structure(list(
DATE_M = structure(c(16374, 16374, 16313, 16313,
16374, 16374, 16343, 16343, 16343, 16343, 16374, 16374, 16374,
16374, 16343, 16343, 16313, 16343, 16343, 16374, 16374, 16313,
16313, 16374, 16313, 16374, 16343, 16313, 16343, 16343, 16374,
16374, 16313, 16313, 16374, 16374, 16343, 16343, 16374, 16313,
16313, 16374, 16313, 16374, 16343, 16374, 16374, 16343, 16343,
16343, 16343, 16343, 16313, 16313, 16313, 16343, 16313, 16374,
16374, 16343, 16313, 16313, 16374, 16313, 16343, 16343, 16374,
16374, 16313, 16343, 16374, 16343, 16343, 16313, 16313, 16343,
16374, 16374, 16374, 16374, 16313, 16374, 16313, 16374, 16374,
16374, 16374, 16313, 16374, 16343, 16313, 16374, 16343, 16374,
16374, 16374, 16343, 16343, 16374, 16343, 16374, 16374, 16374,
16343, 16313, 16343, 16374, 16313, 16374, 16374, 16313, 16343,
16313, 16313, 16374, 16343, 16313, 16313, 16374, 16343, 16313,
16374, 16374, 16343, 16313, 16374, 16343, 16374), class = "Date"),
SECTOR = c("AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "AAA", "BBB", "AAA", "BBB", "AAA", "AAA", "AAA",
"AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "BBB", "AAA",
"AAA", "AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "BBB",
"AAA", "AAA", "AAA", "BBB", "BBB", "AAA", "BBB",
"AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "BBB",
"AAA", "AAA", "AAA", "AAA", "BBB", "AAA", "AAA", "AAA",
"AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "BBB", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "BBB",
"AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA",
"AAA", "AAA", "AAA", "AAA", "BBB"),
AMOUNT = c(2000, 2500,
900, 2500, 900, 2300, 1500, 1500, 5000, 9000, 1050, 2500,
3000, 3300, 3000, 2000, 2000, 5000, 2000, 1500, 1000, 600,
6000, 200, 1000, 2000, 2000, 1000, 3000, 500, 3300, 3000,
500, 4000, 6200, 1000, 3000, 1000, 7000, 1500, 1500, 2000,
2000, 5000, 1000, 2000, 1000, 1800, 4000, 2000, 800, 1500,
20000, 1500, 500, 23500, 1000, 1000, 2500, 500, 5000, 1000,
820, 2500, 500, 40, 3000, 550, 10000, 1000, 3500, 1500, 1000,
17000, 2500, 7000, 1200, 1000, 7000, 8500, 10000, 5000, 5500,
3000, 500, 6600, 1000, 1000, 1500, 800, 2500, 500, 1000,
3000, 4000, 600, 2000, 1800, 1100, 500, 1300, 1000, 1000,
250, 3850, 3000, 50, 2000, 3400, 500, 1500, 1700, 1600, 5000,
1500, 400, 1500, 1000, 2000, 2500, 8000, 700, 3000, 1200,
2200, 2000, 1900, 10000),
DATE_1 = structure(c(16371, 16353,
16287, 16294, 16374, 16365, 16329, 16321, 16321, 16324, 16367,
16360, 16373, 16353, 16343, 16332, 16287, 16316, 16328, 16345,
16367, 16302, 16311, 16352, 16303, 16351, 16321, 16300, 16331,
16337, 16370, 16363, 16310, 16300, 16367, 16352, 16321, 16317,
16363, 16308, 16302, 16367, 16293, 16346, 16332, 16359, 16371,
16335, 16339, 16324, 16323, 16331, 16303, 16293, 16286, 16316,
16286, 16356, 16372, 16335, 16287, 16288, 16371, 16300, 16332,
16332, 16359, 16350, 16303, 16338, 16370, 16342, 16338, 16303,
16311, 16328, 16344, 16352, 16349, 16356, 16303, 16346, 16289,
16372, 16349, 16344, 16366, 16311, 16359, 16336, 16294, 16356,
16338, 16358, 16367, 16359, 16338, 16325, 16365, 16339, 16359,
16370, 16350, 16321, 16286, 16323, 16374, 16311, 16345, 16356,
16290, 16322, 16310, 16300, 16352, 16317, 16289, 16301, 16373,
16317, 16294, 16366, 16344, 16325, 16303, 16344, 16329, 16356), class = c("IDate", "Date")),
DATE_2 = structure(c(16463,
16401, 16318, 16386, 16434, 16489, 16420, 16412, 16412, 16505,
16461, 16391, 16405, 16444, 16434, 16423, 16343, 16344, 16363,
16402, 16549, 16575, 16358, 16717, 16304, 16384, 16681, 16392,
16364, 16398, 16462, 16394, 16402, 16392, 16398, 16359, 16351,
16408, 16450, 16335, 16335, 16374, 16384, 16377, 16363, 16391,
16402, 16517, 16346, 16357, 16414, 16363, 16395, 16318, 16349,
16317, 16568, 16448, 16464, 16426, 16419, 16380, 16463, 16335,
16513, 16696, 16451, 16562, 16394, 16519, 16401, 16372, 16345,
16395, 16343, 16363, 16435, 16444, 16374, 16448, 16335, 16407,
16335, 16402, 16380, 16435, 16489, 16343, 16402, 16366, 16325,
16479, 16399, 16419, 16398, 16451, 16343, 16416, 16458, 16400,
16451, 16401, 16381, 16412, 16317, 16414, 16402, 16343, 16349,
16538, 16384, 16323, 16342, 16331, 16444, 16409, 16416, 16393,
16434, 16349, 16386, 16398, 16435, 16356, 16335, 16374, 16359,
16359), class = c("IDate", "Date"))),
row.names = c(NA, -128L),
class = c("data.table", "data.frame"))
字符串
这很容易用for循环来实现:
test_1 = tibble(SECTOR=NULL, SUM=NULL, DATE_M=as.Date(NULL))
for(DD in df$DATE_M %>% as.character %>% unique){
tmp <- df %>%
group_by(SECTOR) %>%
filter(DATE_1 <= DD & DATE_2 >= DD) %>%
summarize(SUM = sum(AMOUNT)) %>%
ungroup %>%
mutate(DATE_M=as.Date(DD))
test_1 <- bind_rows(test_1, tmp)
}
型
我相信这是正确的结果:
> test_1 %>% arrange(DATE_M)
# A tibble: 6 x 3
DATE_M SECTOR SUM
<date> <chr> <dbl>
1 2014-08-31 AAA 66150
2 2014-08-31 BBB 63000
3 2014-09-30 AAA 106290
4 2014-09-30 BBB 60800
5 2014-10-31 AAA 170860
6 2014-10-31 BBB 63500
型
我试着用dfur代替,主要是为了速度。真实的df是数百万行长,有更多的条件和for循环需要永远。
我尝试了一个非常直观的解决方案,但它给出了错误的结果。因为我们使用group_by(DATE_M)
,dqr首先应用一种过滤器,并忽略当前值不在DATE_M中的所有行。就像我们在前面的代码中通过DD
上的过滤器开始一样。这是我尝试的:
test_2 <- df %>%
group_by(DATE_M,SECTOR) %>%
summarize(SUM = sum(AMOUNT[which(DATE_1 <= DATE_M &
DATE_2 >= DATE_M)]))
型
结果是(我认为)false,具有较低的值,因为它错过了一些行:
> test_2 %>% arrange(DATE_M)
# A tibble: 6 x 3
# Groups: DATE_M [3]
DATE_M SECTOR SUM
<date> <chr> <dbl>
1 2014-08-31 AAA 66150
2 2014-08-31 BBB 63000
3 2014-09-30 AAA 62690
4 2014-09-30 BBB 13800
5 2014-10-31 AAA 113170
6 2014-10-31 BBB 7500
型
有没有人可以帮助我在不使用(慢)for循环的情况下实现这一点?
希望这个问题能有意义。谢谢。
1条答案
按热度按时间ehxuflar1#
group_by(DATE_M)
方法似乎确实没有达到你想要的效果,因为DATE_M
列似乎本质上独立于其他列?比如,如果开始和结束日期也满足DATE_M
的包含性,而DATE_M
不出现在同一行中,你仍然希望它们与其他符合该条件的行相加?有几种可能的方法可以做到这一点-我将在这里比较四个速度:
for_way()
)purrr::map
更整洁,但可能不会更快(map_way()
)expand_grid
以获取所有每行比较的完整架构(super_long_way()
)data.table
作为大 Dataframe 的更快替代方案(dt_way()
)我已经在一个模拟的100,000行数据集上完成了这一操作,将
DATE_M
拆分为一组单独的月份进行测试(每个月份中的一些步骤可能不是严格必要的,但已确保所有四个结果都是相同的):字符串
看起来
super_long_way()
和dt_way()
是最快的,data.table
作为一种使用大数据的方便方法胜出。有用的提示:我对
data.table
的语法不是很熟悉,但我倾向于使用dtplyr
来构建它。返回的Call
部分为您提供了tidyverse堆栈的data.table
翻译:型