我正在尝试创建一个表,该表将计算客户在一段时间内购物的月份中的支出权重。例如,以下客户(假数据)具有此支出配置文件:
/* Customer spend */
create or replace temp table ts_all_transactions
(
inferred_customer_id varchar(128)
,nw_date date
,spend number(21,2)
);
insert into ts_all_transactions
values
('52f5','2019-06-01',17.35)
,('52f5','2018-11-01',24.85)
,('52f5','2019-12-01',1.40)
,('52f5','2019-01-01',2.45)
,('52f5','2019-03-01',3.90)
,('52f5','2020-01-01',37.55)
,('52f5','2019-10-01',13.20)
,('52f5','2019-09-01',5.70)
;
然后创建一个日历,其中包含某个期间所在的月份以及权重:
-- Calculate weights for each period of the time series
-- Create a staging table
create or replace temp table period_dimension as
select abs(seq4()-12) as period,
dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
dateadd(month, 12, start_date) as end_date
from table(generator(rowcount => 12)) -- number of months after reference date in previous line
;
select * from period_dimension;
create or replace temp table my_date_dimension
(
my_date date not null
,year smallint not null
,month smallint not null
,month_name char(3) not null
,day_of_mon smallint not null
,day_of_week varchar(9) not null
,week_of_year smallint not null
,day_of_year smallint not null
)
as
with my_date as (
select
seq4(),
dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as my_date
from table(generator(rowcount=>23))
)
select my_date
,year(my_date)
,month(my_date)
,monthname(my_date)
,day(my_date)
,dayofweek(my_date)
,weekofyear(my_date)
,dayofyear(my_date)
from my_date
;
create or replace table weight_lookup as
select
a.period
,b.my_date
,rank() over (partition by a.period order by b.my_date) as weight
from period_dimension a
inner join my_date_dimension b
where b.my_date >= a.start_date
and b.my_date < a.end_date
order by 1,2
;
-- Create a staging table
create or replace temp table period_dimension2 as
select abs(seq4()-12) as period,
dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
last_day(dateadd(month, 11, start_date)) as end_date
from table(generator(rowcount => 12)) -- number of months after reference date in previous line
;
然后,使用上面的公式来计算一个基于客户在这段时间内购物的月份的平均花费,但是,我没有得到我期望的结果:
-- For each month of each period, group all together by period here so we have 12 periods
-- so each period represents 12 rolling months with period 12 being the oldest period
create or replace temp table ts_spend_time as
select
a.inferred_customer_id
,b.period
,max(a.nw_date) as max_mnth /* Month in period where most spend was made */
,sum(a.spend * b.weight) / 78 as avg_spend /* Sum of weights 12,11,10...1 to give 78 */
from ts_all_transactions a
inner join weight_lookup b on a.nw_date = b.my_date
inner join period_dimension2 c on b.my_date = c.start_date and b.period = c.period
where b.my_date >= c.start_date
and b.my_date <= c.end_date
group by 1,2
order by 1 desc, 2,3
;
我从上述代码得到的输出如下:
create or replace temp table ts_spend_time_wrong_out
(
inferred_customer_id varchar(128)
,period number(11)
,max_mnth date
,avg_spend number(38,8)
);
insert into ts_spend_time_wrong_out
values
('52f5',3,'2019-03-01',0.05000000)
,('52f5',5,'2019-01-01',0.03141026)
,('52f5',7,'2018-11-01',0.31858974)
;
我想得到这样的输出:
create or replace temp table ts_spend_time_should_be
(
inferred_customer_id varchar(128)
,period number(11)
,max_mnth date
,avg_spend number(38,8)
);
insert into ts_spend_time_should_be
values
('52f5',1,'01JAN2020',6.301923077)
,('52f5',2,'01JAN2020',7.266025641)
,('52f5',3,'01JAN2020',8.280128205)
,('52f5',4,'01JAN2020',9.294230769)
,('52f5',5,'01DEC2019',4.081410256)
,('52f5',6,'01OCT2019',4.412179487)
,('52f5',7,'01OCT2019',5.276923077)
,('52f5',8,'01SEP2019',3.941666667)
,('52f5',9,'01JUN2019',3.687179487)
,('52f5',10,'01JUN2019',4.309615385)
,('52f5',11,'01JUN2019',4.932051282)
,('52f5',12,'01MAR2019',2.662820513)
;
在正确的解决方案示例中,平均支出按时段计算如下:((17.352)+(5.75)+(13.206)+(1.48)+(37.55*9))/78
我如何解决这个问题?短暂性脑缺血发作
1条答案
按热度按时间cwdobuhd1#
首先你应该使用
row_number() over(order by seq4())
因为在一个seq()
你的问题已经解决了一半给予:
到目前为止,我们可以看到“加权平均”值的输入,可以通过以下方式完成:
它给出:
一开始是一样的,但有分歧,因为我认为你的约会周期是“错误的”
但下一点是你有这条线
但这并不像你说的那样。。它所做的是在聚合中查找最大日期值。
要做到这一点,您需要返回到monthly results sql并将第一个\u值()放入混合中,然后通过以下方式选择结果:
现在符合您的期望: