如何根据startdate和enddate的条件合并两个 Dataframe ?

zysjyyx4  于 2023-07-31  发布在  其他
关注(0)|答案(4)|浏览(92)

我有这个DF1

library(dplyr)
library(tidyverse)

df1 = data.frame(ID = c(100,101,101,102,102,103,103,104,104,105,106),
                 x_line = c(1,1,2,1,2,1,2,1,2,1,1),
                 start_date = c('04/01/2018','05/01/2019','25/08/2021','08/03/2017','07/08/2018',
                                '09/04/2016','29/12/2018','04/08/2018','03/05/2022','04/01/2018','04/01/2018'),
                 end_date = c('04/05/2019','07/02/2020','27/09/2021','18/07/2018','17/10/2019',
                              '19/12/2018','22/12/2019','14/09/2021','26/12/2022','15/02/2020','24/08/2020') 
                 )

字符集
以及以下DF2

df2 = data.frame(ID = c(100,100,100,101,101,102,102,103,103,104,104,105,105,106,106,106),
                 product_name = c('AA','BB','CC','AA','CC','DD','EE','DD','FF',
                                  'AA','FF','DD','AA','CC','AA','BB'),
                 start_taken_date = c('04/05/2018','25/08/2018','27/09/2018','18/07/2019','25/11/2019',
                                      '29/01/2018','07/09/2018','14/09/2017','01/01/2019','15/02/2019','24/08/2020',
                                      '04/03/2019','04/08/2018',
                                      '05/05/2018','06/06/2019','08/09/2018'),
                 end_taken_date = c('01/05/2019','26/09/2018','25/03/2019','25/09/2019','02/01/2020',
                                    '19/06/2018','22/09/2019','16/01/2018','04/03/2019','25/06/2022','23/07/2022',
                                    '05/04/2019','05/09/2018',
                                    '29/03/2019','07/07/2019','04/05/2020'))


df3是合并df1和df2的结果

df3 = df2%>%left_join(df1,.by=ID)


现在我希望创建df4,并满足以下条件(问题是它没有给予我想要的输出)

df4 = df3%>%mutate(line_m = ifelse(start_taken_date >=start_date & end_taken_date <= end_date,
                                   x_line,NA))


所需的最终输出如下

ID     product_name start_taken_date end_taken_date x_line
1  100           AA       04/05/2018     01/05/2019      1
2  100           BB       25/08/2018     26/09/2018      1
3  100           CC       27/09/2018     25/03/2019      1
4  101           AA       18/07/2019     25/09/2019      1
5  101           CC       25/11/2019     02/01/2020      1
6  102           DD       29/01/2018     19/06/2018      1
7  102           EE       07/09/2018     22/09/2019      2
8  103           DD       14/09/2017     16/01/2018      1
9  103           FF       01/01/2019     04/03/2019      2
10 104           AA       15/02/2019     25/06/2022      1
11 104           FF       24/08/2020     23/07/2022      1
12 105           DD       04/03/2019     05/04/2019      1
13 105           AA       04/08/2018     05/09/2018      1
14 106           CC       05/05/2018     29/03/2019      1
15 106           AA       06/06/2019     07/07/2019      1
16 106           BB       08/09/2018     04/05/2020      1

rt4zxlrg

rt4zxlrg1#

dplyr解决方案是使用join_by执行间隔连接。您需要安装dplyr 1.1.0才能使用此功能。

library(dplyr)
df1 <- mutate(df1, across(contains("date"), lubridate::dmy))
df2 <- mutate(df2, across(contains("date"), lubridate::dmy))

df2 %>% 
  left_join(df1, by = join_by("ID", start_taken_date >= start_date, end_taken_date <= end_date))

#     ID product_name start_taken_date end_taken_date x_line
# 1  100           AA       2018-05-04     2019-05-01      1
# 2  100           BB       2018-08-25     2018-09-26      1
# 3  100           CC       2018-09-27     2019-03-25      1
# 4  101           AA       2019-07-18     2019-09-25      1
# 5  101           CC       2019-11-25     2020-01-02      1
# 6  102           DD       2018-01-29     2018-06-19      1
# 7  102           EE       2018-09-07     2019-09-22      2
# 8  103           DD       2017-09-14     2018-01-16      1
# 9  103           FF       2019-01-01     2019-03-04      2
# 10 104           AA       2019-02-15     2022-06-25     NA
# 11 104           FF       2020-08-24     2022-07-23     NA
# 12 105           DD       2019-03-04     2019-04-05      1
# 13 105           AA       2018-08-04     2018-09-05      1
# 14 106           CC       2018-05-05     2019-03-29      1
# 15 106           AA       2019-06-06     2019-07-07      1
# 16 106           BB       2018-09-08     2020-05-04      1

字符集

cmssoen2

cmssoen22#

主要问题是left_join函数没有考虑日期范围或x_line变量。当一个ID在df1中有多行时,这会导致问题,因为它匹配所有可能的组合,而不考虑日期范围。
请将date列转换为Date类型,因为目前它们是字符。这可能导致不正确的比较。你可以使用lubridate来进行日期转换。

# Make sure these libraries are installed
library(dplyr)
library(fuzzyjoin)
library(lubridate)

# Convert date columns from characters to Date
df1$start_date <- dmy(df1$start_date)
df1$end_date <- dmy(df1$end_date)
df2$start_taken_date <- dmy(df2$start_taken_date)
df2$end_taken_date <- dmy(df2$end_taken_date)

# Use fuzzyjoin to merge based on the condition of start_date and end_date
df4 <- df2 %>%
  fuzzy_left_join(df1, 
                  by = c("ID" = "ID", 
                         "start_taken_date" = "start_date", 
                         "end_taken_date" = "end_date"), 
                  match_fun = list(`==`, `>=`, `<=`)) %>%
  select(ID = ID.x, product_name, start_taken_date, end_taken_date, x_line)

print(df4)

字符集
match_fun参数是一个函数列表,这些函数应返回TRUE,以获取by参数中相应位置的匹配项。在本例中,我们检查ID是否相等,start_taken_date在start_date之后,end_taken_dateend_date之前。

lf5gs5x2

lf5gs5x23#

data.table

library(data.table)
lapply(list(df1, df2), setDT)
df1[df2, on = list(ID, start_date <= start_taken_date, end_date >= end_taken_date)]

     ID x_line start_date   end_date product_name
 1: 100      1 2018-05-04 2019-05-01           AA
 2: 100      1 2018-08-25 2018-09-26           BB
 3: 100      1 2018-09-27 2019-03-25           CC
 4: 101      1 2019-07-18 2019-09-25           AA
 5: 101      1 2019-11-25 2020-01-02           CC
 6: 102      1 2018-01-29 2018-06-19           DD
 7: 102      2 2018-09-07 2019-09-22           EE
 8: 103      1 2017-09-14 2018-01-16           DD
 9: 103      2 2019-01-01 2019-03-04           FF
10: 104     NA 2019-02-15 2022-06-25           AA
11: 104     NA 2020-08-24 2022-07-23           FF
12: 105      1 2019-03-04 2019-04-05           DD
13: 105      1 2018-08-04 2018-09-05           AA
14: 106      1 2018-05-05 2019-03-29           CC
15: 106      1 2019-06-06 2019-07-07           AA
16: 106      1 2018-09-08 2020-05-04           BB

字符集

hrirmatl

hrirmatl4#

**data.table**使用foverlaps的方法

library(data.table)

setDT(df1, key = c("ID", "start_date", "end_date"))
setDT(df2, key = c("ID", "start_taken_date", "end_taken_date"))

foverlaps(df2, df1, type = "within")

字符集

相关问题