python 将两个 Dataframe 与两个条件列合并

cunj1qz1  于 2023-02-02  发布在  Python
关注(0)|答案(2)|浏览(125)

我有两个表(A和B),我想从表B中相同ID的同一财政年度的最新data_date中获取每个ID在每个财政年度中的最新data_date的值。我在第三个表中有结果。我正在尝试用python编写此代码
表A

ID          data_date   fiscal_year
A           2016-03-31  2016          
A           2016-03-31  2016      
A           2018-09-31  2018            
B           2017-06-30  2017         
B           2017-09-30  2017     
B           2018-06-30  2018     
C           2013-03-31  2013

表B

ID          data_date   Value
A           2015-12-31     1   
A           2016-12-31     4  
A           2018-03-30     85           
B           2015-12-31     7  
B           2016-12-31     14   
B           2017-12-31     12 
C           2013-03-30     45 
C           2013-12-31     9  
C           2014-12-31     64 
C           2015-12-31     25

结果

ID          data_date   fiscal_year Value
A           2016-03-31  2016          4
A           2016-03-31  2016          4
A           2018-09-31  2018          85 [85 is the value of the latest date of B in 2018
B           2017-06-30  2017          NA [It is not the latest data date in 2017 for B]
B           2017-09-30  2017          12
B           2018-06-30  2018          NA [There is no value for B in 2018]  
C           2013-03-31  2013          9
bnlyeluc

bnlyeluc1#

IIUC,使用带有forward * 方向 * 的merge_asof
假设(df1)和(df2)是您的两个表/ Dataframe ,您可以使用以下命令:

df1["data_date"] = pd.to_datetime(df1["data_date"])
df2["data_date"] = pd.to_datetime(df2["data_date"])
df1 = df_A.sort_values(by="data_date")
df2 = df_B.sort_values(by="data_date")
​
out = (pd.merge_asof(df1, df2, on="data_date", by="ID",
                    allow_exact_matches=False, direction="forward")
           .sort_values(by="ID", ignore_index=True))

输出:

print(out)
​
  ID  data_date  fiscal_year  Value
0  A 2016-03-31         2016    4.0
1  A 2016-03-31         2016    4.0
2  A 2018-09-30         2018    NaN
3  B 2017-06-30         2017   12.0
4  B 2017-09-30         2017   12.0
5  B 2018-06-30         2018    NaN
6  C 2013-03-31         2013    9.0
7xzttuei

7xzttuei2#

要获得相同的输出

1.如果数据使用字典

dataDictA = [
#ID          data_date   fiscal_year
{"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
{"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
{"ID": "A", "data_date": "2018-09-31", "fiscal_year": 2018},
{"ID": "B", "data_date": "2017-06-30", "fiscal_year": 2017},
{"ID": "B", "data_date": "2017-09-30", "fiscal_year": 2017},
{"ID": "B", "data_date": "2018-06-30", "fiscal_year": 2018},
{"ID": "C", "data_date": "2013-03-31", "fiscal_year": 2013},
]

dataDictB = [
#ID          data_date   Value
{"ID": "A", "data_date": "2015-12-31", "Value": 1},
{"ID": "A", "data_date": "2016-12-31", "Value": 4},
{"ID": "A", "data_date": "2018-03-30", "Value": 85},
{"ID": "B", "data_date": "2015-12-31", "Value": 7},
{"ID": "B", "data_date": "2016-12-31", "Value": 14},
{"ID": "B", "data_date": "2017-12-31", "Value": 12},
{"ID": "C", "data_date": "2013-03-30", "Value": 45},
{"ID": "C", "data_date": "2013-12-31", "Value": 9},
{"ID": "C", "data_date": "2014-12-31", "Value": 64},
{"ID": "C", "data_date": "2015-12-31", "Value": 25},
]

def mergeDict(a,b):
    res = []
    for listA in a:
        year = listA["fiscal_year"]# or listA["data_date"].split("-")[0]
        tick = dateTick(listA["data_date"])

        latest = 0
        latestVal = "NA"
        
        #is it the latest date?
        previous = False
        for listA2 in a:
            if listA["ID"]==listA2["ID"]:#same ID
                if listA2["fiscal_year"] == year:#same year
                    if tick<dateTick(listA2["data_date"]):#not the latest
                        previous = True
                        break
        if previous:
            res.append(listA)
            res[-1]["Value"] = "NA"
            res[-1]["Reason"] = f"-not the lastest data in {year} for {listA2['ID']}"
            continue#skip if not the latest
        
        #check latest in dataB
        for listB in b:
            if listA["ID"]==listB["ID"]:#same ID
                if int(listB["data_date"].split("-")[0]) == year:#same year
                    tickB = dateTick(listB["data_date"])
                    if latest<tick:#get the latest value (compared in seconds)
                        latest = tickB
                        latestVal = listB["Value"]
        
        res.append(listA)
        res[-1]["Value"] = latestVal
        if latestVal == "NA":
            res[-1]["Reason"] = f"-no value for {listA['ID']} in {year}"
    return res

for v in mergeDict(dataDictA,dataDictB):
    print(v)

输出:

{'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
{'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
{'ID': 'A', 'data_date': '2018-09-31', 'fiscal_year': 2018, 'Value': 85}
{'ID': 'B', 'data_date': '2017-06-30', 'fiscal_year': 2017, 'Value': 'NA', 'Reason': '-not the lastest data in 2017 for B'}
{'ID': 'B', 'data_date': '2017-09-30', 'fiscal_year': 2017, 'Value': 12}
{'ID': 'B', 'data_date': '2018-06-30', 'fiscal_year': 2018, 'Value': 'NA', 'Reason': '-no value for B in 2018'}
{'ID': 'C', 'data_date': '2013-03-31', 'fiscal_year': 2013, 'Value': 9}

2.如果数据使用数组

dataA = [
#ID          data_date   fiscal_year
["A", "2016-03-31", 2016],
["A", "2016-03-31", 2016],
["A", "2018-09-31", 2018],
["B", "2017-06-30", 2017],
["B", "2017-09-30", 2017],
["B", "2018-06-30", 2018],
["C", "2013-03-31", 2013],
]

dataB = [
#ID          data_date   Value
["A", "2015-12-31", 1],
["A", "2016-12-31", 4],
["A", "2018-03-30", 85],
["B", "2015-12-31", 7],
["B", "2016-12-31", 14],
["B", "2017-12-31", 12],
["C", "2013-03-30", 45],
["C", "2013-12-31", 9],
["C", "2014-12-31", 64],
["C", "2015-12-31", 25],
]

#date to seconds
def dateTick(date):
    tick = date.split("-")
    return int(tick[0])*31536000+int(tick[1])*2628000+int(tick[2])*86400

def merge(a,b):
    res = []
    for listA in a:
        year = listA[2]# or listA[1].split("-")[0]
        tick = dateTick(listA[1])

        latest = 0
        latestVal = ["NA",f"-no value for {listA[0]} in {year}"]
        
        #is it the latest date?
        previous = False
        for listA2 in a:
            if listA[0]==listA2[0]:#same ID
                if listA2[2] == year:#same year
                    if tick<dateTick(listA2[1]):#not the latest
                        previous = True
                        break
        if previous:
            res.append(listA+["NA",f"-not the lastest data in {year} for {listA2[0]}"])
            continue#skip if not the latest
        
        #check latest in dataB
        for listB in b:
            if listA[0]==listB[0]:#same ID
                if int(listB[1].split("-")[0]) == year:#same year
                    tickB = dateTick(listB[1])
                    if latest<tick:#get the latest value (compared in seconds)
                        latest = tickB
                        latestVal = [listB[2]]
        
        res.append(listA+latestVal)
    return res

for v in merge(dataA,dataB):
    print(v)

输出:

['A', '2016-03-31', 2016, 4]
['A', '2016-03-31', 2016, 4]
['A', '2018-09-31', 2018, 85]
['B', '2017-06-30', 2017, 'NA', '-not the lastest data in 2017 for B']
['B', '2017-09-30', 2017, 12]
['B', '2018-06-30', 2018, 'NA', '-no value for B in 2018']
['C', '2013-03-31', 2013, 9]

相关问题