python 将两个 Dataframe 与两个条件列合并

cunj1qz1  于 2023-02-02  发布在  Python
关注(0)|答案(2)|浏览(144)

我有两个表(A和B),我想从表B中相同ID的同一财政年度的最新data_date中获取每个ID在每个财政年度中的最新data_date的值。我在第三个表中有结果。我正在尝试用python编写此代码
表A

  1. ID data_date fiscal_year
  2. A 2016-03-31 2016
  3. A 2016-03-31 2016
  4. A 2018-09-31 2018
  5. B 2017-06-30 2017
  6. B 2017-09-30 2017
  7. B 2018-06-30 2018
  8. C 2013-03-31 2013

表B

  1. ID data_date Value
  2. A 2015-12-31 1
  3. A 2016-12-31 4
  4. A 2018-03-30 85
  5. B 2015-12-31 7
  6. B 2016-12-31 14
  7. B 2017-12-31 12
  8. C 2013-03-30 45
  9. C 2013-12-31 9
  10. C 2014-12-31 64
  11. C 2015-12-31 25

结果

  1. ID data_date fiscal_year Value
  2. A 2016-03-31 2016 4
  3. A 2016-03-31 2016 4
  4. A 2018-09-31 2018 85 [85 is the value of the latest date of B in 2018
  5. B 2017-06-30 2017 NA [It is not the latest data date in 2017 for B]
  6. B 2017-09-30 2017 12
  7. B 2018-06-30 2018 NA [There is no value for B in 2018]
  8. C 2013-03-31 2013 9
bnlyeluc

bnlyeluc1#

IIUC,使用带有forward * 方向 * 的merge_asof
假设(df1)和(df2)是您的两个表/ Dataframe ,您可以使用以下命令:

  1. df1["data_date"] = pd.to_datetime(df1["data_date"])
  2. df2["data_date"] = pd.to_datetime(df2["data_date"])
  3. df1 = df_A.sort_values(by="data_date")
  4. df2 = df_B.sort_values(by="data_date")
  5. out = (pd.merge_asof(df1, df2, on="data_date", by="ID",
  6. allow_exact_matches=False, direction="forward")
  7. .sort_values(by="ID", ignore_index=True))

输出:

  1. print(out)
  2. ID data_date fiscal_year Value
  3. 0 A 2016-03-31 2016 4.0
  4. 1 A 2016-03-31 2016 4.0
  5. 2 A 2018-09-30 2018 NaN
  6. 3 B 2017-06-30 2017 12.0
  7. 4 B 2017-09-30 2017 12.0
  8. 5 B 2018-06-30 2018 NaN
  9. 6 C 2013-03-31 2013 9.0
展开查看全部
7xzttuei

7xzttuei2#

要获得相同的输出

1.如果数据使用字典

  1. dataDictA = [
  2. #ID data_date fiscal_year
  3. {"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
  4. {"ID": "A", "data_date": "2016-03-31", "fiscal_year": 2016},
  5. {"ID": "A", "data_date": "2018-09-31", "fiscal_year": 2018},
  6. {"ID": "B", "data_date": "2017-06-30", "fiscal_year": 2017},
  7. {"ID": "B", "data_date": "2017-09-30", "fiscal_year": 2017},
  8. {"ID": "B", "data_date": "2018-06-30", "fiscal_year": 2018},
  9. {"ID": "C", "data_date": "2013-03-31", "fiscal_year": 2013},
  10. ]
  11. dataDictB = [
  12. #ID data_date Value
  13. {"ID": "A", "data_date": "2015-12-31", "Value": 1},
  14. {"ID": "A", "data_date": "2016-12-31", "Value": 4},
  15. {"ID": "A", "data_date": "2018-03-30", "Value": 85},
  16. {"ID": "B", "data_date": "2015-12-31", "Value": 7},
  17. {"ID": "B", "data_date": "2016-12-31", "Value": 14},
  18. {"ID": "B", "data_date": "2017-12-31", "Value": 12},
  19. {"ID": "C", "data_date": "2013-03-30", "Value": 45},
  20. {"ID": "C", "data_date": "2013-12-31", "Value": 9},
  21. {"ID": "C", "data_date": "2014-12-31", "Value": 64},
  22. {"ID": "C", "data_date": "2015-12-31", "Value": 25},
  23. ]
  24. def mergeDict(a,b):
  25. res = []
  26. for listA in a:
  27. year = listA["fiscal_year"]# or listA["data_date"].split("-")[0]
  28. tick = dateTick(listA["data_date"])
  29. latest = 0
  30. latestVal = "NA"
  31. #is it the latest date?
  32. previous = False
  33. for listA2 in a:
  34. if listA["ID"]==listA2["ID"]:#same ID
  35. if listA2["fiscal_year"] == year:#same year
  36. if tick<dateTick(listA2["data_date"]):#not the latest
  37. previous = True
  38. break
  39. if previous:
  40. res.append(listA)
  41. res[-1]["Value"] = "NA"
  42. res[-1]["Reason"] = f"-not the lastest data in {year} for {listA2['ID']}"
  43. continue#skip if not the latest
  44. #check latest in dataB
  45. for listB in b:
  46. if listA["ID"]==listB["ID"]:#same ID
  47. if int(listB["data_date"].split("-")[0]) == year:#same year
  48. tickB = dateTick(listB["data_date"])
  49. if latest<tick:#get the latest value (compared in seconds)
  50. latest = tickB
  51. latestVal = listB["Value"]
  52. res.append(listA)
  53. res[-1]["Value"] = latestVal
  54. if latestVal == "NA":
  55. res[-1]["Reason"] = f"-no value for {listA['ID']} in {year}"
  56. return res
  57. for v in mergeDict(dataDictA,dataDictB):
  58. print(v)

输出:

  1. {'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
  2. {'ID': 'A', 'data_date': '2016-03-31', 'fiscal_year': 2016, 'Value': 4}
  3. {'ID': 'A', 'data_date': '2018-09-31', 'fiscal_year': 2018, 'Value': 85}
  4. {'ID': 'B', 'data_date': '2017-06-30', 'fiscal_year': 2017, 'Value': 'NA', 'Reason': '-not the lastest data in 2017 for B'}
  5. {'ID': 'B', 'data_date': '2017-09-30', 'fiscal_year': 2017, 'Value': 12}
  6. {'ID': 'B', 'data_date': '2018-06-30', 'fiscal_year': 2018, 'Value': 'NA', 'Reason': '-no value for B in 2018'}
  7. {'ID': 'C', 'data_date': '2013-03-31', 'fiscal_year': 2013, 'Value': 9}

2.如果数据使用数组

  1. dataA = [
  2. #ID data_date fiscal_year
  3. ["A", "2016-03-31", 2016],
  4. ["A", "2016-03-31", 2016],
  5. ["A", "2018-09-31", 2018],
  6. ["B", "2017-06-30", 2017],
  7. ["B", "2017-09-30", 2017],
  8. ["B", "2018-06-30", 2018],
  9. ["C", "2013-03-31", 2013],
  10. ]
  11. dataB = [
  12. #ID data_date Value
  13. ["A", "2015-12-31", 1],
  14. ["A", "2016-12-31", 4],
  15. ["A", "2018-03-30", 85],
  16. ["B", "2015-12-31", 7],
  17. ["B", "2016-12-31", 14],
  18. ["B", "2017-12-31", 12],
  19. ["C", "2013-03-30", 45],
  20. ["C", "2013-12-31", 9],
  21. ["C", "2014-12-31", 64],
  22. ["C", "2015-12-31", 25],
  23. ]
  24. #date to seconds
  25. def dateTick(date):
  26. tick = date.split("-")
  27. return int(tick[0])*31536000+int(tick[1])*2628000+int(tick[2])*86400
  28. def merge(a,b):
  29. res = []
  30. for listA in a:
  31. year = listA[2]# or listA[1].split("-")[0]
  32. tick = dateTick(listA[1])
  33. latest = 0
  34. latestVal = ["NA",f"-no value for {listA[0]} in {year}"]
  35. #is it the latest date?
  36. previous = False
  37. for listA2 in a:
  38. if listA[0]==listA2[0]:#same ID
  39. if listA2[2] == year:#same year
  40. if tick<dateTick(listA2[1]):#not the latest
  41. previous = True
  42. break
  43. if previous:
  44. res.append(listA+["NA",f"-not the lastest data in {year} for {listA2[0]}"])
  45. continue#skip if not the latest
  46. #check latest in dataB
  47. for listB in b:
  48. if listA[0]==listB[0]:#same ID
  49. if int(listB[1].split("-")[0]) == year:#same year
  50. tickB = dateTick(listB[1])
  51. if latest<tick:#get the latest value (compared in seconds)
  52. latest = tickB
  53. latestVal = [listB[2]]
  54. res.append(listA+latestVal)
  55. return res
  56. for v in merge(dataA,dataB):
  57. print(v)

输出:

  1. ['A', '2016-03-31', 2016, 4]
  2. ['A', '2016-03-31', 2016, 4]
  3. ['A', '2018-09-31', 2018, 85]
  4. ['B', '2017-06-30', 2017, 'NA', '-not the lastest data in 2017 for B']
  5. ['B', '2017-09-30', 2017, 12]
  6. ['B', '2018-06-30', 2018, 'NA', '-no value for B in 2018']
  7. ['C', '2013-03-31', 2013, 9]
展开查看全部

相关问题