将嵌套的JSON文件转换为pandas Dataframe

bq9c1y66  于 2023-11-20  发布在  其他
关注(0)|答案(1)|浏览(136)

我试图将嵌套的JSON文件与列表和字典转换为 Dataframe 。到目前为止,我已经写了这段代码:

  1. for result in rawData['results']:
  2. openfda_data = result.get('openfda', {})
  3. # Collect 'openfda' data
  4. openfda_entries = {
  5. key: [value for value in values] if isinstance(values, list) else [values]
  6. for key, values in openfda_data.items()
  7. }
  8. # Flatten the 'submissions' data
  9. for submission in result.get('submissions', []):
  10. flattened_entry = {
  11. 'application_number': result['application_number'],
  12. 'sponsor_name': result['sponsor_name'],
  13. 'submission_type': submission.get('submission_type', None),
  14. 'submission_number': submission.get('submission_number', None),
  15. 'submission_status': submission.get('submission_status', None),
  16. 'submission_status_date': submission.get('submission_status_date', None),
  17. 'submission_class_code': submission.get('submission_class_code', None),
  18. 'submission_class_code_description': submission.get('submission_class_code_description', None)
  19. }
  20. # Add 'openfda' data
  21. for key, values in openfda_entries.items():
  22. flattened_entry[f'openfda_{key}'] = values # Store all values in a list
  23. flattened_data.append(flattened_entry)
  24. # Flatten the 'products' data
  25. for product in result.get('products', []):
  26. flattened_product = {
  27. 'application_number': result['application_number'],
  28. 'sponsor_name': result['sponsor_name'],
  29. 'product_number': product.get('product_number', None),
  30. 'reference_drug': product.get('reference_drug', None),
  31. 'brand_name': product.get('brand_name', None),
  32. 'active_ingredient_name': product['active_ingredients'][0].get('name', None),
  33. 'active_ingredient_strength': product['active_ingredients'][0].get('strength', None),
  34. 'reference_standard': product.get('reference_standard', None),
  35. 'dosage_form': product.get('dosage_form', None),
  36. 'route': product.get('route', None),
  37. 'marketing_status': product.get('marketing_status', None)
  38. }
  39. flattened_data.append(flattened_product)

字符串
这段代码能够flaten大部分的数据,以重新命名
问题在于openfda值。来自openfda条目的值在结果 Dataframe 中的列表中,如下所示:
[“首选关节炎8小时”、“醋氨酚- APAP 8小时”、“醋氨酚”、"关节炎疼痛缓解剂“、”首选关节炎疼痛缓解剂“、”醋氨酚- APAP关节炎“、”关节炎疼痛缓解剂“]
我还想把这些值变平,这样这个值也在列表之外。

  1. "results": [
  2. {
  3. "submissions": [
  4. {
  5. "submission_type": "ORIG",
  6. "submission_number": "1",
  7. "submission_status": "AP",
  8. "submission_status_date": "20021021"
  9. },
  10. {
  11. "submission_type": "SUPPL",
  12. "submission_number": "6",
  13. "submission_status": "AP",
  14. "submission_status_date": "20110610",
  15. "submission_class_code": "LABELING",
  16. "submission_class_code_description": "Labeling"
  17. }
  18. ],
  19. "application_number": "ANDA076177",
  20. "sponsor_name": "DR REDDYS LABS SA",
  21. "openfda": {
  22. "application_number": [
  23. "ANDA076177"
  24. ],
  25. "brand_name": [
  26. "CAMILA"
  27. ],
  28. "generic_name": [
  29. "NORETHINDRONE"
  30. ],
  31. "manufacturer_name": [
  32. "Mayne Pharma Inc."
  33. ],
  34. "product_ndc": [
  35. "51862-884"
  36. ],
  37. "product_type": [
  38. "HUMAN PRESCRIPTION DRUG"
  39. ],
  40. "route": [
  41. "ORAL"
  42. ],
  43. "substance_name": [
  44. "NORETHINDRONE"
  45. ],
  46. "rxcui": [
  47. "198042",
  48. "748961",
  49. "748962"
  50. ],
  51. "spl_id": [
  52. "022c6de4-4589-4d4c-8ac4-977c7e3ef568"
  53. ],
  54. "spl_set_id": [
  55. "a786be85-49ba-4369-b510-7dccc10f7f18"
  56. ],
  57. "package_ndc": [
  58. "51862-884-01",
  59. "51862-884-03"
  60. ],
  61. "nui": [
  62. "M0447349",
  63. "N0000175602"
  64. ],
  65. "pharm_class_cs": [
  66. "Progesterone Congeners [CS]"
  67. ],
  68. "pharm_class_epc": [
  69. "Progestin [EPC]"
  70. ],
  71. "unii": [
  72. "T18F433X4S"
  73. ]
  74. },
  75. "products": [
  76. {
  77. "product_number": "001",
  78. "reference_drug": "No",
  79. "brand_name": "CAMILA",
  80. "active_ingredients": [
  81. {
  82. "name": "NORETHINDRONE",
  83. "strength": "0.35MG"
  84. }
  85. ],
  86. "reference_standard": "No",
  87. "dosage_form": "TABLET",
  88. "route": "ORAL-28",
  89. "marketing_status": "Prescription",
  90. "te_code": "AB1"
  91. }
  92. ]
  93. },
  94. {
  95. "submissions": [
  96. {
  97. "submission_type": "SUPPL",
  98. "submission_number": "37",
  99. "submission_status": "AP",
  100. "submission_status_date": "20200707",
  101. "review_priority": "STANDARD",
  102. "submission_class_code": "LABELING",
  103. "submission_class_code_description": "Labeling"
  104. },
  105. {
  106. "submission_type": "SUPPL",
  107. "submission_number": "31",
  108. "submission_status": "AP",
  109. "submission_status_date": "20130910",
  110. "review_priority": "STANDARD",
  111. "submission_class_code": "LABELING",
  112. "submission_class_code_description": "Labeling"
  113. },
  114. {
  115. "submission_type": "SUPPL",
  116. "submission_number": "13",
  117. "submission_status": "AP",
  118. "submission_status_date": "20080129",
  119. "submission_class_code": "LABELING",
  120. "submission_class_code_description": "Labeling"
  121. },
  122. {
  123. "submission_type": "SUPPL",
  124. "submission_number": "5",
  125. "submission_status": "AP",
  126. "submission_status_date": "20040817",
  127. "submission_class_code": "LABELING",
  128. "submission_class_code_description": "Labeling"
  129. },
  130. {
  131. "submission_type": "SUPPL",
  132. "submission_number": "10",
  133. "submission_status": "AP",
  134. "submission_status_date": "20070110",
  135. "submission_class_code": "LABELING",
  136. "submission_class_code_description": "Labeling"
  137. }
  138. ],
  139. "application_number": "ANDA076194",
  140. "sponsor_name": "WATSON LABS",
  141. "openfda": {
  142. "application_number": [
  143. "ANDA076194"
  144. ],
  145. "brand_name": [
  146. "LISINOPRIL AND HYDROCHLOROTHIAZIDE"
  147. ],
  148. "generic_name": [
  149. "LISINOPRIL AND HYDROCHLOROTHIAZIDE"
  150. ],
  151. "manufacturer_name": [
  152. "Actavis Pharma, Inc."
  153. ],
  154. "product_ndc": [
  155. "0591-0860",
  156. "0591-0861",
  157. "0591-0862"
  158. ],
  159. "product_type": [
  160. "HUMAN PRESCRIPTION DRUG"
  161. ],
  162. "route": [
  163. "ORAL"
  164. ],
  165. "substance_name": [
  166. "HYDROCHLOROTHIAZIDE",
  167. "LISINOPRIL"
  168. ],
  169. "rxcui": [
  170. "197885",
  171. "197886",
  172. "197887"
  173. ],
  174. "spl_id": [
  175. "ff5e5610-775f-48a9-a410-e8d8b547b958"
  176. ],
  177. "spl_set_id": [
  178. "ab5e9ae4-e575-4a63-99f3-2bde36bdd508"
  179. ],
  180. "package_ndc": [
  181. "0591-0860-01",
  182. "0591-0860-05",
  183. "0591-0861-01",
  184. "0591-0861-05",
  185. "0591-0862-01",
  186. "0591-0862-05"
  187. ],
  188. "nui": [
  189. "N0000175359",
  190. "N0000175419",
  191. "M0471776"
  192. ],
  193. "pharm_class_pe": [
  194. "Increased Diuresis [PE]"
  195. ],
  196. "pharm_class_epc": [
  197. "Thiazide Diuretic [EPC]"
  198. ],
  199. "pharm_class_cs": [
  200. "Thiazides [CS]"
  201. ],
  202. "unii": [
  203. "E7199S1YWR",
  204. "0J48LPH2TH"
  205. ]
  206. },
  207. "products": [
  208. {
  209. "product_number": "003",
  210. "reference_drug": "No",
  211. "brand_name": "LISINOPRIL AND HYDROCHLOROTHIAZIDE",
  212. "active_ingredients": [
  213. {
  214. "name": "HYDROCHLOROTHIAZIDE",
  215. "strength": "12.5MG"
  216. },
  217. {
  218. "name": "LISINOPRIL",
  219. "strength": "10MG"
  220. }
  221. ],
  222. "reference_standard": "No",
  223. "dosage_form": "TABLET",
  224. "route": "ORAL",
  225. "marketing_status": "Prescription",
  226. "te_code": "AB"
  227. },
  228. {
  229. "product_number": "001",
  230. "reference_drug": "No",
  231. "brand_name": "LISINOPRIL AND HYDROCHLOROTHIAZIDE",
  232. "active_ingredients": [
  233. {
  234. "name": "HYDROCHLOROTHIAZIDE",
  235. "strength": "12.5MG"
  236. },
  237. {
  238. "name": "LISINOPRIL",
  239. "strength": "20MG"
  240. }
  241. ],
  242. "reference_standard": "No",
  243. "dosage_form": "TABLET",
  244. "route": "ORAL",
  245. "marketing_status": "Prescription",
  246. "te_code": "AB"
  247. },
  248. {
  249. "product_number": "002",
  250. "reference_drug": "No",
  251. "brand_name": "LISINOPRIL AND HYDROCHLOROTHIAZIDE",
  252. "active_ingredients": [
  253. {
  254. "name": "HYDROCHLOROTHIAZIDE",
  255. "strength": "25MG"
  256. },
  257. {
  258. "name": "LISINOPRIL",
  259. "strength": "20MG"
  260. }
  261. ],
  262. "reference_standard": "No",
  263. "dosage_form": "TABLET",
  264. "route": "ORAL",
  265. "marketing_status": "Prescription",
  266. "te_code": "AB"
  267. }
  268. ]
  269. }
  270. ]


我有办法做到吗?

hmtdttj4

hmtdttj41#

我没有看到你提供的任何预期输出。但你可以使用这样的东西:

  1. df = pd.json_normalize(json_file["results"])
  2. for i in df.columns:
  3. df = df.explode(i)
  4. df = df.join(pd.json_normalize(df.pop("submissions"))).reset_index(drop=True)
  5. df = df.join(pd.json_normalize(df.pop("products"))).reset_index(drop=True)
  6. df = df.explode("active_ingredients")
  7. df = df.join(pd.json_normalize(df.pop("active_ingredients"))).reset_index(drop=True)
  8. df = df.drop_duplicates()
  9. len(df) # 3252

字符串

退出

  1. | | application_number | sponsor_name | openfda.application_number | openfda.brand_name | openfda.generic_name | openfda.manufacturer_name | openfda.product_ndc | openfda.product_type | openfda.route | openfda.substance_name | openfda.rxcui | openfda.spl_id | openfda.spl_set_id | openfda.package_ndc | openfda.nui | openfda.pharm_class_cs | openfda.pharm_class_epc | openfda.unii | openfda.pharm_class_pe | submission_type | submission_number | submission_status | submission_status_date | submission_class_code | submission_class_code_description | review_priority | product_number | reference_drug | brand_name | reference_standard | dosage_form | route | marketing_status | te_code | name | strength |
  2. |---:|:---------------------|:------------------|:-----------------------------|:---------------------|:-----------------------|:----------------------------|:----------------------|:------------------------|:----------------|:-------------------------|----------------:|:-------------------------------------|:-------------------------------------|:----------------------|:--------------|:----------------------------|:--------------------------|:---------------|-------------------------:|:------------------|--------------------:|:--------------------|-------------------------:|------------------------:|------------------------------------:|------------------:|-----------------:|:-----------------|:-------------|:---------------------|:--------------|:--------|:-------------------|:----------|:--------------|:-----------|
  3. | 0 | ANDA076177 | DR REDDYS LABS SA | ANDA076177 | CAMILA | NORETHINDRONE | Mayne Pharma Inc. | 51862-884 | HUMAN PRESCRIPTION DRUG | ORAL | NORETHINDRONE | 198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-01 | M0447349 | Progesterone Congeners [CS] | Progestin [EPC] | T18F433X4S | nan | ORIG | 1 | AP | 20021021 | nan | nan | nan | 001 | No | CAMILA | No | TABLET | ORAL-28 | Prescription | AB1 | NORETHINDRONE | 0.35MG |
  4. | 1 | ANDA076177 | DR REDDYS LABS SA | ANDA076177 | CAMILA | NORETHINDRONE | Mayne Pharma Inc. | 51862-884 | HUMAN PRESCRIPTION DRUG | ORAL | NORETHINDRONE | 198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-01 | N0000175602 | Progesterone Congeners [CS] | Progestin [EPC] | T18F433X4S | nan | ORIG | 1 | AP | 20021021 | nan | nan | nan | 001 | No | CAMILA | No | TABLET | ORAL-28 | Prescription | AB1 | NORETHINDRONE | 0.35MG |
  5. | 2 | ANDA076177 | DR REDDYS LABS SA | ANDA076177 | CAMILA | NORETHINDRONE | Mayne Pharma Inc. | 51862-884 | HUMAN PRESCRIPTION DRUG | ORAL | NORETHINDRONE | 198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-03 | M0447349 | Progesterone Congeners [CS] | Progestin [EPC] | T18F433X4S | nan | ORIG | 1 | AP | 20021021 | nan | nan | nan | 001 | No | CAMILA | No | TABLET | ORAL-28 | Prescription | AB1 | NORETHINDRONE | 0.35MG |
  6. | 3 | ANDA076177 | DR REDDYS LABS SA | ANDA076177 | CAMILA | NORETHINDRONE | Mayne Pharma Inc. | 51862-884 | HUMAN PRESCRIPTION DRUG | ORAL | NORETHINDRONE | 198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-03 | N0000175602 | Progesterone Congeners [CS] | Progestin [EPC] | T18F433X4S | nan | ORIG | 1 | AP | 20021021 | nan | nan | nan | 001 | No | CAMILA | No | TABLET | ORAL-28 | Prescription | AB1 | NORETHINDRONE | 0.35MG |
  7. | 4 | ANDA076177 | DR REDDYS LABS SA | ANDA076177 | CAMILA | NORETHINDRONE | Mayne Pharma Inc. | 51862-884 | HUMAN PRESCRIPTION DRUG | ORAL | NORETHINDRONE | 748961 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-01 | M0447349 | Progesterone Congeners [CS] | Progestin [EPC] | T18F433X4S | nan | ORIG | 1 | AP | 20021021 | nan | nan | nan | 001 | No | CAMILA | No | TABLET | ORAL-28 | Prescription | AB1 | NORETHINDRONE | 0.35MG |


如果数据文件太大,我们可以尝试拆分框架:

  1. import numpy as np
  2. df = pd.json_normalize(json_file["results"])
  3. splitted_dfs = np.array_split(df, 4)
  4. del df
  5. final_df = pd.DataFrame()
  6. for loop_df in splitted_dfs:
  7. for i in loop_df.columns:
  8. loop_df = loop_df.explode(i)
  9. loop_df = loop_df.join(pd.json_normalize(loop_df.pop("submissions"))).reset_index(drop=True)
  10. loop_df = loop_df.join(pd.json_normalize(loop_df.pop("products"))).reset_index(drop=True)
  11. loop_df = loop_df.explode("active_ingredients")
  12. loop_df = loop_df.join(pd.json_normalize(loop_df.pop("active_ingredients"))).reset_index(drop=True)
  13. final_df = pd.concat([final_df,loop_df])
  14. final_df = final_df.drop_duplicates()


或者我们可以把它添加到你写的代码中(我认为这是最快的方法):

  1. df = pd.DataFrame(flattened_data)
  2. final_df = pd.DataFrame()
  3. splitted_dfs = np.array_split(df, 4)
  4. del df
  5. for loop_df in splitted_dfs:
  6. for i in loop_df.columns:
  7. loop_df = loop_df.explode(i)
  8. final_df = pd.concat([final_df,loop_df])
  9. final_df = final_df.drop_duplicates()

展开查看全部

相关问题