我有一个嵌套的json,我可以使用下面的函数将其完全展平
# Flatten nested df
def flatten_df(nested_df):
    """Fully flatten a nested Spark DataFrame.

    Array columns are exploded with ``explode_outer`` (one output row per
    element; null/empty arrays are kept as a single null row) and struct
    columns are replaced by one column per field, named
    ``<struct>_<field>``. The two steps repeat until no column has an array
    or struct type. Columns of other types (including maps) are left as is.

    Args:
        nested_df: The DataFrame to flatten. ``F`` must refer to
            ``pyspark.sql.functions`` in the enclosing module.

    Returns:
        A DataFrame whose columns are neither arrays nor structs.

    Note:
        Exploding several array columns in the same row multiplies rows
        (cross product of their elements).
    """

    def quoted(name):
        # Backtick-quote an identifier so that names containing dots
        # (e.g. a field called "style.bold") are resolved as one name
        # instead of being parsed as a path into a struct.
        return '`' + name.replace('`', '``') + '`'

    while True:
        # Peel one level off every array column.
        array_cols = [name for name, dtype in nested_df.dtypes
                      if dtype.startswith('array')]
        for name in array_cols:
            nested_df = nested_df.withColumn(
                name, F.explode_outer(F.col(quoted(name))))

        dtypes = nested_df.dtypes
        struct_cols = [name for name, dtype in dtypes
                       if dtype.startswith('struct')]
        if not struct_cols:
            # Arrays of arrays are still arrays after one explode; keep
            # going until none are left rather than returning early.
            if any(dtype.startswith('array') for _, dtype in dtypes):
                continue
            return nested_df

        # Keep non-struct columns unchanged and promote every struct field
        # to a top-level column.
        kept = [F.col(quoted(name)) for name, dtype in dtypes
                if not dtype.startswith('struct')]
        promoted = [
            F.col(quoted(parent) + '.' + quoted(field)).alias(parent + '_' + field)
            for parent in struct_cols
            for field in nested_df.select(quoted(parent) + '.*').columns
        ]
        nested_df = nested_df.select(kept + promoted)
我想分解嵌套结构,但不想一路扁平化。我只希望展平到第一层,并保持后续嵌套结构的原样。
这是我使用的Dataframe的模式。
root
|-- module: array (nullable = true)
| |-- element: array (containsNull = true)
| | |-- element: struct (containsNull = true)
| | | |-- chart: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- header: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- alt: string (nullable = true)
| | | | | | | |-- assetId: string (nullable = true)
| | | | | | | |-- header: string (nullable = true)
| | | | | | | |-- height: string (nullable = true)
| | | | | | | |-- linkedid: string (nullable = true)
| | | | | | | |-- selected: boolean (nullable = true)
| | | | | | | |-- src: string (nullable = true)
| | | | | | | |-- styleCodes: string (nullable = true)
| | | | | | | |-- viewLarger: string (nullable = true)
| | | | | | | |-- width: string (nullable = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- row: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- alt: string (nullable = true)
| | | | | | | |-- label: string (nullable = true)
| | | | | | | |-- value: array (nullable = true)
| | | | | | | | |-- element: string (containsNull = true)
| | | |-- header: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- decorators: array (nullable = true)
| | | | | | |-- element: string (containsNull = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- value: string (nullable = true)
| | | |-- id: string (nullable = true)
| | | |-- image: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- alt: string (nullable = true)
| | | | | |-- assetId: string (nullable = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- originalSrc: string (nullable = true)
| | | | | |-- src: string (nullable = true)
| | | | | |-- styleCodes: string (nullable = true)
| | | | | |-- viewLarger: boolean (nullable = true)
| | | |-- paragraph: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- decorators: array (nullable = true)
| | | | | | |-- element: array (containsNull = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- style.bold: struct (nullable = true)
| | | | | | | | | |-- length: long (nullable = true)
| | | | | | | | | |-- offset: long (nullable = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- value: array (nullable = true)
| | | | | | |-- element: string (containsNull = true)
我最终想要得到的Dataframe是:
module_id | module_header | paragraph_id | paragraph_value | image_id | image_src| chart_id | chart_header | chart_row |
这里的'module_id'是module数组下的'id',module_header是module数组下的header数组,依此类推。我只需要展平到第一层就停止,而不是一直展平到最底层。
暂无答案!
目前还没有任何答案,快来回答吧!