在pyspark中删除数组内的列元素

k4ymrczo  于 2023-03-01  发布在  Spark
关注(0)|答案(1)|浏览(209)

我在PySpark中有如下Schema。当ITEMS数组的结构体元素中存在item_platform_id字段时,我需要把该字段从每个元素中删除。我尝试过使用drop,但它不起作用。

root
 |-- MISSION_ID: string (nullable = true)
 |-- COUNTRY: string (nullable = true)
 |-- SPONSORED_MISSION: string (nullable = true)
 |-- MISSION_TYPE: string (nullable = true)
 |-- SPONSORED_SEGMENTATION: string (nullable = true)
 |-- START_DATE: timestamp (nullable = true)
 |-- END_DATE: timestamp (nullable = true)
 |-- CREATE_DATE: timestamp (nullable = true)
 |-- UPDATE_DATE: timestamp (nullable = true)
 |-- SPONSOR_PARTNER_ID: string (nullable = true)
 |-- CONSIDER_DELIVERY_WINDOW: boolean (nullable = true)
 |-- CONSIDER_BLOCK_LIST: boolean (nullable = true)
 |-- DIGITALIZATION_LEVEL: string (nullable = true)
 |-- ITEMS: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- quantity: integer (nullable = true)
 |    |    |-- item_platform_id: string (nullable = true)
 |-- COMBOS: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- comboId: integer (nullable = true)
 |    |    |-- quantity: integer (nullable = true)
 |-- ENABLED: boolean (nullable = true)

预期:

root
 |-- MISSION_ID: string (nullable = true)
 |-- COUNTRY: string (nullable = true)
 |-- SPONSORED_MISSION: string (nullable = true)
 |-- MISSION_TYPE: string (nullable = true)
 |-- SPONSORED_SEGMENTATION: string (nullable = true)
 |-- START_DATE: timestamp (nullable = true)
 |-- END_DATE: timestamp (nullable = true)
 |-- CREATE_DATE: timestamp (nullable = true)
 |-- UPDATE_DATE: timestamp (nullable = true)
 |-- SPONSOR_PARTNER_ID: string (nullable = true)
 |-- CONSIDER_DELIVERY_WINDOW: boolean (nullable = true)
 |-- CONSIDER_BLOCK_LIST: boolean (nullable = true)
 |-- DIGITALIZATION_LEVEL: string (nullable = true)
 |-- ITEMS: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- quantity: integer (nullable = true)
 |-- COMBOS: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- comboId: integer (nullable = true)
 |    |    |-- quantity: integer (nullable = true)
 |-- ENABLED: boolean (nullable = true)
qmb5sa22

qmb5sa221#

你可以先检查数组内的结构体中是否存在该字段;如果存在,再使用dropFields将该字段从结构体中删除(dropFields从Spark 3.1.0开始可用)。
范例

# Sample data: one row whose `items` column is an array of structs.
# NOTE(review): this snippet assumes an active SparkSession bound to `spark`
# and `from pyspark.sql import functions as func`; neither is defined here.
data_ls = [
    (1, [(1,2,3), (4,5,6)])
]

data_sdf = spark.createDataFrame(data_ls, 'id int, items array<struct<_id: int, quantity: int, item_platform_id: int>>')

# +---+----------------------+
# |id |items                 |
# +---+----------------------+
# |1  |[{1, 2, 3}, {4, 5, 6}]|
# +---+----------------------+

# root
#  |-- id: integer (nullable = true)
#  |-- items: array (nullable = true)
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- _id: integer (nullable = true)
#  |    |    |-- quantity: integer (nullable = true)
#  |    |    |-- item_platform_id: integer (nullable = true)

# Read the struct's field names directly from the DataFrame schema
# (column -> array type -> element struct) instead of building a temporary
# projection just to inspect its columns.
item_fields = data_sdf.schema['items'].dataType.elementType.fieldNames()

# Start from the unchanged frame so `new_data_sdf` is always bound, even when
# the field is absent and there is nothing to drop.
new_data_sdf = data_sdf

# Remove the field from every struct in the array only if it exists.
if 'item_platform_id' in item_fields:
    new_data_sdf = data_sdf.withColumn(
        'items',
        func.transform('items', lambda x: x.dropFields('item_platform_id')),
    )

new_data_sdf.show(truncate=False)

# +---+----------------+
# |id |items           |
# +---+----------------+
# |1  |[{1, 2}, {4, 5}]|
# +---+----------------+

相关问题