在spark中,如何从结构中提取数组并用它创建一个新字段

xghobddn  于 2021-05-27  发布在  Spark
关注(0)|答案(1)|浏览(400)

我有一个带有schema的struct:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)

现在,我想创建一个包含 product_color 的新列,因此我在数据框中添加了一个新列,如下所示:

df.withColumn("product_color", col(currentNode + "." + fieldName))

添加新列后的 schema:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: integer (nullable = true)
 |    |    |    |-- color: string (nullable = true)
 |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |-- products_id: long (nullable = true)

如果查看新列 product_color 的 schema,会发现多了一层数组元素:

|-- element: array (containsNull = true)

我想知道如何创建一个新列,使其 schema 与 products 结构中的 product_color 完全一致。
预期的 schema:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- color: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- products_id: long (nullable = true)

Spark:2.4.5,语言:Scala

pgvzfuti

pgvzfuti1#

在添加新列时对数组使用 explode,即可获得所需的 schema。示例:
```
// Schema of the sample DataFrame used in this example.
// Assumes `df` is already defined and that `col` and `explode` from
// org.apache.spark.sql.functions are in scope — neither is shown here.

df.printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// | |-- element: struct (containsNull = true)
// | | |-- id: long (nullable = true)
// | | |-- order_id: long (nullable = true)
// | | |-- product_color: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- color: string (nullable = true)
// | | | | |-- id: long (nullable = true)
// | | | | |-- products_id: long (nullable = true)

// col("products.product_color") picks product_color out of every element of
// `products`, which yields an array of arrays (the extra "element: array"
// level seen in the question). Wrapping it in explode removes that outer
// level, so the new top-level column is an array of structs, as printed below.
// NOTE(review): explode presumably emits one output row per element of the
// outer array, so the row count may change — confirm this is acceptable;
// on Spark 2.4+ `flatten` may be an alternative that keeps one row per input row.
df.withColumn("product_color",explode(col("products.product_color"))).printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// | |-- element: struct (containsNull = true)
// | | |-- id: long (nullable = true)
// | | |-- order_id: long (nullable = true)
// | | |-- product_color: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- color: string (nullable = true)
// | | | | |-- id: long (nullable = true)
// | | | | |-- products_id: long (nullable = true)
// |-- product_color: array (nullable = true)
// | |-- element: struct (containsNull = true)
// | | |-- color: string (nullable = true)
// | | |-- id: long (nullable = true)
// | | |-- products_id: long (nullable = true)

```

相关问题