如何用spark中的单个逗号替换字符串列中的多个逗号

7rfyedvj  于 2021-05-29  发布在  Spark
关注(0)|答案(2)|浏览(484)

我有一个spark数据框

val df = Seq(
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                    "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    "),
(",,,,104,,,,,,,111,,,,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    "),
(",,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                              "),
(",101,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,184,,,,,            "),
(",,,,104,,,,,,,,,113,,,,,,,,,,,,,,,,,,131,,,,,,,,,,141,142,143,,,146,,,,150,,,,,155,,157,,,,,162,,,,,,,169,,,,,174,,176,177,178,,,,,,,,,,,               "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                       "),
(",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               "),
(",,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,,                  "),
(",,,,104,,,,,,,111,112,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                 "),
(",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               "),
(",,102,103,104,,,,,,,,,113,114,,,,,,,121,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,160,,162,,,,,,,,,,,173,174,,176,,178,,,,,,,,,,,"),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                       "),
(",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               "),
(",,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                              "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    "),
(",,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,,                  ")
).toDF("my_col")

我想删除此列中不必要的逗号。例如,第一条记录应该显示为 104,111,114,131,157,162,169,174,176 在输出中。
字符串不必以逗号开头和结尾。
在spark我该怎么做?

ncgqoxb0

ncgqoxb01#

您可以使用regexp\u替换:

val df_cleaned = df.withColumn("cleaned", regexp_replace(col("my_col"), ",+", ","))
  .withColumn("cleaned", regexp_replace(col("cleaned"), "^,", ""))
  .withColumn("cleaned", regexp_replace(col("cleaned"), ",$", ""))

第一行删除所有重复的逗号,第二行和第三行删除前导和尾随的逗号。

sycxhyv7

sycxhyv72#

首先,用delim“,”拆分字符串。然后使用array\u remove函数删除空字符串。将数组连接回字符串。后面有一个“,”。若要删除该字符,请使用udf删除字符串中最右边的字符。

scala> df.show(false)
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|my_col                                                                                                                                                  |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,,104,,,,,,,111,,,,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                              |
|,101,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,184,,,,,            |
|,,,,104,,,,,,,,,113,,,,,,,,,,,,,,,,,,131,,,,,,,,,,141,142,143,,,146,,,,150,,,,,155,,157,,,,,162,,,,,,,169,,,,,174,,176,177,178,,,,,,,,,,,               |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                       |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               |
|,,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,,                  |
|,,,,104,,,,,,,111,112,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                 |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               |
|,,102,103,104,,,,,,,,,113,114,,,,,,,121,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,160,,162,,,,,,,,,,,173,174,,176,,178,,,,,,,,,,,|
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,,                                       |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,                                                               |
|,,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                              |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,,                                    |
|,,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,,                  |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+

scala> df.select(trim(array_join(array_remove(split($"my_col", ","), ""),",")) as "my_col").show(false)
+-----------------------------------------------------------------------------------+
|my_col                                                                             |
+-----------------------------------------------------------------------------------+
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,114,131,157,160,162,174,176,                                               |
|104,111,114,131,157,162,169,174,176,                                               |
|104,111,131,157,160,162,169,174,176,                                               |
|103,104,111,114,121,131,157,162,169,174,176,                                       |
|101,102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,184,               |
|104,113,131,141,142,143,146,150,155,157,162,169,174,176,177,178,                   |
|104,111,114,131,157,162,174,176,                                                   |
|                                                                                   |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,                       |
|104,111,112,114,131,157,162,169,174,176,                                           |
|                                                                                   |
|102,103,104,113,114,121,130,131,141,142,143,146,150,152,157,160,162,173,174,176,178|
|104,111,114,131,157,162,174,176,                                                   |
|                                                                                   |
|103,104,111,114,121,131,157,162,169,174,176,                                       |
|104,111,114,131,157,162,169,174,176,                                               |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,                       |
+-----------------------------------------------------------------------------------+

scala> val myUdf = udf{(x:String) => if(x.endsWith(",")){x.dropRight(1)} else {x}}
myUdf: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))

scala> df.select(myUdf(trim(array_join(array_remove(split($"my_col", ","), ""),","))) as "my_col").show(false)
+-----------------------------------------------------------------------------------+
|my_col                                                                             |
+-----------------------------------------------------------------------------------+
|104,111,114,131,157,162,169,174,176                                                |
|104,111,114,131,157,162,169,174,176                                                |
|104,111,114,131,157,162,169,174,176                                                |
|104,111,114,131,157,160,162,174,176                                                |
|104,111,114,131,157,162,169,174,176                                                |
|104,111,131,157,160,162,169,174,176                                                |
|103,104,111,114,121,131,157,162,169,174,176                                        |
|101,102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,184                |
|104,113,131,141,142,143,146,150,155,157,162,169,174,176,177,178                    |
|104,111,114,131,157,162,174,176                                                    |
|                                                                                   |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174                        |
|104,111,112,114,131,157,162,169,174,176                                            |
|                                                                                   |
|102,103,104,113,114,121,130,131,141,142,143,146,150,152,157,160,162,173,174,176,178|
|104,111,114,131,157,162,174,176                                                    |
|                                                                                   |
|103,104,111,114,121,131,157,162,169,174,176                                        |
|104,111,114,131,157,162,169,174,176                                                |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174                        |
+-----------------------------------------------------------------------------------+

相关问题