我正在尝试根据列值将下面的 Dataframe 拆分为多行。我认为是 Row 缺少 schema(第128行)，因此抛出了异常。
原始Dataframe
+---+------------------------+
|Id |Set |
+---+------------------------+
|1 |AA001-AA003, BB002-BB003|
|2 |AA045-AA046, CC099-CC100|
+---+------------------------+
为便于说明，中间步骤的 Dataframe
+---+-----------+
| Id| Set|
+---+-----------+
| 1|AA001-AA003|
| 1|BB002-BB003|
| 2|AA045-AA046|
| 2|CC099-CC100|
+---+-----------+
最终Dataframe
+---+-------+------+------+
| Id|Combine|Letter|Number|
+---+-------+------+------+
| 1| AA001| AA| 1|
| 1| AA002| AA| 2|
| 1| AA003| AA| 3|
| 1| BB002| BB| 2|
| 1| BB003| BB| 3|
| 2| AA045| AA| 45|
| 2| AA046| AA| 46|
| 2| CC099| CC| 99|
| 2| CC100| CC| 100|
+---+-------+------+------+
这是我得到的异常:
示例应用程序
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
/**
 * Sample Spark application that expands a DataFrame of comma-separated code
 * ranges into one row per individual code.
 *
 * <p>Pipeline:
 * <ol>
 *   <li>{@code (Id, "AA001-AA003, BB002-BB003")} is split on commas into
 *       {@code (Id, "AA001-AA003")} and {@code (Id, "BB002-BB003")};</li>
 *   <li>each range is expanded into {@code (Id, Combine, Letter, Number)} rows,
 *       e.g. {@code (1, "AA002", "AA", 2)}.</li>
 * </ol>
 */
public class SampleApp implements Serializable {
    private static final long serialVersionUID = -1L;

    private static final String ID = "Id";
    private static final String SET = "Set";
    private static final String COMBINE = "Combine";
    private static final String LETTER = "Letter";
    private static final String NUMBER = "Number";

    /** Position of the {@code Id} column in the input and intermediate rows. */
    private static final int ID_INDEX = 0;
    /** Position of the {@code Set} column in the input and intermediate rows. */
    private static final int SET_INDEX = 1;
    /** Number of leading characters of a code that form its letter prefix. */
    private static final int PREFIX_LENGTH = 2;

    public static void main(String[] args) {
        SampleApp app = new SampleApp();
        app.start();
    }

    /**
     * Builds the sample DataFrame, runs both flatMap steps and prints every
     * intermediate result together with its schema.
     */
    private void start() {
        Logger.getLogger("org.apache").setLevel(Level.WARN);
        SparkSession spark = SparkSession
                .builder()
                .appName("Spark App")
                .master("local[*]")
                .getOrCreate();
        try {
            StructType commaStructType = new StructType()
                    .add(ID, DataTypes.IntegerType, false)
                    .add(SET, DataTypes.StringType, true);

            StructType resultStructType = new StructType()
                    .add(ID, DataTypes.IntegerType, false)
                    .add(COMBINE, DataTypes.StringType, false)
                    .add(LETTER, DataTypes.StringType, false)
                    .add(NUMBER, DataTypes.IntegerType, false);

            List<Row> list = new ArrayList<>();
            list.add(RowFactory.create(1, "AA001-AA003, BB002-BB003"));
            list.add(RowFactory.create(2, "AA045-AA046, CC099-CC100"));

            Dataset<Row> df = spark.createDataFrame(list, commaStructType);
            df.show(10, false);
            df.printSchema();

            Dataset<Row> commaSeparatedDf =
                    df.flatMap(new SeparateByCommaFlatMap(), RowEncoder.apply(commaStructType));
            commaSeparatedDf.show(10, true);
            commaSeparatedDf.printSchema();

            // NOTE: the original author observed that feeding the second step from a
            // DataFrame built directly with createDataFrame(...) worked, while chaining
            // it after the first flatMap failed. The functions below therefore read
            // fields by position, which does not depend on the row carrying a schema.
            Dataset<Row> resultDf =
                    commaSeparatedDf.flatMap(new SeparateByDashFlatMap(), RowEncoder.apply(resultStructType));
            resultDf.show(10, true);
            resultDf.printSchema();
        } finally {
            // Release the local Spark context even if one of the steps throws.
            spark.stop();
        }
    }

    /**
     * Splits a comma-separated {@code Set} value into one row per entry.
     *
     * <p>Example: {@code (1, "AA001-AA003, BB002-BB003")} becomes
     * {@code (1, "AA001-AA003")} and {@code (1, "BB002-BB003")}.
     * A null {@code Set} produces no rows; empty entries are skipped.
     */
    private static final class SeparateByCommaFlatMap implements FlatMapFunction<Row, Row> {
        private static final long serialVersionUID = 63784L;

        @Override
        public Iterator<Row> call(Row r) throws Exception {
            List<Row> rows = new ArrayList<>();
            // The Set column is declared nullable, so guard before dereferencing.
            if (r.isNullAt(SET_INDEX)) {
                return rows.iterator();
            }
            int id = r.getInt(ID_INDEX);
            // Tolerate any amount of whitespace around the separating commas.
            String[] ranges = r.getString(SET_INDEX).trim().split("\\s*,\\s*");
            for (String range : ranges) {
                if (range.isEmpty()) {
                    continue;
                }
                rows.add(RowFactory.create(id, range));
            }
            return rows.iterator();
        }
    }

    /**
     * Expands a single range such as {@code "AA001-AA003"} into one row per code:
     * <pre>
     * AA001 | AA | 1
     * AA002 | AA | 2
     * AA003 | AA | 3
     * </pre>
     *
     * <p>Fields are read by position rather than by name: rows created with
     * {@code RowFactory.create} carry no schema, and name-based lookups
     * ({@code getAs(String)}) are not supported on such rows. When this function
     * is chained directly after another flatMap it may receive exactly those
     * schema-less rows.
     *
     * <p>The letter prefix is taken from the start code only; the prefix of the
     * end code is not compared against it.
     */
    private static final class SeparateByDashFlatMap implements FlatMapFunction<Row, Row> {
        private static final long serialVersionUID = 63784L;

        @Override
        public Iterator<Row> call(Row r) throws Exception {
            List<Row> rows = new ArrayList<>();
            if (r.isNullAt(SET_INDEX)) {
                return rows.iterator();
            }
            int id = r.getInt(ID_INDEX);
            String range = r.getString(SET_INDEX).trim();
            String[] bounds = range.split("-");
            if (bounds.length != 2
                    || bounds[0].length() <= PREFIX_LENGTH
                    || bounds[1].length() <= PREFIX_LENGTH) {
                throw new IllegalArgumentException(
                        "Expected a range like 'AA001-AA003' but got '" + range + "' for " + ID + "=" + id);
            }

            String letter = bounds[0].substring(0, PREFIX_LENGTH);
            int start = Integer.parseInt(bounds[0].substring(PREFIX_LENGTH));
            int end = Integer.parseInt(bounds[1].substring(PREFIX_LENGTH));

            for (int number = start; number <= end; number++) {
                rows.add(RowFactory.create(
                        id,
                        String.format("%s%03d", letter, number),
                        letter,
                        number));
            }
            return rows.iterator();
        }
    }
}
2条答案
按热度按时间sigwle7e1#
下面是一个基于dataframe api的解决方案:
mxg2im7a2#
如果有人想要java代码,我还添加了我的答案。特别感谢麦克!