flatMap from column values to multiple rows is missing a schema

3xiyfsfu  asked on 2021-07-13  in Spark
Follow (0) | Answers (2) | Views (250)

I am trying to transform the DataFrame below into multiple rows based on a column value. I believe the rows are missing a schema (line 128) and an exception is thrown.
Original DataFrame

+---+------------------------+
|Id |Set                     |
+---+------------------------+
|1  |AA001-AA003, BB002-BB003|
|2  |AA045-AA046, CC099-CC100|
+---+------------------------+

Intermediate DataFrame step (for clarity)

+---+-----------+
| Id|        Set|
+---+-----------+
|  1|AA001-AA003|
|  1|BB002-BB003|
|  2|AA045-AA046|
|  2|CC099-CC100|
+---+-----------+

Final DataFrame

+---+-------+------+------+
| Id|Combine|Letter|Number|
+---+-------+------+------+
|  1|  AA001|    AA|     1|
|  1|  AA002|    AA|     2|
|  1|  AA003|    AA|     3|
|  1|  BB002|    BB|     2|
|  1|  BB003|    BB|     3|
|  2|  AA045|    AA|    45|
|  2|  AA046|    AA|    46|
|  2|  CC099|    CC|    99|
|  2|  CC100|    CC|   100|
+---+-------+------+------+

This is the exception I get:

Sample application

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

public class SampleApp implements Serializable {
    private static final long serialVersionUID = -1L;

    private static String ID = "Id";
    private static String SET = "Set";  
    private static String COMBINE = "Combine";
    private static String LETTER = "Letter";
    private static String NUMBER = "Number";

    public static void main(String[] args) {
        SampleApp app = new SampleApp();
        app.start();
    }

    private void start() {

        Logger.getLogger("org.apache").setLevel(Level.WARN);

        SparkSession spark = SparkSession
                .builder()
                .appName("Spark App")
                .master("local[*]")
                .getOrCreate();

        StructType commaStructType = new StructType();
        commaStructType = commaStructType.add(ID, DataTypes.IntegerType, false);
        commaStructType = commaStructType.add(SET, DataTypes.StringType, true);

        StructType resultStructType = new StructType();
        resultStructType = resultStructType.add(ID, DataTypes.IntegerType, false);
        resultStructType = resultStructType.add(COMBINE, DataTypes.StringType, false);
        resultStructType = resultStructType.add(LETTER, DataTypes.StringType, false);
        resultStructType = resultStructType.add(NUMBER, DataTypes.IntegerType, false);

        List<Row> list = new ArrayList<Row>();
        list.add(RowFactory.create(1, "AA001-AA003, BB002-BB003"));
        list.add(RowFactory.create(2, "AA045-AA046, CC099-CC100"));

        Dataset<Row> df = spark.createDataFrame(list, commaStructType);
        df.show(10, false);
        df.printSchema();

        Dataset<Row> commaSeparatedDf = df.flatMap(new SeparateByCommaFlatMap(), RowEncoder.apply(commaStructType));
        commaSeparatedDf.show(10, true);
        commaSeparatedDf.printSchema();

        Dataset<Row> resultDf = commaSeparatedDf.flatMap(new SeparateByDashFlatMap(), RowEncoder.apply(resultStructType));
        resultDf.show(10, true);
        resultDf.printSchema();

        /* This manually created DataFrame for the final step works */
        /*List<Row> list2 = new ArrayList<Row>();
        list2.add(RowFactory.create(1, "AA001-AA003"));
        list2.add(RowFactory.create(1, "BB002-BB003"));
        list2.add(RowFactory.create(2, "AA045-AA046"));
        list2.add(RowFactory.create(2, "CC099-CC100"));

        Dataset<Row> df2 = spark.createDataFrame(list2, commaStructType);
        df2.show(10, true);
        df2.printSchema();

        Dataset<Row> resultDf2 = df2.flatMap(new SeparateByDashFlatMap(), RowEncoder.apply(resultStructType));
        resultDf2.show(10, true);
        resultDf2.printSchema();*/
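        // The commented-out variant above works because createDataFrame attaches
        // commaStructType to each Row, so getAs(fieldName) can resolve field names.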
    }

    /*
     * Split "AA001-AA003, BB002-BB003" into individual rows:
     *   AA001-AA003
     *   BB002-BB003
     */
    private final class SeparateByCommaFlatMap implements FlatMapFunction<Row, Row> {
        private static final long serialVersionUID = 63784L;

        @Override
        public Iterator<Row> call(Row r) throws Exception {
            // getAs(fieldName) works here because the input rows come from
            // createDataFrame and therefore carry commaStructType as their schema.
            int id = Integer.parseInt(r.getAs(ID).toString());
            String[] s = r.getAs(SET).toString().split(", ");

            List<Row> list = new ArrayList<Row>();
            for (int i = 0; i < s.length; i++) {
                List<Object> data = new ArrayList<>();
                data.add(id);
                data.add(s[i]);
                list.add(RowFactory.create(data.toArray()));
            }

            return list.iterator();
        }
    }

    /*
     * Expand "AA001-AA003" into individual rows:
     *   AA001 | AA | 1
     *   AA002 | AA | 2
     *   AA003 | AA | 3
     */
    private final class SeparateByDashFlatMap implements FlatMapFunction<Row, Row> {
        private static final long serialVersionUID = 63784L;

        @Override
        public Iterator<Row> call(Row r) throws Exception {
            // Likely failure point: the rows emitted by SeparateByCommaFlatMap were
            // built with RowFactory.create and carry no schema, so this by-name
            // lookup has nothing to resolve the field names against.
            int id = r.getAs(ID);
            String[] s = r.getAs(SET).toString().split("-");
            String letter = s[0].substring(0, 2);

            int start = Integer.parseInt(s[0].substring(2));
            int end = Integer.parseInt(s[1].substring(2));

            List<Row> list = new ArrayList<Row>();
            for (int i = start; i <= end; i++) {
                List<Object> data = new ArrayList<>();
                data.add(id);
                data.add(String.format("%s%03d", letter, i));
                data.add(letter);
                data.add(i);
                list.add(RowFactory.create(data.toArray()));
            }

            return list.iterator();
        }
    }
}
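If the schema-less rows are indeed the culprit, one workaround is to read the incoming fields by position rather than by name, since positional getters do not need a schema on the Row. A minimal sketch under that assumption (the class name SeparateByDashPositional is made up; it would sit inside SampleApp next to the other FlatMapFunctions and be passed to the second flatMap in place of SeparateByDashFlatMap):

    private final class SeparateByDashPositional implements FlatMapFunction<Row, Row> {
        private static final long serialVersionUID = 63785L;

        @Override
        public Iterator<Row> call(Row r) throws Exception {
            int id = r.getInt(0);                      // field 0 is Id, read by position
            String[] s = r.getString(1).split("-");    // field 1 is Set, e.g. "AA001-AA003"
            String letter = s[0].substring(0, 2);

            int start = Integer.parseInt(s[0].substring(2));
            int end = Integer.parseInt(s[1].substring(2));

            List<Row> list = new ArrayList<Row>();
            for (int i = start; i <= end; i++) {
                list.add(RowFactory.create(id, String.format("%s%03d", letter, i), letter, i));
            }
            return list.iterator();
        }
    }

The RowEncoder with resultStructType still supplies the output schema, so nothing else in the pipeline changes.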
sigwle7e 1#

Here is a solution based on the DataFrame API, in Scala:

import org.apache.spark.sql.functions._

val result = df.withColumn(
    "Set",
    explode(split(col("Set"), ", "))    // split by comma and explode into rows
).withColumn(
    "Letter",
    substring(col("Set"), 1, 2)    // get letter from first two chars
).withColumn(
    "Number",    // get and explode a list of numbers using Spark SQL sequence function
    expr("""
        explode(sequence(
            int(substring(split(Set, '-')[0], 3)),
            int(substring(split(Set, '-')[1], 3))
        ))
    """)
).withColumn(
    "Combine",    // get formatted string for combine column
     format_string("%s%03d", col("Letter"), col("Number"))
).select(
    "ID", "Combine", "Letter", "Number"
)

result.show()
+---+-------+------+------+
| Id|Combine|Letter|Number|
+---+-------+------+------+
|  1|  AA001|    AA|     1|
|  1|  AA002|    AA|     2|
|  1|  AA003|    AA|     3|
|  1|  BB002|    BB|     2|
|  1|  BB003|    BB|     3|
|  2|  AA045|    AA|    45|
|  2|  AA046|    AA|    46|
|  2|  CC099|    CC|    99|
|  2|  CC100|    CC|   100|
+---+-------+------+------+
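A usage note on the solution above: sequence is a built-in Spark SQL function introduced in Spark 2.4, so this approach needs Spark 2.4 or later. A quick one-line sanity check (Java, assuming an active SparkSession named spark):

spark.sql("SELECT explode(sequence(1, 3)) AS n").show();  // one row per value: 1, 2, 3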
mxg2im7a 2#

I am adding my answer as well, in case anyone wants the Java code. Special thanks to Mck!

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import static org.apache.spark.sql.functions.explode;
import static org.apache.spark.sql.functions.split;
import static org.apache.spark.sql.functions.substring;
import static org.apache.spark.sql.functions.sequence;
import static org.apache.spark.sql.functions.format_string;
import static org.apache.spark.sql.functions.col;

public class ExplodeApp implements Serializable {
    private static final long serialVersionUID = -1L;

    private static String ID = "Id";
    private static String SET = "Set";  
    private static String COMBINE = "Combine";
    private static String LETTER = "Letter";
    private static String NUMBER = "Number";
    private static String RANGE = "Range";

    public static void main(String[] args) {
        ExplodeApp app = new ExplodeApp();
        app.start();
    }

    private void start() {

        Logger.getLogger("org.apache").setLevel(Level.WARN);

        SparkSession spark = SparkSession
                .builder()
                .appName("Spark App")
                .master("local[*]")
                .getOrCreate();

        StructType commaStructType = new StructType();
        commaStructType = commaStructType.add(ID, DataTypes.IntegerType, false);
        commaStructType = commaStructType.add(SET, DataTypes.StringType, true);

        List<Row> list = new ArrayList<Row>();
        list.add(RowFactory.create(1, "AA001-AA003, BB002-BB003"));
        list.add(RowFactory.create(2, "AA045-AA046, CC099-CC100"));     

        Dataset<Row> df = spark.createDataFrame(list, commaStructType);
        df.show(10, false);     

        Column[] columnNames =  new Column[] { col(ID), col(COMBINE), col(LETTER), col(NUMBER) };

        Dataset<Row> resultDf = df
                .withColumn(RANGE, explode(split(df.col(SET), ", ")))               
                .withColumn(LETTER, substring(col(RANGE), 1, 2))
                .withColumn(NUMBER, explode(
                        sequence(substring(split(col(RANGE), "-").getItem(0), 3, 3).cast(DataTypes.IntegerType), 
                                substring(split(col(RANGE), "-").getItem(1), 3, 3).cast(DataTypes.IntegerType))))
                .withColumn(COMBINE, format_string("%s%03d", col(LETTER), col(NUMBER)))
                .select(columnNames);

        resultDf.show(10, false);       
    }   
}
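One assumption in the substring(col(RANGE), 3, 3) calls above is that the numeric part of each code is at most three digits, which holds for the sample data. A hedged sketch of a more general variant that takes everything after the two-letter prefix, using the two-argument SQL substring via expr (this needs an extra static import of expr from org.apache.spark.sql.functions; the name generalResultDf is made up, the constants and columnNames are reused from ExplodeApp):

        Dataset<Row> generalResultDf = df
                .withColumn(RANGE, explode(split(col(SET), ", ")))
                .withColumn(LETTER, substring(col(RANGE), 1, 2))
                .withColumn(NUMBER, explode(sequence(
                        expr("int(substring(split(Range, '-')[0], 3))"),    // start: digits after the prefix
                        expr("int(substring(split(Range, '-')[1], 3))")))) // end: digits after the prefix
                .withColumn(COMBINE, format_string("%s%03d", col(LETTER), col(NUMBER)))
                .select(columnNames);

Note that %s%03d still pads the Combine column to three digits; widen the format if larger numbers are expected.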
