我正在尝试使用 Spark 和 Databricks 的 spark-xml 库解析 XML 文件中的数据。
这是我的代码:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions
import java.text.Format
import org.apache.spark.sql.functions.concat_ws
import org.apache.spark.sql
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.plans.logical.With
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.udf
import scala.sys.process._
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions._
/**
 * Loads an XML file through the `com.databricks.spark.xml` data source, using
 * `us-bibliographic-data-application` as the row tag, and prints the nested
 * `application-reference.document-id.doc-number` field of every row twice
 * (as `document` and as `document_number`), one tuple per line.
 */
object printschema
{
  /**
   * Program entry point.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit =
  {
    val conf = new SparkConf().setAppName("printschema")
    // Only fall back to a local master when none was supplied: a master set
    // directly on SparkConf takes precedence over the `--master` flag given
    // to spark-submit, so hard-coding it would silently override that flag.
    conf.setIfMissing("spark.master", "local")
    conf.set("spark.debug.maxToStringFields", "10000000")

    val context = new SparkContext(conf)
    try
    {
      val sqlContext = new SQLContext(context)
      import sqlContext.implicits._

      val df = sqlContext.read.format("com.databricks.spark.xml")
        .option("rowTag", "us-bibliographic-data-application")
        .option("treatEmptyValuesAsNulls", true)
        .load("/Users/praveen/Desktop/ipa0105.xml")

      // Both output columns are derived from the same nested field.
      val docNumber = $"application-reference.document-id.doc-number".cast(sql.types.StringType)

      val rows = df
        .withColumn("document", docNumber)
        .withColumn("document_number", docNumber)
        .select("document", "document_number")
        .collect()

      // Print an explicit tuple so the output stays "(a,b)" without relying
      // on the compiler's deprecated argument auto-tupling.
      rows.foreach { row => println((row.get(0), row.get(1))) }
    }
    finally
    {
      // Always release the SparkContext, even if reading or collecting fails.
      context.stop()
    }
  }
}
当我在scalaide/intellij idea上运行代码时,它运行得很好,下面是我的输出。
(14789882,14789882)
(14755945,14755945)
(14755919,14755919)
但是,当我构建一个 jar 并使用 spark-submit 运行时,它只返回空值。
输出:
NULL,NULL
NULL,NULL
NULL,NULL
以下是我的 spark-submit 命令: ./spark-submit --jars /home/hadoop/spark-xml_2.11-0.4.0.jar --class inndata.praveen --master local[2] /home/hadoop/ip/target/scala-2.11/ip_2.11-1.0.jar
暂无答案!
目前还没有任何答案,快来回答吧!