Spark Streaming serialization error

svmlkihl · posted 2021-05-29 in Hadoop

I'm getting a serialization error in my Spark Streaming application. Below is my driver code:

package com.test
import org.apache.spark._
import org.apache.spark.streaming._
import org.json.JSONObject;
import java.io.Serializable

object SparkFiller extends Serializable {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkFiller").setMaster("local[*]")
    // println("test")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.registerKryoClasses(Array(classOf[firehoseToDocumentDB]))
    sparkConf.registerKryoClasses(Array(classOf[PushToDocumentDB]))

    var TimeStamp_Start = 1493836050
    val TimeStamp_Final = 1493836056
    var timeStamp_temp = TimeStamp_Start - 5
    // val send_timestamps = new firehoseToDocumentDB(TimeStamp_Start, TimeStamp_Final);
    // send_timestamps.onStart();

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val lines = ssc.receiverStream(
      new firehoseToDocumentDB(TimeStamp_Start.toString(), TimeStamp_Final.toString()))
    // val timestamp_stream = ssc.receiverStream(new firehoseToDocumentDB(TimeStamp_Start.toString(), TimeStamp_Final.toString()))

    lines.foreachRDD(rdd => {
      rdd.foreachPartition(part => {
        val dbsender = new PushToDocumentDB()
        part.foreach(msg => {
          var jsonobject = new JSONObject(msg) // parse the individual message
          var temp_pitr = jsonobject.getString("pitr")
          println(temp_pitr)
          if (TimeStamp_Final >= temp_pitr.toLong) {
            ssc.stop() // reference to ssc inside the task closure
          }
          dbsender.PushFirehoseMessagesToDocumentDb(msg)
        })
        // dbsender.close()
      })
    })

    println("ankush")
    ssc.start()
    ssc.awaitTermination()
  }
}

When I add the following lines to the code:

var jsonobject = new JSONObject(msg)
var temp_pitr = jsonobject.getString("pitr")
println(temp_pitr)
if (TimeStamp_Final >= temp_pitr.toLong) {
  ssc.stop()
}

I get the following error:

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
at com.boeing.SparkFiller$$anonfun$main$1.apply(SparkFiller.scala:26)
at com.boeing.SparkFiller$$anonfun$main$1.apply(SparkFiller.scala:25)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
 Caused by: java.io.NotSerializableException: org.apache.spark.streaming.StreamingContext
 Serialization stack:
- object not serializable (class: org.apache.spark.streaming.StreamingContext, value: org.apache.spark.streaming.StreamingContext@780e1bb5)
- field (class: com.boeing.SparkFiller$$anonfun$main$1, name: ssc$1, type: class org.apache.spark.streaming.StreamingContext)
- object (class com.boeing.SparkFiller$$anonfun$main$1, <function1>)
- field (class: com.boeing.SparkFiller$$anonfun$main$1$$anonfun$apply$1, name: $outer, type: class com.boeing.SparkFiller$$anonfun$main$1)
- object (class com.boeing.SparkFiller$$anonfun$main$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more

Process finished with exit code 1

If I remove those lines, the application works fine.
The problem is caused by calling ssc.stop() inside the RDD processing. Is there any other way to invoke a shutdown hook from the RDD processing once the condition is met?

Answer 1, by vfwfrxfs:

"The problem is caused by calling ssc.stop() inside the RDD processing."
You are right! No Spark context is serializable, so it cannot be used inside any task.
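For illustration (this sketch is not from the original answer; latestPitr and stopRequested are made-up names, and lines and TimeStamp_Final are taken from the question's driver code), the per-message parsing can stay on the executors while the stop decision stays on the driver: the body of foreachRDD runs on the driver, and only the function passed to rdd.map is serialized into tasks, so the StreamingContext is never captured.

import java.util.concurrent.atomic.AtomicBoolean
import org.json.JSONObject

// Driver-side flag, flipped once the final timestamp has been seen.
val stopRequested = new AtomicBoolean(false)

lines.foreachRDD(rdd => {
  // Runs on the driver. Only the closure given to rdd.map is shipped to the
  // executors, and it references neither ssc nor any other driver-only object.
  val latestPitr = rdd
    .map(msg => new JSONObject(msg).getString("pitr").toLong)
    .fold(Long.MinValue)(math.max)
  if (latestPitr >= TimeStamp_Final) {
    // Do not stop the context here; just record the decision and let a
    // StreamingListener (see the sketch further down) perform the stop.
    stopRequested.set(true)
  }
})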
"Is there any other way to invoke a shutdown hook from the RDD processing once the condition is met?"
To control the lifecycle of a streaming application, you should consider overriding a StreamingListener and stopping the context when your condition is met. I did a fair amount of research and found this to be the only workable solution.
Please refer to my answer to this post for how to stop a streaming application based on a specific condition.
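A minimal sketch of that listener approach, assuming the stopRequested flag from the snippet above (the class name ConditionalStopListener is illustrative):

import java.util.concurrent.atomic.AtomicBoolean
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}

// After every completed batch, check the driver-side flag and, if it is set,
// stop the StreamingContext from a separate thread; stopping synchronously
// inside the callback can block the streaming scheduler.
class ConditionalStopListener(ssc: StreamingContext, stopRequested: AtomicBoolean)
  extends StreamingListener {

  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    if (stopRequested.get()) {
      new Thread("streaming-stopper") {
        override def run(): Unit = {
          ssc.stop(true, true) // stopSparkContext = true, stopGracefully = true
        }
      }.start()
    }
  }
}

Register it on the driver before starting the context:

ssc.addStreamingListener(new ConditionalStopListener(ssc, stopRequested))
ssc.start()
ssc.awaitTermination()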
