在这里,我从流目录流式传输数据并将其写入输出位置。我还尝试实现将hdfs文件从输入文件夹移动到流目录的过程。此移动在流上下文开始之前发生一次。但我希望每次对每一批数据流执行这个动作。这有可能吗?
val streamed_rdd = ssc.fileStream[LongWritable, Text, TextInputFormat](streaming_directory, (t:Path)=> true , true).map { case (x, y) => (y.toString) }
streamed_rdd.foreachRDD( rdd => {
rdd.map(x =>x.split("\t")).map(x => x(3)).foreachPartition { partitionOfRecords =>
val connection: Connection = connectionFactory.createConnection()
connection.setClientID("Email_send_module_client_id")
println("connection started with active mq")
val session: Session = connection.createSession(false, Session.AUTO_ACKNOWLEDGE)
println("created session")
val dest = session.createQueue("dwEmailsQueue2")
println("destination queue name = dwEmailsQueue2")
val prod_queue = session.createProducer(dest)
connection.start()
partitionOfRecords.foreach { record =>
val rec_to_send: TextMessage = session.createTextMessage(record)
println("started creating a text message")
prod_queue.send(rec_to_send)
println("sent the record")
}
connection.close()
}
}
)
**val LIST = scala.collection.mutable.MutableList[String]()
val files_to_move = scala.collection.mutable.MutableList[String]()
val cmd = "hdfs dfs -ls -d "+load_directory+"/*"
println(cmd)
val system_time = System.currentTimeMillis
println(system_time)
val output = cmd.!!
output.split("\n").foreach(x => x.split(" ").foreach(x => if (x.startsWith("/user/hdpprod/")) LIST += x))
LIST.foreach(x => if (x.toString.split("/").last.split("_").last.toLong < system_time) files_to_move += x)
println("files to move" +files_to_move)
var mv_cmd :String = "hdfs dfs -mv "
for (file <- files_to_move){
mv_cmd += file+" "
}
mv_cmd += streaming_directory
println(mv_cmd)
val mv_output = mv_cmd.!!
println("moved the data to the folder")**
if (streamed_rdd.count().toString == "0") {
println("no data in the streamed list")
} else {
println("saving the Dstream at "+System.currentTimeMillis())
streamed_rdd.transform(rdd => {rdd.map(x => (check_time_to_send+"\t"+check_time_to_send_utc+"\t"+x))}).saveAsTextFiles("/user/hdpprod/temp/spark_streaming_output_sent/sent")
}
ssc.start()
ssc.awaitTermination()
}
}
1条答案
按热度按时间ycggw6v21#
我试着在java实现中做同样的事情,如下所示。您可以从rdd上的foreachpartion调用此方法