flink版本:1.2.0
scala版本:2.11.8
我想在 Flink 中使用 Scala,用一个数据流对已训练好的模型进行预测。我在 Flink 中有一个用 Scala 编写的 DataStream[String],其中包含来自 Kafka 源的 JSON 格式数据。我想使用这个 DataStream 在已经训练过的 Flink ML 模型上进行预测。问题是所有的 Flink ML 示例都使用 DataSet API 来进行预测。我对 Flink 和 Scala 还比较陌生,所以任何代码解决方案形式的帮助都将不胜感激。
输入:
{"FC196":"Dormant","FC174":"Yolo","FC195":"Lol","FC176":"4","FC198":"BANKING","FC175":"ABDULMAJEED","FC197":"2017/04/04","FC178":"1","FC177":"CBS","FC199":"INDIVIDUAL","FC179":"SYSTEM","FC190":"OK","FC192":"osName","FC191":"Completed","FC194":"125","FC193":"7","FC203":"A10SBPUB000000000004439900053570","FC205":"1","FC185":"20","FC184":"Transfer","FC187":"2","FC186":"2121","FC189":"abcdef","FC200":"","FC188":"BR01","FC202":"INDIVIDUAL","FC201":"","FC181":"7:00PM","FC180":"2007/04/01","FC183":"11000000","FC182":"INR"}
代码:
package org.apache.flink.quickstart
//imports
import java.util.Properties

import org.apache.flink.api.scala._
import org.apache.flink.ml.recommendation.ALS
import org.apache.flink.ml.regression.MultipleLinearRegression
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

import scala.util.Try
import scala.util.parsing.json.JSON
//kafka consumer imports
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09
import org.apache.flink.streaming.util.serialization.SimpleStringSchema
//kafka json table imports
import org.apache.flink.table.examples.scala.StreamTableExample
import org.apache.flink.table.api.TableEnvironment
import org.apache.flink.streaming.connectors.kafka.Kafka09JsonTableSource
import org.apache.flink.api.java.DataSet
//JSon4s imports
import org.json4s.native.JsonMethods
// Case class
/** Typed view of one JSON record consumed from the Kafka topic.
  *
  * Field names mirror the JSON keys of the incoming messages
  * (FC174..FC205); every value arrives as a raw string in the JSON,
  * so all fields are declared as String. Field order is part of the
  * constructor signature and must not be changed.
  */
case class CC(
  FC196: String, FC174: String, FC195: String, FC176: String,
  FC198: String, FC175: String, FC197: String, FC178: String,
  FC177: String, FC199: String, FC179: String, FC190: String,
  FC192: String, FC191: String, FC194: String, FC193: String,
  FC203: String, FC205: String, FC185: String, FC184: String,
  FC187: String, FC186: String, FC189: String, FC200: String,
  FC188: String, FC202: String, FC201: String, FC181: String,
  FC180: String, FC183: String, FC182: String
)
object WordCount {

  // json4s needs an implicit Formats in scope for `extract[CC]`.
  // One object-level definition suffices; the original code also
  // declared a shadowing duplicate inside main, which was redundant.
  implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats

  /** Entry point: consumes JSON strings from the Kafka topic "new",
    * parses each record into a [[CC]] and prints it.
    * Records that are not valid JSON are dropped instead of failing
    * the whole streaming job.
    */
  def main(args: Array[String]): Unit = {
    // Kafka consumer configuration (broker, zookeeper, consumer group).
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "***.**.*.***:9093")
    properties.setProperty("zookeeper.connect", "***.**.*.***:2181")
    properties.setProperty("group.id", "grouop")
    properties.setProperty("auto.offset.reset", "earliest")

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Parse every raw message. `Try` guards against malformed JSON
    // (JsonMethods.parse throws on bad input — previously one bad Kafka
    // record would kill the job); `JValue.toOption` then drops JNothing.
    val st = env
      .addSource(new FlinkKafkaConsumer09("new", new SimpleStringSchema(), properties))
      .flatMap(raw => Try(JsonMethods.parse(raw)).toOption.flatMap(_.toOption))

    // Bind each parsed JValue to the strongly typed case class.
    val mapped = st.map(_.extract[CC])
    mapped.print()

    env.execute()
  }
}
1条答案
按热度按时间x33g5p2x1#
解决这个问题的方法是写一个 MapFunction,在作业开始时读取模型。
这个 MapFunction 随后将模型存储为其内部状态的一部分,这样在出现故障时模型会被自动恢复: