Unable to read a file with Spark SQL in Java

wvmv3b1j asked on 2021-05-29 in Hadoop

Please help me out; I'm a complete noob when it comes to Spark and Hadoop in general. My code looks like this:

public static void main(String[] args) throws IOException {

    String[] jars = {"D:\\customJars\\sparky.jar", "D:\\customJars\\guava-19.0.jar"};
    System.setProperty("hadoop.home.dir", "D:\\hadoop-common-2.2.0-bin-master");
    SparkConf sparkConf = new SparkConf().setAppName("com.nucleus.spark.MlibPOC")
            .setMaster("spark://10.1.50.165:7077")
            .setJars(jars);

    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(jsc);

    DataFrame df = sqlContext.read().json("src/com/nucleus/spark/generated.json");

}

My Spark cluster is deployed as a standalone cluster on 10.1.50.165, running RHEL 6. When I run this simple piece of code, I get the following exception while it tries to read the JSON file:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 4 times, most recent failure: Lost task 1.3 in stage 0.0 (TID 5, 10.1.50.165): java.io.FileNotFoundException: File file:/D:/workspace2/sparkhadoopproject/src/com/nucleus/spark/generated.json does not exist
    at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:534)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:747)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:524)
    at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:409)
    at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:140)
    at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:341)
    at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
    at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:108)
    at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
    at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:237)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:208)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1952)
    at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1025)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.reduce(RDD.scala:1007)
    at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1127)
    at org.apache.spark.sql.execution.datasources.json.InferSchema$.infer(InferSchema.scala:65)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:114)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:109)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema$lzycompute(JSONRelation.scala:109)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema(JSONRelation.scala:108)
    at org.apache.spark.sql.sources.HadoopFsRelation.schema$lzycompute(interfaces.scala:636)
    at org.apache.spark.sql.sources.HadoopFsRelation.schema(interfaces.scala:635)
    at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:109)
    at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:244)
    at com.nucleus.spark.MlibPOC.main(MlibPOC.java:44)
Caused by: java.io.FileNotFoundException: File file:/D:/workspace2/sparkhadoopproject/src/com/nucleus/spark/generated.json does not exist
    at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:534)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:747)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:524)
    at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:409)
    at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:140)
    at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:341)
    at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
    at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:108)
    at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
    at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:237)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:208)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)


piwo6bdm #1

This works on Spark 1.6:

scala> val jtex = sqlContext.read.json("file:///opt/test.json")
jtex: org.apache.spark.sql.DataFrame = [_corrupt_record: string, age: string, id: string, name: string]

scala> val jtex = sqlContext.read.format("json").option("samplingRatio", "1.0").load("file:///opt/test.json")
jtex: org.apache.spark.sql.DataFrame = [age: string, id: string, name: string]

scala> jtex.show()
+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|krishna|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
+---+----+-------+
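
For reference, here is a minimal Java sketch of the same read using the Spark 1.6 API from the question. The hdfs:// URI below is only a placeholder: with a standalone master on 10.1.50.165, the executors on the cluster nodes resolve the path on their own machines, which is why the driver-local Windows path in the stack trace is not found. The file therefore has to sit somewhere every worker can reach, for example HDFS, a shared mount, or an identical copy at the same file:// path on every node.

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.SQLContext;

    public class JsonReadExample {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf()
                    .setAppName("JsonReadExample")
                    .setMaster("spark://10.1.50.165:7077");
            JavaSparkContext jsc = new JavaSparkContext(conf);
            SQLContext sqlContext = new SQLContext(jsc);

            // Placeholder location: any URI readable from every worker node works,
            // e.g. file:///opt/test.json if that file exists on each worker.
            DataFrame df = sqlContext.read()
                    .format("json")
                    .option("samplingRatio", "1.0")
                    .load("hdfs://10.1.50.165:9000/data/generated.json");

            df.show();
            jsc.stop();
        }
    }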
