Converting a PDF file to text on HDFS (Java)

kpbpu008  posted 2021-06-01  in Hadoop

This question already has answers here:

Why am I getting a NoClassDefFoundError in Java? (28 answers)
Closed 3 years ago.
I have written a PdfInputFormat class that extends the FileInputFormat class. This class returns an object of the PdfRecordReader class, which does all of the PDF conversion. This is where I am running into an error.
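For context, the PdfInputFormat mentioned above was not included in the post; a minimal sketch of what it presumably looks like, matching the PdfRecordReader shown further down (the class body here is an assumption):

package com.amal.pdf;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical reconstruction; only PdfRecordReader was posted.
public class PdfInputFormat extends FileInputFormat<Object, Object> {

    @Override
    public RecordReader<Object, Object> createRecordReader(InputSplit split,
            TaskAttemptContext context) {
        // Every split is handed to the PDF-aware record reader.
        return new PdfRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // A PDF cannot be parsed from an arbitrary byte offset,
        // so each file is processed as a single split.
        return false;
    }
}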
I am creating the JAR in Eclipse via:
Export > JAR file.
I select the libraries required by the package into the JAR.
I am executing the JAR with the following command:

hadoop jar /home/tcs/converter.jar com.amal.pdf.PdfInputDriver /user/tcs/wordcountfile.pdf /user/convert

After running this command, I get the following exception:

17/06/09 09:26:51 WARN mapred.LocalJobRunner: job_local1466878685_0001
java.lang.Exception: java.lang.NoClassDefFoundError: org/apache/fontbox/cmap/CMapParser
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:489)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:549)
Caused by: java.lang.NoClassDefFoundError: org/apache/fontbox/cmap/CMapParser
at org.apache.pdfbox.pdmodel.font.PDFont.parseCmap(PDFont.java:548)
at org.apache.pdfbox.pdmodel.font.PDFont.encode(PDFont.java:383)
at org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEngine.java:372)
at org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.java:61)
at org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:552)
at org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:248)
at org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:207)
at org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:367)
at org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:291)
at org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:247)
at org.apache.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:180)
at com.amal.pdf.PdfRecordReader.initialize(PdfRecordReader.java:43)
at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.initialize(MapTask.java:548)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:786)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:270)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: org.apache.fontbox.cmap.CMapParser
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 21 more
17/06/09 09:26:52 INFO mapreduce.Job: Job job_local1466878685_0001 failed with state FAILED due to: NA
17/06/09 09:26:52 INFO mapreduce.Job: Counters: 0
false

Here is the code:

PdfRecordReader class (code):
package com.amal.pdf;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

public class PdfRecordReader extends RecordReader<Object, Object> {
    private String[] lines = null;
    private LongWritable key = null;
    private Text value = null;
    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        final Path file = split.getPath();
        /*
         * The below code contains the logic for opening the file and seek to
         * the start of the split. Here we are applying the Pdf Parsing logic
         */
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());
        PDDocument pdf = null;
        String parsedText = null;
        PDFTextStripper stripper;
        pdf = PDDocument.load(fileIn);
        stripper = new PDFTextStripper();
        // getting the exception because of this line****
        parsedText = stripper.getText(pdf);
        // close the document once the text has been extracted
        pdf.close();
        this.lines = parsedText.split("\n");
    }
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
            key.set(1);
            value = new Text();
            value.set(lines[0]);
        } else {
            int temp = (int) key.get();
            if (temp < (lines.length - 1)) {
                int count = (int) key.get();
                value = new Text();
                value.set(lines[count]);
                count = count + 1;
                key = new LongWritable(count);
            } else {
                return false;
            }
        }
        if (key == null || value == null) {
            return false;
        } else {
            return true;
        }
    }
    @Override
    public LongWritable getCurrentKey() throws IOException,
            InterruptedException {
        return key;
    }
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return 0;
    }
    @Override
    public void close() throws IOException {
    }
}

// NOTE: Since this targets the Hadoop environment, Eclipse will not build a
// runnable JAR for this project. Is there still a way to export this project
// as a runnable JAR?
// I need help understanding what I am doing wrong.


9njqaruj1#

The error occurs because Hadoop cannot find the org.apache.fontbox.cmap.CMapParser class, which comes from an external library imported by your code.
The external dependency JARs were not packaged into the JAR you used with the hadoop command, so Hadoop cannot find them. When a Hadoop job runs, the code (JAR) is shipped to the nodes of the cluster where the data resides, and the dependency JARs are not shipped along with it, so the class cannot be found.
You can follow either of two solutions:
1) Include the external JARs on the hadoop command line with -libjars:

hadoop jar /home/tcs/converter.jar com.amal.pdf.PdfInputDriver -libjars <path to external jars comma separated> /user/tcs/wordcountfile.pdf /user/convert
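Note that -libjars is only honored when the driver parses Hadoop's generic options, which ToolRunner does automatically. A minimal sketch of such a driver (the real PdfInputDriver was not posted, so the job setup below is an assumption):

package com.amal.pdf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver showing the ToolRunner pattern that -libjars requires.
public class PdfInputDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() already carries the settings parsed from -libjars etc.
        Job job = Job.getInstance(getConf(), "pdf to text");
        job.setJarByClass(PdfInputDriver.class);
        job.setInputFormatClass(PdfInputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner strips generic options such as -libjars before run() is called.
        System.exit(ToolRunner.run(new Configuration(), new PdfInputDriver(), args));
    }
}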

2) Or create an uber JAR with the Shade plugin, bundling all dependent libraries inside your own JAR, as sketched below.
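If the project is built with Maven, the standard Shade-plugin configuration in pom.xml looks roughly like this (a sketch; the version number is illustrative):

<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-shade-plugin</artifactId>
  <version>2.4.3</version>
  <executions>
    <execution>
      <phase>package</phase>
      <goals>
        <goal>shade</goal>
      </goals>
    </execution>
  </executions>
</plugin>

After mvn package, run the shaded JAR from the target directory with the same hadoop jar command; the PDFBox and FontBox classes will then be bundled inside it.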
