org.apache.tika.Tika.parse()方法的使用及代码示例

x33g5p2x  于2022-01-29 转载在 其他  
字(7.8k)|赞(0)|评价(0)|浏览(267)

本文整理了Java中org.apache.tika.Tika.parse()方法的一些代码示例,展示了Tika.parse()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Tika.parse()方法的具体详情如下:
包路径:org.apache.tika.Tika
类名称:Tika
方法名:parse

Tika.parse介绍

[英]Parses the given file and returns the extracted text content.
[中]解析给定文件并返回提取的文本内容。

代码示例

代码示例来源:origin: apache/tika

/**
 * Parses {@code example.doc} with a default {@code Tika} instance and copies
 * the extracted text to standard output in chunks of up to 1000 characters.
 *
 * @throws Exception if parsing or reading the extracted text fails
 */
public static void parseToReaderExample() throws Exception {
  File source = new File("example.doc");
  try (Reader text = new Tika().parse(source)) {
    char[] chunk = new char[1000];
    // read() reports -1 once the extracted text is exhausted.
    for (int count = text.read(chunk); count != -1; count = text.read(chunk)) {
      System.out.append(CharBuffer.wrap(chunk, 0, count));
    }
  }
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the file located at the given path.
 * <p>
 * A new, empty {@code Metadata} instance is created for the call and
 * handed to {@code parse(path, metadata)}.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
  Metadata metadata = new Metadata();
  return parse(path, metadata);
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the given file.
 * <p>
 * A new, empty {@code Metadata} instance is created for the call and
 * handed to {@code parse(file, metadata)}.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file) throws IOException {
  Metadata metadata = new Metadata();
  return parse(file, metadata);
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the document supplied as a stream.
 * <p>
 * Ownership of the stream passes to the returned reader: the stream and
 * any associated resources are closed no later than the moment
 * {@link Reader#close()} is invoked on it.
 * <p>
 * A new, empty {@code Metadata} instance is created for the call and
 * handed to {@code parse(stream, metadata)}.
 *
 * @param stream the document to be parsed
 * @return extracted text content
 * @throws IOException if the document can not be read or parsed
 */
public Reader parse(InputStream stream) throws IOException {
  Metadata metadata = new Metadata();
  return parse(stream, metadata);
}

代码示例来源:origin: apache/tika

/**
 * Indexes a single file: its name is stored in the {@code "filename"} field
 * and the text returned by {@code tika.parse(file)} is supplied as the
 * {@code "fulltext"} field.
 *
 * @param file the file to parse and index
 * @throws Exception if parsing the file or adding the document fails
 */
public void indexDocument(File file) throws Exception {
  try (Reader extractedText = tika.parse(file)) {
    TextField nameField = new TextField("filename", file.getName(), Store.YES);
    TextField textField = new TextField("fulltext", extractedText);

    Document doc = new Document();
    doc.add(nameField);
    doc.add(textField);
    writer.addDocument(doc);
  }
}
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the file located at the given path.
 * <p>
 * The file is opened through {@code TikaInputStream.get(path, metadata)}
 * and the resulting stream is handed to {@code parse(stream, metadata)}.
 * Metadata extracted from the document is reported through the supplied
 * metadata instance.
 *
 * @param path the path of the file to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path, Metadata metadata) throws IOException {
  return parse(TikaInputStream.get(path, metadata), metadata);
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the given file.
 * <p>
 * The file is opened through {@code TikaInputStream.get(file, metadata)}
 * and the resulting stream is handed to {@code parse(stream, metadata)}.
 * Metadata extracted from the document is reported through the supplied
 * metadata instance.
 *
 * @param file the file to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file, Metadata metadata) throws IOException {
  // The local declaration is kept so the deprecation warning can be
  // suppressed at the narrowest possible scope.
  @SuppressWarnings("deprecation")
  InputStream input = TikaInputStream.get(file, metadata);
  return parse(input, metadata);
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the resource found at the given URL.
 * <p>
 * A new {@code Metadata} instance is created, the resource is opened through
 * {@code TikaInputStream.get(url, metadata)}, and the resulting stream is
 * handed to {@code parse(stream, metadata)}.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
  Metadata resourceMetadata = new Metadata();
  return parse(TikaInputStream.get(url, resourceMetadata), resourceMetadata);
}

代码示例来源:origin: apache/tika

/**
 * Parses the given file with {@code tika} and indexes the metadata that the
 * parse populated.
 * <p>
 * Every metadata name/value pair becomes a stored {@code TextField} on a
 * single document, and that document is handed to {@code writer} exactly
 * once.
 *
 * @param file the file whose metadata should be indexed
 * @throws Exception if opening, parsing or indexing the file fails
 */
public void indexContentSpecificMet(File file) throws Exception {
  Metadata met = new Metadata();
  try (InputStream is = new FileInputStream(file)) {
    tika.parse(is, met);
    Document document = new Document();
    for (String key : met.names()) {
      for (String val : met.getValues(key)) {
        document.add(new TextField(key, val, Store.YES));
      }
    }
    // Add the document only after all fields have been collected. Doing this
    // inside the per-key loop would submit the same document once per
    // metadata name, and not at all when no metadata was extracted.
    writer.addDocument(document);
  }
}

代码示例来源:origin: apache/tika

.equals(metadata.get(Metadata.CONTENT_TYPE))
? new InputStreamReader(inputStream, StandardCharsets.UTF_8)
: secondaryParser.parse(inputStream);

代码示例来源:origin: apache/tika

reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
} else {
  reader = secondaryParser.parse(inputStream);

代码示例来源:origin: apache/tika

/**
 * Loads the Tika configuration resource {@code /org/apache/tika/config/<name>},
 * builds a {@code Tika} facade from it, and parses that same resource with
 * the facade.
 *
 * @param name file name of the configuration resource
 * @return the metadata instance that was passed to the parse call
 * @throws TikaException declared for the configuration/parsing calls
 * @throws IOException declared for the configuration/parsing calls
 * @throws SAXException declared for the configuration/parsing calls
 */
private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
  URL resource = this.getClass().getResource("/org/apache/tika/config/"+name);
  assertNotNull("couldn't find: "+name, resource);

  Tika facade = new Tika(new TikaConfig(resource));
  Metadata collected = new Metadata();
  // Only the metadata is of interest here; the returned reader is not used.
  facade.parse(resource.openStream(), collected);
  return collected;
}
}

代码示例来源:origin: apache/tika

/**
 * Indexes the given file's metadata after pre-populating it with a set of
 * fixed values (creator, creation date, format, source, subject, rights).
 * <p>
 * The pre-populated metadata instance is passed to {@code tika.parse}, and
 * every name/value pair it holds afterwards becomes a stored
 * {@code TextField} on a single document, which is handed to {@code writer}
 * exactly once.
 *
 * @param file the file to parse and index
 * @throws Exception if detecting, opening, parsing or indexing the file fails
 */
public void indexWithDublinCore(File file) throws Exception {
  Metadata met = new Metadata();
  met.add(TikaCoreProperties.CREATOR, "Manning");
  met.add(TikaCoreProperties.CREATOR, "Tika in Action");
  met.set(TikaCoreProperties.CREATED, new Date());
  met.set(TikaCoreProperties.FORMAT, tika.detect(file));
  met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
  met.add(TikaCoreProperties.SUBJECT, "File");
  met.add(TikaCoreProperties.SUBJECT, "Indexing");
  met.add(TikaCoreProperties.SUBJECT, "Metadata");
  met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(), "public",
      "private"), "public");
  try (InputStream is = new FileInputStream(file)) {
    tika.parse(is, met);
    Document document = new Document();
    for (String key : met.names()) {
      for (String val : met.getValues(key)) {
        document.add(new TextField(key, val, Store.YES));
      }
    }
    // Add the document only after all fields have been collected. Doing this
    // inside the per-key loop would submit the same document once per
    // metadata name.
    writer.addDocument(document);
  }
}
}

代码示例来源:origin: apache/tika

/**
 * Builds a {@code Tika} facade from the {@code TIKA_CFG_FILE} classpath
 * resource, parses a short in-memory string with it, and checks that the
 * metadata field {@code DummyInitializableParser.SUM_FIELD} equals "5".
 */
@Test
public void testInitializableParser() throws Exception {
  URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
  assert configFileUrl != null;

  Tika facade = new Tika(new TikaConfig(configFileUrl));
  Metadata collected = new Metadata();
  byte[] payload = "someString".getBytes(StandardCharsets.ISO_8859_1);
  facade.parse(TikaInputStream.get(payload), collected);

  assertEquals("5", collected.get(DummyInitializableParser.SUM_FIELD));
}
}

代码示例来源:origin: org.apache.tika/tika-core

/**
 * Reads the file at the given path and returns its extracted text.
 * <p>
 * The call is forwarded to {@code parse(path, metadata)} with a newly
 * created {@code Metadata} instance.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
  Metadata freshMetadata = new Metadata();
  return parse(path, freshMetadata);
}

代码示例来源:origin: org.apache.tika/tika-core

/**
 * Reads the given file and returns its extracted text.
 * <p>
 * The call is forwarded to {@code parse(file, metadata)} with a newly
 * created {@code Metadata} instance.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file) throws IOException {
  Metadata freshMetadata = new Metadata();
  return parse(file, freshMetadata);
}

代码示例来源:origin: com.github.lafa.tikaNoExternal/tika-core

/**
 * Returns the text extracted from the file at the given path.
 * <p>
 * Delegates to {@code parse(path, metadata)}, supplying a new
 * {@code Metadata} instance created for this call.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
  Metadata scratch = new Metadata();
  return parse(path, scratch);
}

代码示例来源:origin: com.github.lafa.tikaNoExternal/tika-core

/**
 * Returns the text extracted from the given file.
 * <p>
 * Delegates to {@code parse(file, metadata)}, supplying a new
 * {@code Metadata} instance created for this call.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file) throws IOException {
  Metadata scratch = new Metadata();
  return parse(file, scratch);
}

代码示例来源:origin: org.apache.tika/tika-core

/**
 * Reads the resource at the given URL and returns its extracted text.
 * <p>
 * A new {@code Metadata} instance is created for the call; it is used both
 * to open the resource via {@code TikaInputStream.get(url, metadata)} and
 * as the metadata argument of {@code parse(stream, metadata)}.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
  Metadata freshMetadata = new Metadata();
  InputStream resourceStream = TikaInputStream.get(url, freshMetadata);
  Reader extracted = parse(resourceStream, freshMetadata);
  return extracted;
}

代码示例来源:origin: com.github.lafa.tikaNoExternal/tika-core

/**
 * Returns the text extracted from the resource at the given URL.
 * <p>
 * The resource is opened via {@code TikaInputStream.get(url, metadata)}
 * using a {@code Metadata} instance created for this call, and the
 * resulting stream is passed on to {@code parse(stream, metadata)}.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
  Metadata scratch = new Metadata();
  return parse(TikaInputStream.get(url, scratch), scratch);
}

相关文章