org.apache.tika.Tika类的使用及代码示例

x33g5p2x  于2022-01-29 转载在 其他  
字(9.5k)|赞(0)|评价(0)|浏览(571)

本文整理了Java中org.apache.tika.Tika类的一些代码示例,展示了Tika类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Tika类的具体详情如下:
包路径:org.apache.tika.Tika
类名称:Tika

Tika介绍

[英]Facade class for accessing Tika functionality. This class hides much of the underlying complexity of the lower level Tika classes and provides simple methods for many common parsing and type detection operations.
[中]用于访问Tika功能的Facade类。该类隐藏了较低级别Tika类的许多潜在复杂性,并为许多常见的解析和类型检测操作提供了简单的方法。

代码示例

代码示例来源:origin: apache/tika

/**
 * Builds a {@link TrecDocument} from the text and metadata extracted from a file.
 * <p>
 * The file is opened as a stream and handed to {@code Tika.parseToString}; the
 * resource name and creation date are then read back from the metadata object
 * that was passed to the parse call.
 *
 * @param file the file to summarize
 * @return a document holding the resource name, extracted text and creation date
 * @throws FileNotFoundException if the file can not be opened
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public TrecDocument summarize(File file) throws FileNotFoundException,
    IOException, TikaException {
  Tika facade = new Tika();
  Metadata extracted = new Metadata();
  String text = facade.parseToString(new FileInputStream(file), extracted);
  String resourceName = extracted.get(TikaCoreProperties.RESOURCE_NAME_KEY);
  return new TrecDocument(resourceName, text,
      extracted.getDate(TikaCoreProperties.CREATED));
}

代码示例来源:origin: apache/tika

/**
 * Determines the media type of the file located at the given path, using
 * both the content of the document and any known file extension.
 * <p>
 * If the type should be detected without touching the file itself, call
 * {@link #detect(String)} instead.
 *
 * @param path the path of the file
 * @return detected media type
 * @throws IOException if the file can not be read
 */
public String detect(Path path) throws IOException {
  final Metadata fileMetadata = new Metadata();
  final String mediaType;
  // The stream is opened here, so it is closed here as well.
  try (InputStream in = TikaInputStream.get(path, fileMetadata)) {
    mediaType = detect(in, fileMetadata);
  }
  return mediaType;
}

代码示例来源:origin: apache/tika

/**
 * Parses the file found at the given path and hands back its extracted
 * text content. A new, empty metadata instance is used for the parse.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
  Metadata freshMetadata = new Metadata();
  return parse(path, freshMetadata);
}

代码示例来源:origin: stackoverflow.com

// Create a Tika facade instance.
Tika tika = new Tika();
// Placeholder: supply the file whose media type should be detected.
File file = ...
// Ask the facade for the file's media type, returned as a String.
String mimeType = tika.detect(file);

代码示例来源:origin: apache/tika

/**
 * Prints the string representation of a newly created {@code Tika}
 * instance to standard output.
 */
private void version() {
  Tika tika = new Tika();
  System.out.println(tika.toString());
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text of every file named on the command line and prints it
 * to standard output.
 *
 * @param args paths of the files to parse
 * @throws Exception if a file can not be read or parsed
 */
public static void main(String[] args) throws Exception {
    // A single facade with the default configuration serves every file
    final Tika facade = new Tika();

    // Walk the arguments in order, printing each file's extracted text
    for (int i = 0; i < args.length; i++) {
      System.out.print(facade.parseToString(new File(args[i])));
    }
  }
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the file at the given path and returns it
 * as a string.
 * <p>
 * The returned string is capped at the first {@link #getMaxStringLength()}
 * characters of the extracted text so that memory use stays predictable;
 * call {@link #setMaxStringLength(int)} to change that cap.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
 */
public String parseToString(Path path) throws IOException, TikaException {
  final Metadata fileMetadata = new Metadata();
  // The opened stream is passed straight to the stream-based overload.
  return parseToString(TikaInputStream.get(path, fileMetadata), fileMetadata);
}

代码示例来源:origin: apache/tika

/**
 * Determines the media type of a document from the content of its stream
 * together with the document's name.
 * <p>
 * When the stream supports the
 * {@link InputStream#markSupported() mark feature}, it is marked and then
 * reset to its original position before this method returns. Only a
 * limited number of bytes are read from the stream.
 * <p>
 * This method does <em>not</em> close the given document stream.
 *
 * @since Apache Tika 0.9
 * @param stream the document stream
 * @param name document name
 * @return detected media type
 * @throws IOException if the stream can not be read
 */
public String detect(InputStream stream, String name) throws IOException {
  final Metadata named = new Metadata();
  // Record the document name so detection can take it into account.
  named.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
  final String mediaType = detect(stream, named);
  return mediaType;
}

代码示例来源:origin: apache/tika

/**
 * Indexes a file together with a set of hand-assigned metadata values.
 * <p>
 * Creator, creation date, format, source, subject and rights values are
 * placed into a metadata object, the file is passed to {@code tika.parse}
 * with that metadata, and every metadata name/value pair is then stored as
 * a {@code TextField} on one document.
 * <p>
 * The document is handed to the writer exactly once, after all fields have
 * been collected. Adding it inside the per-key loop would index the same,
 * still-growing document once for every metadata name.
 *
 * @param file the file to index
 * @throws Exception if the file can not be read, parsed or indexed
 */
public void indexWithDublinCore(File file) throws Exception {
    Metadata met = new Metadata();
    met.add(TikaCoreProperties.CREATOR, "Manning");
    met.add(TikaCoreProperties.CREATOR, "Tika in Action");
    met.set(TikaCoreProperties.CREATED, new Date());
    met.set(TikaCoreProperties.FORMAT, tika.detect(file));
    met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
    met.add(TikaCoreProperties.SUBJECT, "File");
    met.add(TikaCoreProperties.SUBJECT, "Indexing");
    met.add(TikaCoreProperties.SUBJECT, "Metadata");
    met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(), "public",
        "private"), "public");
    try (InputStream is = new FileInputStream(file)) {
      tika.parse(is, met);
      Document document = new Document();
      // Copy every metadata value, including multi-valued entries, into the document.
      for (String key : met.names()) {
        for (String val : met.getValues(key)) {
          document.add(new TextField(key, val, Store.YES));
        }
      }
      // Index the completed document once, not once per metadata key.
      writer.addDocument(document);
    }
  }
}

代码示例来源:origin: stackoverflow.com

// Create a Tika facade instance.
Tika tika = new Tika();
// Build a metadata object and record the resource name in it.
Metadata metadata = new Metadata(); 
metadata.set(Metadata.RESOURCE_NAME_KEY, "myfile.name");
// NOTE(review): `metadata` is not passed to the call below, so as written it is
// unused in this snippet — confirm whether a metadata-taking overload was intended.
String text = tika.parseToString(new File("myfile.name"));

代码示例来源:origin: apache/tika

/**
 * Parses a short byte input with a Tika instance built from the
 * {@code TIKA_CFG_FILE} configuration resource, then checks the value
 * stored under {@code DummyInitializableParser.SUM_FIELD}.
 */
@Test
  public void testInitializableParser() throws Exception {
    ClassLoader loader = getClass().getClassLoader();
    URL cfgUrl = loader.getResource(TIKA_CFG_FILE);
    assert cfgUrl != null;
    Tika tika = new Tika(new TikaConfig(cfgUrl));
    Metadata parsed = new Metadata();
    byte[] input = "someString".getBytes(StandardCharsets.ISO_8859_1);
    tika.parse(TikaInputStream.get(input), parsed);
    String sum = parsed.get(DummyInitializableParser.SUM_FIELD);
    assertEquals("5", sum);
  }
}

代码示例来源:origin: apache/tika

/**
 * Parses the resource that the given URL points to and returns its
 * extracted text content.
 *
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
 */
public Reader parse(URL url) throws IOException {
  final Metadata resourceMetadata = new Metadata();
  // The opened stream goes directly to the stream-based overload.
  return parse(TikaInputStream.get(url, resourceMetadata), resourceMetadata);
}

代码示例来源:origin: apache/tika

/**
 * Determines the media type of a document from the content of its stream.
 * <p>
 * When the stream supports the
 * {@link InputStream#markSupported() mark feature}, it is marked and then
 * reset to its original position before this method returns. Only a
 * limited number of bytes are read from the stream.
 * <p>
 * This method does <em>not</em> close the given document stream.
 *
 * @param stream the document stream
 * @return detected media type
 * @throws IOException if the stream can not be read
 */
public String detect(InputStream stream) throws IOException {
  Metadata emptyMetadata = new Metadata();
  return detect(stream, emptyMetadata);
}

代码示例来源:origin: apache/tika

/**
 * Indexes the metadata obtained by passing a file to {@code tika.parse}.
 * <p>
 * Every metadata name/value pair is stored as a {@code TextField} on one
 * document. The document is handed to the writer exactly once, after all
 * fields have been collected. Adding it inside the per-key loop would index
 * the same, still-growing document once for every metadata name.
 *
 * @param file the file to index
 * @throws Exception if the file can not be read, parsed or indexed
 */
public void indexContentSpecificMet(File file) throws Exception {
  Metadata met = new Metadata();
  try (InputStream is = new FileInputStream(file)) {
    tika.parse(is, met);
    Document document = new Document();
    // Copy every metadata value, including multi-valued entries, into the document.
    for (String key : met.names()) {
      for (String val : met.getValues(key)) {
        document.add(new TextField(key, val, Store.YES));
      }
    }
    // Index the completed document once, not once per metadata key.
    writer.addDocument(document);
  }
}

代码示例来源:origin: rnewson/couchdb-lucene

/**
 * Extracts the text of an attachment stream into a field of the given
 * document and then adds Dublin Core attributes taken from the metadata.
 * <p>
 * If extraction fails with an {@code IOException} or a
 * {@code TikaException}, a warning is logged and the method returns
 * without adding the Dublin Core attributes.
 *
 * @param in the attachment content
 * @param contentType the content type recorded in the metadata before parsing
 * @param fieldName the name of the field that receives the body text
 * @param doc the document to add fields to
 * @throws IOException declared by the signature; an {@code IOException} from
 *         text extraction is caught and logged, not rethrown
 */
public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc)
    throws IOException {
  final Metadata attachmentMetadata = new Metadata();
  attachmentMetadata.set(HttpHeaders.CONTENT_TYPE, contentType);
  try {
    // Add body text.
    final String body = tika.parseToString(in, attachmentMetadata);
    doc.add(text(fieldName, body, false));
  } catch (final IOException ioFailure) {
    log.warn("Failed to index an attachment.", ioFailure);
    return;
  } catch (final TikaException parseFailure) {
    log.warn("Failed to parse an attachment.", parseFailure);
    return;
  }
  // Add DC attributes.
  addDublinCoreAttributes(attachmentMetadata, doc);
}

代码示例来源:origin: apache/tika

/**
 * Extracts the text content of the given document and returns it as a
 * string. This method closes the given input stream.
 * <p>
 * The returned string is capped at the first {@link #getMaxStringLength()}
 * characters of the extracted text so that memory use stays predictable;
 * call {@link #setMaxStringLength(int)} to change that cap.
 * <p>
 * <strong>NOTE:</strong> Most other Tika methods that accept an
 * {@link InputStream} leave it open, and the caller remains responsible
 * for closing the stream or a wrapper instance returned by Tika. This
 * method differs: it closes the stream for you as a convenience.
 *
 * @param stream the document to be parsed
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
 */
public String parseToString(InputStream stream)
    throws IOException, TikaException {
  Metadata emptyMetadata = new Metadata();
  return parseToString(stream, emptyMetadata);
}

代码示例来源:origin: org.apache.oodt/cas-metadata

Tika tika = new Tika();
tika.parse(is, tikaMet); // extract metadata
tikaMet.add("content", tika.parseToString(file)); // extract content
    + tikaMet.names().length + "]");
for (String key : tikaMet.names()) {
  met.addMetadata(key, StringEscapeUtils.escapeXml(tikaMet.get(key)));
  LOG.fine("Added tika met key [" + key + "] with value ["

代码示例来源:origin: apache/tika

/**
 * Loads the named resource from {@code /org/apache/tika/config/}, uses it as
 * the Tika configuration, and returns the metadata object that was passed
 * to a parse of that same resource.
 *
 * @param name the resource name, relative to {@code /org/apache/tika/config/}
 * @return the metadata instance handed to the parse call
 * @throws TikaException declared for configuration loading
 * @throws IOException declared for configuration loading and stream access
 * @throws SAXException declared for configuration loading
 */
private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
    final String resourcePath = "/org/apache/tika/config/"+name;
    URL configUrl = this.getClass().getResource(resourcePath);
    assertNotNull("couldn't find: "+name, configUrl);
    Tika configured = new Tika(new TikaConfig(configUrl));
    Metadata result = new Metadata();
    // The same resource is used as the configuration and as the parsed input.
    configured.parse(configUrl.openStream(), result);
    return result;
  }
}

代码示例来源:origin: apache/tika

if (MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE))) {
  reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
} else {
  reader = secondaryParser.parse(inputStream);
  double predictAuthorAge = getAgePredictorClient().predictAge(IOUtils.toString(reader));
  metadata.add(MD_KEY_ESTIMATED_AGE, Double.toString(predictAuthorAge) );

代码示例来源:origin: apache/tika

/**
 * Parses the file found at the given path and hands back its extracted
 * text content.
 * <p>
 * Metadata extracted from the document is written into the metadata
 * instance supplied by the caller.
 *
 * @param path the path of the file to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path, Metadata metadata) throws IOException {
  // The opened stream goes directly to the stream-based overload.
  return parse(TikaInputStream.get(path, metadata), metadata);
}

相关文章