[英]Facade class for accessing Tika functionality. This class hides much of the underlying complexity of the lower-level Tika classes and provides simple methods for many common parsing and type detection operations.


代码示例来源:origin: apache/tika

public TrecDocument summarize(File file) throws FileNotFoundException,
    IOException, TikaException {
  Tika tika = new Tika();
  Metadata met = new Metadata();
  String contents = tika.parseToString(new FileInputStream(file), met);
  return new TrecDocument(met.get(TikaCoreProperties.RESOURCE_NAME_KEY), contents,

代码示例来源:origin: apache/tika

 * Detects the media type of the file at the given path. The type
 * detection is based on the document content and a potential known
 * file extension.
 * <p>
 * Use the {@link #detect(String)} method when you want to detect the
 * type of the document without actually accessing the file.
 * @param path the path of the file
 * @return detected media type
 * @throws IOException if the file can not be read
public String detect(Path path) throws IOException {
  Metadata metadata = new Metadata();
  try (InputStream stream = TikaInputStream.get(path, metadata)) {
    return detect(stream, metadata);

代码示例来源:origin: apache/tika

 * Parses the file at the given path and returns the extracted text content.
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
public Reader parse(Path path) throws IOException {
  return parse(path, new Metadata());


Tika tika = new Tika();
File file = ...
String mimeType = tika.detect(file);

代码示例来源:origin: apache/tika

private void version() {
  System.out.println(new Tika().toString());

代码示例来源:origin: apache/tika

public static void main(String[] args) throws Exception {
    // Create a Tika instance with the default configuration
    Tika tika = new Tika();

    // Parse all given files and print out the extracted
    // text content
    for (String file : args) {
      String text = tika.parseToString(new File(file));

代码示例来源:origin: apache/tika

 * Parses the file at the given path and returns the extracted text content.
 * <p>
 * To avoid unpredictable excess memory use, the returned string contains
 * at most the first {@link #getMaxStringLength()} characters extracted
 * from the input document. Use the {@link #setMaxStringLength(int)}
 * method to adjust this limit.
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read
 * @throws TikaException if the file can not be parsed
public String parseToString(Path path) throws IOException, TikaException {
  Metadata metadata = new Metadata();
  InputStream stream = TikaInputStream.get(path, metadata);
  return parseToString(stream, metadata);

代码示例来源:origin: apache/tika

 * Detects the media type of the given document. The type detection is
 * based on the content of the given document stream and the name of the
 * document.
 * <p>
 * If the document stream supports the
 * {@link InputStream#markSupported() mark feature}, then the stream is
 * marked and reset to the original position before this method returns.
 * Only a limited number of bytes are read from the stream.
 * <p>
 * The given document stream is <em>not</em> closed by this method.
 * @since Apache Tika 0.9
 * @param stream the document stream
 * @param name document name
 * @return detected media type
 * @throws IOException if the stream can not be read
public String detect(InputStream stream, String name) throws IOException {
  Metadata metadata = new Metadata();
  metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
  return detect(stream, metadata);

代码示例来源:origin: apache/tika

public void indexWithDublinCore(File file) throws Exception {
    Metadata met = new Metadata();
    met.add(TikaCoreProperties.CREATOR, "Manning");
    met.add(TikaCoreProperties.CREATOR, "Tika in Action");
    met.set(TikaCoreProperties.CREATED, new Date());
    met.set(TikaCoreProperties.FORMAT, tika.detect(file));
    met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
    met.add(TikaCoreProperties.SUBJECT, "File");
    met.add(TikaCoreProperties.SUBJECT, "Indexing");
    met.add(TikaCoreProperties.SUBJECT, "Metadata");
    met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(), "public",
        "private"), "public");
    try (InputStream is = new FileInputStream(file)) {
      tika.parse(is, met);
      Document document = new Document();
      for (String key : met.names()) {
        String[] values = met.getValues(key);
        for (String val : values) {
          document.add(new TextField(key, val, Store.YES));


Tika tika = new Tika();
Metadata metadata = new Metadata(); 
metadata.set(Metadata.RESOURCE_NAME_KEY, "");
String text = tika.parseToString(new File(""));

代码示例来源:origin: apache/tika

  public void testInitializableParser() throws Exception {
    URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
    assert configFileUrl != null;
    TikaConfig config = new TikaConfig(configFileUrl);
    Tika tika = new Tika(config);
    Metadata md = new Metadata();
    tika.parse(TikaInputStream.get("someString".getBytes(StandardCharsets.ISO_8859_1)), md);
    assertEquals("5", md.get(DummyInitializableParser.SUM_FIELD));

代码示例来源:origin: apache/tika

 * Parses the resource at the given URL and returns the extracted
 * text content.
 * @param url the URL of the resource to be parsed
 * @return extracted text content
 * @throws IOException if the resource can not be read or parsed
public Reader parse(URL url) throws IOException {
  Metadata metadata = new Metadata();
  InputStream stream = TikaInputStream.get(url, metadata);
  return parse(stream, metadata);

代码示例来源:origin: apache/tika

 * Detects the media type of the given document. The type detection is
 * based on the content of the given document stream.
 * <p>
 * If the document stream supports the
 * {@link InputStream#markSupported() mark feature}, then the stream is
 * marked and reset to the original position before this method returns.
 * Only a limited number of bytes are read from the stream.
 * <p>
 * The given document stream is <em>not</em> closed by this method.
 * @param stream the document stream
 * @return detected media type
 * @throws IOException if the stream can not be read
public String detect(InputStream stream) throws IOException {
  return detect(stream, new Metadata());

代码示例来源:origin: apache/tika

public void indexContentSpecificMet(File file) throws Exception {
  Metadata met = new Metadata();
  try (InputStream is = new FileInputStream(file)) {
    tika.parse(is, met);
    Document document = new Document();
    for (String key : met.names()) {
      String[] values = met.getValues(key);
      for (String val : values) {
        document.add(new TextField(key, val, Store.YES));

代码示例来源:origin: rnewson/couchdb-lucene

public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc)
    throws IOException {
  final Metadata md = new Metadata();
  md.set(HttpHeaders.CONTENT_TYPE, contentType);
  try {
    // Add body text.
    doc.add(text(fieldName, tika.parseToString(in, md), false));
  } catch (final IOException e) {
    log.warn("Failed to index an attachment.", e);
  } catch (final TikaException e) {
    log.warn("Failed to parse an attachment.", e);
  // Add DC attributes.
  addDublinCoreAttributes(md, doc);

代码示例来源:origin: apache/tika

 * Parses the given document and returns the extracted text content.
 * The given input stream is closed by this method.
 * <p>
 * To avoid unpredictable excess memory use, the returned string contains
 * at most the first {@link #getMaxStringLength()} characters extracted
 * from the input document. Use the {@link #setMaxStringLength(int)}
 * method to adjust this limit.
 * <p>
 * <strong>NOTE:</strong> Unlike most other Tika methods that take an
 * {@link InputStream}, this method will close the given stream for
 * you as a convenience. With other methods you are still responsible
 * for closing the stream or a wrapper instance returned by Tika.
 * @param stream the document to be parsed
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
public String parseToString(InputStream stream)
    throws IOException, TikaException {
  return parseToString(stream, new Metadata());

代码示例来源:origin: org.apache.oodt/cas-metadata

Tika tika = new Tika();
tika.parse(is, tikaMet); // extract metadata
tikaMet.add("content", tika.parseToString(file)); // extract content
    + tikaMet.names().length + "]");
for (String key : tikaMet.names()) {
  met.addMetadata(key, StringEscapeUtils.escapeXml(tikaMet.get(key)));
  LOG.fine("Added tika met key [" + key + "] with value ["

代码示例来源:origin: apache/tika

private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
    URL url = this.getClass().getResource("/org/apache/tika/config/"+name);
    assertNotNull("couldn't find: "+name, url);
    TikaConfig tikaConfig = new TikaConfig(url);
    Tika tika = new Tika(tikaConfig);
    Metadata metadata = new Metadata();
    tika.parse(url.openStream(), metadata);
    return metadata;

代码示例来源:origin: apache/tika

if (MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE))) {
  reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
} else {
  reader = secondaryParser.parse(inputStream);
  double predictAuthorAge = getAgePredictorClient().predictAge(IOUtils.toString(reader));
  metadata.add(MD_KEY_ESTIMATED_AGE, Double.toString(predictAuthorAge) );

代码示例来源:origin: apache/tika

 * Parses the file at the given path and returns the extracted text content.
 * <p>
 * Metadata information extracted from the document is returned in
 * the supplied metadata instance.
 * @param path the path of the file to be parsed
 * @param metadata where document's metadata will be populated
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
public Reader parse(Path path, Metadata metadata) throws IOException {
  InputStream stream = TikaInputStream.get(path, metadata);
  return parse(stream, metadata);
