org.apache.tika.parser.Parser类的使用及代码示例

x33g5p2x  于2022-01-26 转载在 其他  
字(11.4k)|赞(0)|评价(0)|浏览(396)

本文整理了Java中org.apache.tika.parser.Parser类的一些代码示例,展示了Parser类的具体用法。这些代码示例主要来源于GitHub/Stack Overflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Parser类的具体详情如下:
包路径:org.apache.tika.parser.Parser
类名称:Parser

Parser介绍

[英]Tika parser interface.
[中]Tika解析器接口。

代码示例

代码示例来源:origin: apache/tika

  1. public static void useAutoDetectParser() throws Exception {
  2. InputStream stream = new ByteArrayInputStream(new byte[0]);
  3. ContentHandler handler = new DefaultHandler();
  4. Metadata metadata = new Metadata();
  5. ParseContext context = new ParseContext();
  6. Parser parser = new AutoDetectParser();
  7. parser.parse(stream, handler, metadata, context);
  8. }

代码示例来源:origin: apache/tika

  1. public boolean isSupported(TikaInputStream input) throws IOException {
  2. MediaType type = detector.detect(input, new Metadata());
  3. return parser.getSupportedTypes(new ParseContext()).contains(type);
  4. }

代码示例来源:origin: javasoze/meaningfulweb

  1. private static void parseMeta(Parser parser,InputStream in,Metadata meta,Map<String,String> ogmeta) throws IOException, SAXException, TikaException{
  2. parser.parse(in, new DefaultHandler(), meta, new ParseContext());
  3. String[] propnames = meta.names();
  4. for (String propname : propnames){
  5. String val = meta.get(propname);
  6. ogmeta.put(propname, val);
  7. }
  8. }

代码示例来源:origin: jpotts/alfresco-api-java-examples

  1. InputStream stream = new FileInputStream(file);
  2. try {
  3. Metadata metadata = new Metadata();
  4. ContentHandler handler = new DefaultHandler();
  5. Parser parser = new JpegParser();
  6. ParseContext context = new ParseContext();
  7. metadata.set(Metadata.CONTENT_TYPE, mimeType);
  8. parser.parse(stream, handler, metadata, context);
  9. String lat = metadata.get("geo:lat");
  10. String lon = metadata.get("geo:long");
  11. stream.close();

代码示例来源:origin: apache/cxf

  1. return null;
  2. final Metadata metadata = new Metadata();
  3. metadata.set(HttpHeaders.CONTENT_TYPE, mediaType.toString());
  4. } else {
  5. for (Parser p : parsers) {
  6. if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
  7. continue;
  8. context = new ParseContext();
  9. if (context.get(Parser.class) == null) {
  10. context.set(Parser.class,
  11. parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
  12. parser.parse(in, handler, metadata, context);
  13. } catch (Exception ex) {
  14. parser.parse(in, handler, metadata, context);
  15. } else {
  16. throw ex;

代码示例来源:origin: apache/tika

  1. ParseContext context = new ParseContext();
  2. BodyContentHandler handler;
  3. Metadata metadata;
  4. metadata = new Metadata();
  5. handler = new BodyContentHandler();
  6. p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
  7. assertEquals("Fell back!", handler.toString());
  8. usedParsers = metadata.getValues("X-Parsed-By");
  9. assertEquals(1, usedParsers.length);
  10. assertEquals(DummyParser.class.getName(), usedParsers[0]);
  11. metadata = new Metadata();
  12. handler = new BodyContentHandler();
  13. p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
  14. assertEquals("Fell back!", handler.toString());
  15. metadata = new Metadata();
  16. handler = new BodyContentHandler();
  17. p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
  18. assertEquals("Fell back!", handler.toString());

代码示例来源:origin: apache/tika

  1. metadata.set(Metadata.CONTENT_TYPE, type);
  2. } else if (entry.getName().equals(META_NAME)) {
  3. meta.parse(zip, new DefaultHandler(), metadata, context);
  4. } else if (entry.getName().endsWith("content.xml")) {
  5. if (content instanceof OpenDocumentContentParser) {
  6. } else {
  7. content.parse(zip, handler, metadata, context);
  8. } else {
  9. content.parse(zip, handler, metadata, context);
  10. EmbeddedDocumentExtractor embeddedDocumentExtractor =
  11. EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
  12. Metadata embeddedMetadata = new Metadata();
  13. embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());

代码示例来源:origin: apache/tika

  1. xhtml.startDocument();
  2. ContentHandler childHandler = new EmbeddedContentHandler(
  3. new BodyContentHandler(xhtml));
  4. type = type.trim();
  5. metadata.set(Metadata.CONTENT_TYPE, type);
  6. } else if (entry.getName().equals("metadata.xml")) {
  7. meta.parse(zip, new DefaultHandler(), metadata, context);
  8. } else if (entry.getName().endsWith(".opf")) {
  9. meta.parse(zip, new DefaultHandler(), metadata, context);
  10. } else if (entry.getName().endsWith(".htm") ||
  11. entry.getName().endsWith(".html") ||
  12. entry.getName().endsWith(".xhtml")) {
  13. content.parse(zip, childHandler, metadata, context);

代码示例来源:origin: gentics/mesh

  1. @Override
  2. public Single<Map<String, String>> getMetadata(InputStream ins) {
  3. return Single.create(sub -> {
  4. Parser parser = new AutoDetectParser();
  5. BodyContentHandler handler = new BodyContentHandler();
  6. Metadata metadata = new Metadata();
  7. ParseContext context = new ParseContext();
  8. try {
  9. parser.parse(ins, handler, metadata, context);
  10. Map<String, String> map = new HashMap<>();
  11. String[] metadataNames = metadata.names();
  12. for (String name : metadataNames) {
  13. map.put(name, metadata.get(name));
  14. }
  15. sub.onSuccess(map);
  16. } catch (Exception e) {
  17. sub.onError(e);
  18. }
  19. // ins.close();
  20. });
  21. }

代码示例来源:origin: org.apache.beam/beam-sdks-java-io-tika

  1. @ProcessElement
  2. public void processElement(ProcessContext c) throws Exception {
  3. ReadableFile file = c.element();
  4. InputStream stream = Channels.newInputStream(file.open());
  5. try (InputStream tikaStream = TikaInputStream.get(stream)) {
  6. Parser parser =
  7. tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);
  8. ParseContext context = new ParseContext();
  9. context.set(Parser.class, parser);
  10. Metadata tikaMetadata =
  11. spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
  12. if (spec.getContentTypeHint() != null) {
  13. tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
  14. }
  15. String location = file.getMetadata().resourceId().toString();
  16. ParseResult res;
  17. ContentHandler tikaHandler = new ToTextContentHandler();
  18. try {
  19. parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
  20. res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
  21. } catch (Exception e) {
  22. res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
  23. }
  24. c.output(res);
  25. }
  26. }
  27. }

代码示例来源:origin: apache/tika

  1. MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
  2. ParseContext context = new ParseContext();
  3. BodyContentHandler handler;
  4. Metadata metadata;
  5. Set<MediaType> types = p.getSupportedTypes(context);
  6. assertEquals(2, types.size());
  7. assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));
  8. metadata = new Metadata();
  9. handler = new BodyContentHandler();
  10. p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
  11. assertEquals("Fell back!", handler.toString());
  12. metadata = new Metadata();
  13. handler = new BodyContentHandler();
  14. p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
  15. assertEquals("", handler.toString());

代码示例来源:origin: apache/tika

  1. private void parsePage(byte[] byteObject, Parser htmlParser,
  2. ContentHandler xhtml, ParseContext context) throws TikaException {// throws IOException
  3. InputStream stream = null;
  4. Metadata metadata = new Metadata();
  5. ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
  6. try {
  7. stream = new ByteArrayInputStream(byteObject);
  8. htmlParser.parse(stream, handler, metadata, context);
  9. } catch (SAXException e) {
  10. throw new RuntimeException(e);
  11. } catch (IOException e) {
  12. // Pushback overflow from tagsoup
  13. }
  14. }

代码示例来源:origin: apache/tika

  1. @Override
  2. public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
  3. throws IOException, SAXException, TikaException {
  4. TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
  5. _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext);

代码示例来源:origin: NGDATA/lilyproject

  1. BodyContentHandler ch = new BodyContentHandler(woh);
  2. Metadata metadata = new Metadata();
  3. metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType());
  4. if (blob.getName() != null) {
  5. metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName());
  6. ParseContext parseContext = new ParseContext();
  7. tikaParser.parse(is, ch, metadata, parseContext);
  8. } catch (Throwable t) {
  9. if (woh.isWriteLimitReached(t)) {
  10. String text = ch.toString();
  11. if (text.length() > 0) {
  12. result.add(text);

代码示例来源:origin: apache/tika

  1. String v = toString(obj, c.getType());
  2. if (isRichText(c)) {
  3. BodyContentHandler h = new BodyContentHandler();
  4. Metadata m = new Metadata();
  5. m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
  6. try {
  7. htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
  8. h,
  9. m, parseContext);
  10. handler.characters(h.toString());
  11. } catch (SAXException e) {

代码示例来源:origin: apache/tika

  1. public static void main(String[] args) throws Exception {
  2. ApplicationContext context = new ClassPathXmlApplicationContext(
  3. new String[]{"org/apache/tika/example/spring.xml"});
  4. Parser parser = context.getBean("tika", Parser.class);
  5. parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(UTF_8)),
  6. new WriteOutContentHandler(System.out), new Metadata(),
  7. new ParseContext());
  8. }
  9. }

代码示例来源:origin: apache/tika

  1. metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
  2. System.out.println("The MIME type (based on filename) is: ["
  3. + mimeRegistry.detect(null, metadata) + "]");
  4. metadata.set(Metadata.CONTENT_TYPE, type.toString());
  5. ContentHandler handler = new BodyContentHandler();
  6. parser.parse(stream, handler, metadata, new ParseContext());

代码示例来源:origin: ViDA-NYU/ache

  1. public ParsedData parse(InputStream stream, String fileName, String contentType) {
  2. BodyContentHandler handler = new BodyContentHandler(MAX_CHARACTERS);
  3. BoilerpipeContentHandler textHandler = new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
  4. Metadata metadata = createMetadata(fileName, contentType);
  5. ParseContext context = new ParseContext();
  6. try {
  7. parser.parse(stream, textHandler, metadata, context);
  8. Map<String, String> metadataMap = new HashMap<String, String>();
  9. for (String propertyName : metadata.names()) {
  10. metadataMap.put(propertyName, metadata.get(propertyName));
  11. }
  12. return new ParsedData(handler.toString(), metadataMap);
  13. } catch (IOException | SAXException | TikaException e) {
  14. logger.error("Failed to extract metadata using Tika.", e);
  15. return null;
  16. }
  17. }

代码示例来源:origin: com.github.lafa.tikaNoExternal/tika-parsers

  1. private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
  2. ParseContext context, EndDocumentShieldingContentHandler handler)
  3. throws IOException, SAXException, TikaException {
  4. if (entry == null) return;
  5. if (entry.getName().equals("mimetype")) {
  6. String type = IOUtils.toString(zip, UTF_8);
  7. metadata.set(Metadata.CONTENT_TYPE, type);
  8. } else if (entry.getName().equals(META_NAME)) {
  9. meta.parse(zip, new DefaultHandler(), metadata, context);
  10. } else if (entry.getName().endsWith("content.xml")) {
  11. if (content instanceof OpenDocumentContentParser) {
  12. ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
  13. } else {
  14. // Foreign content parser was set:
  15. content.parse(zip, handler, metadata, context);
  16. }
  17. } else if (entry.getName().endsWith("styles.xml")) {
  18. if (content instanceof OpenDocumentContentParser) {
  19. ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
  20. } else {
  21. // Foreign content parser was set:
  22. content.parse(zip, handler, metadata, context);
  23. }
  24. }
  25. }
  26. }

代码示例来源:origin: apache/tika

  1. context.set(Parser.class, decorator);
  2. ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler();
  3. long started = System.currentTimeMillis();
  4. parserState.recursiveParserWrapperHandler.startDocument();
  5. try {
  6. getWrappedParser().parse(stream, localHandler, metadata, context);
  7. } catch (SAXException e) {
  8. boolean wlr = isWriteLimitReached(e);
  9. throw e;
  10. metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true");
  11. } catch (Throwable e) {
  12. metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", stackTrace);
  13. throw e;
  14. } finally {
  15. long elapsedMillis = System.currentTimeMillis() - started;
  16. metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
  17. parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
  18. parserState.recursiveParserWrapperHandler.endDocument();

相关文章