【Lucene3.6.2入门系列】第11节_高亮

x33g5p2x  于2021-12-24 转载在 其他  
字(5.4k)|赞(0)|评价(0)|浏览(412)

完整版见 https://jadyer.github.io/2013/08/20/lucene-highlighter/

  1. package com.jadyer.lucene;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import org.apache.lucene.analysis.Analyzer;
  5. import org.apache.lucene.document.Document;
  6. import org.apache.lucene.document.Field;
  7. import org.apache.lucene.index.IndexReader;
  8. import org.apache.lucene.index.IndexWriter;
  9. import org.apache.lucene.index.IndexWriterConfig;
  10. import org.apache.lucene.queryParser.MultiFieldQueryParser;
  11. import org.apache.lucene.queryParser.QueryParser;
  12. import org.apache.lucene.search.IndexSearcher;
  13. import org.apache.lucene.search.Query;
  14. import org.apache.lucene.search.ScoreDoc;
  15. import org.apache.lucene.search.TopDocs;
  16. import org.apache.lucene.search.highlight.Formatter;
  17. import org.apache.lucene.search.highlight.Fragmenter;
  18. import org.apache.lucene.search.highlight.Highlighter;
  19. import org.apache.lucene.search.highlight.QueryScorer;
  20. import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
  21. import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
  22. import org.apache.lucene.store.Directory;
  23. import org.apache.lucene.store.FSDirectory;
  24. import org.apache.lucene.util.Version;
  25. import org.apache.tika.Tika;
  26. import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
  27. /**
  28. * 【Lucene3.6.2入门系列】第11节_高亮
  29. * @see 高亮功能属于Lucene的扩展功能(或者叫做贡献功能)
  30. * @see 其所需jar位于Lucene-3.6.2.zip中的/contrib/highlighter/文件夹中
  31. * @see 本例中需要以下4个jar
  32. * @see lucene-core-3.6.2.jar
  33. * @see lucene-highlighter-3.6.2.jar
  34. * @see mmseg4j-all-1.8.5-with-dic.jar
  35. * @see tika-app-1.4.jar
  36. * @create Aug 7, 2013 11:37:10 AM
  37. * @author 玄玉<http://blog.csdn.net/jadyer>
  38. */
  39. public class HelloHighLighter {
  40. private Directory directory;
  41. private IndexReader reader;
  42. public HelloHighLighter(){
  43. Document doc = null;
  44. IndexWriter writer = null;
  45. try{
  46. directory = FSDirectory.open(new File("myExample/myIndex/"));
  47. writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new MMSegAnalyzer()));
  48. writer.deleteAll();
  49. for(File myFile : new File("myExample/myFile/").listFiles()){
  50. doc = new Document();
  51. doc.add(new Field("filecontent", new Tika().parse(myFile))); //Field.Store.NO,Field.Index.ANALYZED
  52. doc.add(new Field("filepath", myFile.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  53. writer.addDocument(doc);
  54. }
  55. }catch(Exception e) {
  56. e.printStackTrace();
  57. }finally{
  58. if(null != writer){
  59. try {
  60. writer.close();
  61. } catch (IOException ce) {
  62. ce.printStackTrace();
  63. }
  64. }
  65. }
  66. }
  67. /**
  68. * 获取IndexSearcher实例
  69. */
  70. private IndexSearcher getIndexSearcher(){
  71. try {
  72. if(reader == null){
  73. reader = IndexReader.open(directory);
  74. }else{
  75. //if the index was changed since the provided reader was opened, open and return a new reader; else,return null
  76. //如果当前reader在打开期间index发生改变,则打开并返回一个新的IndexReader,否则返回null
  77. IndexReader ir = IndexReader.openIfChanged(reader);
  78. if(ir != null){
  79. reader.close(); //关闭原reader
  80. reader = ir; //赋予新reader
  81. }
  82. }
  83. return new IndexSearcher(reader);
  84. }catch(Exception e) {
  85. e.printStackTrace();
  86. }
  87. return null; //发生异常则返回null
  88. }
  89. /**
  90. * 高亮搜索
  91. * @see 高亮搜索时,不建议把高亮信息存到索引里,而是搜索到内容之后再进行高亮处理
  92. * @see 这里用的是MMSeg4j中文分词器,有关其介绍详见http://blog.csdn.net/jadyer/article/details/10049525
  93. * @param expr 搜索表达式
  94. */
  95. public void searchByHignLighter(String expr){
  96. Analyzer analyzer = new MMSegAnalyzer();
  97. IndexSearcher searcher = this.getIndexSearcher();
  98. //搜索多个Field
  99. QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_36, new String[]{"filepath", "filecontent"}, analyzer);
  100. try {
  101. Query query = parser.parse(expr);
  102. TopDocs tds = searcher.search(query, 50);
  103. for(ScoreDoc sd : tds.scoreDocs){
  104. Document doc = searcher.doc(sd.doc);
  105. //获取文档内容
  106. String filecontent = new Tika().parseToString(new File(doc.get("filepath")));
  107. System.out.println("搜索到的内容为[" + filecontent + "]");
  108. //开始高亮处理
  109. QueryScorer queryScorer = new QueryScorer(query);
  110. Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer, filecontent.length());
  111. Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
  112. Highlighter hl = new Highlighter(formatter, queryScorer);
  113. hl.setTextFragmenter(fragmenter);
  114. System.out.println("高亮后的内容为[" + hl.getBestFragment(analyzer, "filecontent", filecontent) + "]");
  115. }
  116. } catch (Exception e) {
  117. e.printStackTrace();
  118. } finally {
  119. if(null != searcher){
  120. try {
  121. searcher.close(); //记得关闭IndexSearcher
  122. } catch (IOException e) {
  123. e.printStackTrace();
  124. }
  125. }
  126. }
  127. }
  128. /**
  129. * 高亮的使用方式
  130. * @see 这里用的是MMSeg4j中文分词器,有关其介绍详见http://blog.csdn.net/jadyer/article/details/10049525
  131. */
  132. private static void testHighLighter(){
  133. String fieldName = "myinfo"; //这个可以随便写,就是起个标识的作用
  134. String text = "我来自中国黑龙江省哈尔滨市巴彦县兴隆镇长春乡民权村4队";
  135. QueryParser parser = new QueryParser(Version.LUCENE_36, fieldName, new MMSegAnalyzer());
  136. try {
  137. //MMSeg4j的new MMSegAnalyzer()默认只会对'中国'和'兴隆'进行分词,所以这里就只高亮它们俩了
  138. Query query = parser.parse("中国 兴隆");
  139. //针对查询出来的文本,查询其评分,以便于能够根据评分决定显示情况
  140. QueryScorer queryScorer = new QueryScorer(query);
  141. //对字符串或文本进行分段,SimpleSpanFragmenter构造方法的第二个参数可以指定高亮的文本长度,默认为100
  142. Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
  143. //高亮时的高亮格式,默认为<B></B>,这里指定为红色字体
  144. Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
  145. //Highlighter专门用来做高亮显示
  146. //该构造方法还有一个参数为Encoder,它有两个实现类DefaultEncoder和SimpleHTMLEncoder
  147. //SimpleHTMLEncoder可以忽略掉HTML标签,而DefaultEncoder则不会忽略HTML标签
  148. Highlighter hl = new Highlighter(formatter, queryScorer);
  149. hl.setTextFragmenter(fragmenter);
  150. System.out.println(hl.getBestFragment(new MMSegAnalyzer(), fieldName, text));
  151. } catch (Exception e) {
  152. e.printStackTrace();
  153. }
  154. }
  155. /**
  156. * 小测试一下
  157. */
  158. public static void main(String[] args) {
  159. //测试高亮的基本使用效果
  160. HelloHighLighter.testHighLighter();
  161. //测试高亮搜索的效果(测试前记得在myExample/myFile/文件夹中准备一个或多个内容包含"依赖"的doc或pdf的等文件)
  162. new HelloHighLighter().searchByHignLighter("依赖");
  163. }
  164. }

相关文章