【Lucene3.6.2入门系列】第07节_高级搜索之普通Filter和自定义Filter

x33g5p2x  于2021-12-24 转载在 其他  
字(7.4k)|赞(0)|评价(0)|浏览(428)

完整版见:https://jadyer.github.io/2013/08/19/lucene-advanced-search-filter/

  1. package com.jadyer.lucene;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.text.ParseException;
  5. import java.text.SimpleDateFormat;
  6. import java.util.Date;
  7. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  8. import org.apache.lucene.document.Document;
  9. import org.apache.lucene.document.Field;
  10. import org.apache.lucene.document.NumericField;
  11. import org.apache.lucene.index.IndexReader;
  12. import org.apache.lucene.index.IndexWriter;
  13. import org.apache.lucene.index.IndexWriterConfig;
  14. import org.apache.lucene.queryParser.QueryParser;
  15. import org.apache.lucene.search.Filter;
  16. import org.apache.lucene.search.IndexSearcher;
  17. import org.apache.lucene.search.ScoreDoc;
  18. import org.apache.lucene.search.TopDocs;
  19. import org.apache.lucene.store.Directory;
  20. import org.apache.lucene.store.FSDirectory;
  21. import org.apache.lucene.util.Version;
  22. import com.jadyer.custom.MyFilter;
  23. /**
  24. * 【Lucene3.6.2入门系列】第07节_高级搜索之普通Filter和自定义Filter
  25. * @create Aug 19, 2013 11:13:40 AM
  26. * @author 玄玉<http://blog.csdn.net/jadyer>
  27. */
  28. public class AdvancedSearchByFilter {
  29. private Directory directory;
  30. private IndexReader reader;
  31. public AdvancedSearchByFilter(){
  32. /**文件大小*/
  33. int[] sizes = {90, 10, 20, 10, 60, 50};
  34. /**文件名*/
  35. String[] names = {"Michael.java", "Scofield.ini", "Tbag.txt", "Jack", "Jade", "Jadyer"};
  36. /**文件内容*/
  37. String[] contents = {"my java blog is http://blog.csdn.net/jadyer",
  38. "my Java Website is http://www.jadyer.cn",
  39. "my name is jadyer",
  40. "I am a Java Developer",
  41. "I am from Haerbin",
  42. "I like java of Lucene"};
  43. /**文件日期*/
  44. Date[] dates = new Date[sizes.length];
  45. SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd HH:mm:ss");
  46. IndexWriter writer = null;
  47. Document doc = null;
  48. try {
  49. dates[0] = sdf.parse("20130407 15:25:30");
  50. dates[1] = sdf.parse("20130407 16:30:45");
  51. dates[2] = sdf.parse("20130213 11:15:25");
  52. dates[3] = sdf.parse("20130808 09:30:55");
  53. dates[4] = sdf.parse("20130526 13:54:22");
  54. dates[5] = sdf.parse("20130701 17:35:34");
  55. directory = FSDirectory.open(new File("myExample/01_index/"));
  56. writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
  57. writer.deleteAll();
  58. for(int i=0; i<sizes.length; i++){
  59. doc = new Document();
  60. doc.add(new NumericField("size",Field.Store.YES, true).setIntValue(sizes[i]));
  61. doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
  62. doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
  63. doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));
  64. //为每个文档添加一个fileID(与ScoreDoc.doc不同),专门在自定义Filter时使用
  65. doc.add(new Field("fileID", String.valueOf(i), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  66. writer.addDocument(doc);
  67. }
  68. } catch (Exception e) {
  69. e.printStackTrace();
  70. } finally {
  71. if(null != writer){
  72. try {
  73. writer.close();
  74. } catch (IOException ce) {
  75. ce.printStackTrace();
  76. }
  77. }
  78. }
  79. }
  80. /**
  81. * 获取IndexReader实例
  82. */
  83. private IndexReader getIndexReader(){
  84. try {
  85. if(reader == null){
  86. reader = IndexReader.open(directory);
  87. }else{
  88. //if the index was changed since the provided reader was opened, open and return a new reader; else,return null
  89. //如果当前reader在打开期间index发生改变,则打开并返回一个新的IndexReader,否则返回null
  90. IndexReader ir = IndexReader.openIfChanged(reader);
  91. if(ir != null){
  92. reader.close(); //关闭原reader
  93. reader = ir; //赋予新reader
  94. }
  95. }
  96. return reader;
  97. }catch(Exception e) {
  98. e.printStackTrace();
  99. }
  100. return null; //发生异常则返回null
  101. }
  102. /**
  103. * 搜索过滤
  104. */
  105. public void searchByFilter(String expr, Filter filter){
  106. IndexSearcher searcher = new IndexSearcher(this.getIndexReader());
  107. QueryParser parser = new QueryParser(Version.LUCENE_36, "content", new StandardAnalyzer(Version.LUCENE_36));
  108. TopDocs tds = null;
  109. try {
  110. if(null == filter){
  111. tds = searcher.search(parser.parse(expr), 10);
  112. }else{
  113. tds = searcher.search(parser.parse(expr), filter, 10);
  114. }
  115. for(ScoreDoc sd : tds.scoreDocs){
  116. Document doc = searcher.doc(sd.doc);
  117. System.out.print("文档编号=" + sd.doc + " 文档权值=" + doc.getBoost() + " 文档评分=" + sd.score + " ");
  118. System.out.println("fileID=" + doc.get("fileID") + " size=" + doc.get("size") + " date=" + new SimpleDateFormat("yyyyMMdd HH:mm:ss").format(new Date(Long.parseLong(doc.get("date")))) + " name=" + doc.get("name"));
  119. }
  120. } catch (Exception e) {
  121. e.printStackTrace();
  122. } finally {
  123. if(searcher != null){
  124. try {
  125. searcher.close();
  126. } catch (IOException e) {
  127. e.printStackTrace();
  128. }
  129. }
  130. }
  131. }
  132. /**
  133. * 测试一下过滤效果
  134. */
  135. public static void main(String[] args) throws ParseException {
  136. AdvancedSearchByFilter advancedSearch = new AdvancedSearchByFilter();
  137. // //过滤文件名首字母从'h'到'n'的记录(注意hn要小写)
  138. // advancedSearch.searchByFilter("Java", new TermRangeFilter("name", "h", "n", true, true));
  139. // //过滤文件大小在30到80以内的记录
  140. // advancedSearch.searchByFilter("Java", NumericRangeFilter.newIntRange("size", 30, 80, true, true));
  141. // //过滤文件日期在20130701 00:00:00到20130808 23:59:59之间的记录
  142. // Long min = Long.valueOf(new SimpleDateFormat("yyyyMMdd").parse("20130701").getTime());
  143. // Long max = Long.valueOf(new SimpleDateFormat("yyyyMMdd HH:mm:ss").parse("20130808 23:59:59").getTime());
  144. // advancedSearch.searchByFilter("Java", NumericRangeFilter.newLongRange("date", min, max, true, true));
  145. // //过滤文件名以'ja'打头的(注意ja要小写)
  146. // advancedSearch.searchByFilter("Java", new QueryWrapperFilter(new WildcardQuery(new Term("name", "ja*"))));
  147. //自定义Filter
  148. advancedSearch.searchByFilter("Java", new MyFilter());
  149. }
  150. }

下面是自定义的MyFilter.java

  1. package com.jadyer.custom;
  2. import java.io.IOException;
  3. import org.apache.lucene.index.IndexReader;
  4. import org.apache.lucene.index.Term;
  5. import org.apache.lucene.index.TermDocs;
  6. import org.apache.lucene.search.DocIdSet;
  7. import org.apache.lucene.search.Filter;
  8. import org.apache.lucene.util.OpenBitSet;
  9. /**
  10. * 自定义Filter
  11. * @see ------------------------------------------------------------------------------------------
  12. * @see 本例的应用场景
  13. * @see 假设很多的数据,然后删除了其中的某几条数据,此时在接受搜索请求时为保证不会搜索到已删除的数据
  14. * @see 那么可以更新索引,但更新索引会消耗很多时间(因为数据量大),而又要保证已删除的数据不会被搜索到
  15. * @see 此时就可以自定义Filter,原理即搜索过程中,当发现此记录为已删除记录,则不添加到返回的搜索结果集中
  16. * @see ------------------------------------------------------------------------------------------
  17. * @see 自定义Filter步骤如下
  18. * @see 1)继承Filter类并重写getDocIdSet()方法
  19. * @see 2)根据实际过滤要求返回新的DocIdSet对象
  20. * @see ------------------------------------------------------------------------------------------
  21. * @see DocIdSet小解
  22. * @see 这里Filter干的活其实就是创建一个DocIdSet,而DocIdSet其实就是一个数组,可以理解为其中只存放0或1的值
  23. * @see 每个搜索出来的Document都有一个文档编号,所以搜索出来多少个Document,那么DocIdSet中就会有多少条记录
  24. * @see 而DocIdSet中每一条记录的索引号与文档编号是一一对应的
  25. * @see 所以当DocIdSet中的记录为1时,则对应文档编号的Document就会被添加到TopDocs中,为0就会被过滤掉
  26. * @see ------------------------------------------------------------------------------------------
  27. * @create Aug 6, 2013 7:28:53 PM
  28. * @author 玄玉<http://blog.csdn.net/jadyer>
  29. */
  30. public class MyFilter extends Filter {
  31. private static final long serialVersionUID = -8955061358165068L;
  32. //假设这是已删除记录的fileID值的集合
  33. private String[] deleteFileIDs = {"1", "3"};
  34. @Override
  35. public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  36. //创建一个DocIdSet的子类OpenBitSet(创建之后默认所有元素都是0),传的参数就是本次"搜索到的"元素数目
  37. OpenBitSet obs = new OpenBitSet(reader.maxDoc());
  38. //先把元素填满,即全部设置为1
  39. obs.set(0, reader.maxDoc());
  40. //用于保存已删除元素的文档编号
  41. int[] docs = new int[1];
  42. for(String deleteDataID : deleteFileIDs){
  43. //获取已删除元素对应的TermDocs
  44. TermDocs tds = reader.termDocs(new Term("fileID", deleteDataID));
  45. //将已删除元素的文档编号放到docs中,将其出现的频率放到freqs中,最后返回查询出来的元素数目
  46. int count = tds.read(docs, new int[1]);
  47. if(count == 1){
  48. //将这个位置docs[0]的元素删除
  49. obs.clear(docs[0]);
  50. }
  51. }
  52. return obs;
  53. }
  54. }

相关文章