【Lucene3.6.2入门系列】第03节_简述Lucene中常见的搜索功能

x33g5p2x  于2021-12-24 转载在 其他  
字(12.8k)|赞(0)|评价(0)|浏览(444)

完整版见 https://jadyer.github.io/2013/08/18/lucene-search/

  1. package com.jadyer.lucene;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.text.SimpleDateFormat;
  5. import java.util.Date;
  6. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  7. import org.apache.lucene.document.Document;
  8. import org.apache.lucene.document.Field;
  9. import org.apache.lucene.document.NumericField;
  10. import org.apache.lucene.index.IndexReader;
  11. import org.apache.lucene.index.IndexWriter;
  12. import org.apache.lucene.index.IndexWriterConfig;
  13. import org.apache.lucene.index.Term;
  14. import org.apache.lucene.queryParser.ParseException;
  15. import org.apache.lucene.queryParser.QueryParser;
  16. import org.apache.lucene.search.BooleanQuery;
  17. import org.apache.lucene.search.FuzzyQuery;
  18. import org.apache.lucene.search.IndexSearcher;
  19. import org.apache.lucene.search.NumericRangeQuery;
  20. import org.apache.lucene.search.PhraseQuery;
  21. import org.apache.lucene.search.PrefixQuery;
  22. import org.apache.lucene.search.Query;
  23. import org.apache.lucene.search.ScoreDoc;
  24. import org.apache.lucene.search.TermQuery;
  25. import org.apache.lucene.search.TermRangeQuery;
  26. import org.apache.lucene.search.TopDocs;
  27. import org.apache.lucene.search.WildcardQuery;
  28. import org.apache.lucene.search.BooleanClause.Occur;
  29. import org.apache.lucene.store.Directory;
  30. import org.apache.lucene.store.FSDirectory;
  31. import org.apache.lucene.util.Version;
  32. /**
  33. * 【Lucene3.6.2入门系列】第03节_简述Lucene中常见的搜索功能
  34. * @create Aug 1, 2013 3:54:27 PM
  35. * @author 玄玉<http://blog.csdn.net/jadyer>
  36. */
  37. public class HelloSearch {
  38. private Directory directory;
  39. private IndexReader reader;
  40. private String[] ids = {"1", "2", "3", "4", "5", "6"};
  41. private String[] names = {"Michael", "Scofield", "Tbag", "Jack", "Jade", "Jadyer"};
  42. private String[] emails = {"aa@jadyer.us", "bb@jadyer.cn", "cc@jadyer.cc", "dd@jadyer.tw", "ee@jadyer.hk", "ff@jadyer.me"};
  43. private String[] contents = {"my java blog is http://blog.csdn.net/jadyer", "my website is http://www.jadyer.cn", "my name is jadyer", "I am JavaDeveloper", "I am from Haerbin", "I like Lucene"};
  44. private int[] attachs = {9,3,5,4,1,2};
  45. private Date[] dates = new Date[ids.length];
  46. public HelloSearch(){
  47. IndexWriter writer = null;
  48. Document doc = null;
  49. SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
  50. try {
  51. dates[0] = sdf.parse("20120601");
  52. dates[1] = sdf.parse("20120603");
  53. dates[2] = sdf.parse("20120605");
  54. dates[3] = sdf.parse("20120607");
  55. dates[4] = sdf.parse("20120609");
  56. dates[5] = sdf.parse("20120611");
  57. directory = FSDirectory.open(new File("myExample/03_index/"));
  58. writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
  59. writer.deleteAll(); //创建索引之前,先把文档清空掉
  60. for(int i=0; i<ids.length; i++){ //遍历ID来创建文档
  61. doc = new Document();
  62. doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  63. doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
  64. doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
  65. doc.add(new Field("email", "test"+i+""+i+"@jadyer.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
  66. doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
  67. doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i])); //为数字加索引(第三个参数指定是否索引)
  68. doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue((i+1)*100)); //假设有多个附件
  69. doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime())); //为日期加索引
  70. writer.addDocument(doc);
  71. }
  72. } catch (Exception e) {
  73. e.printStackTrace();
  74. } finally {
  75. if(null != writer){
  76. try {
  77. writer.close();
  78. } catch (IOException ce) {
  79. ce.printStackTrace();
  80. }
  81. }
  82. }
  83. }
  84. /**
  85. * 针对分页搜索创建索引
  86. */
  87. public HelloSearch(boolean pageFlag){
  88. String[] myNames = new String[50];
  89. String[] myContents = new String[50];
  90. for(int i=0; i<50; i++){
  91. myNames[i] = "file(" + i + ")";
  92. myContents[i] = "I love JavaSE, also love Lucene(" + i + ")";
  93. }
  94. IndexWriter writer = null;
  95. Document doc = null;
  96. try {
  97. directory = FSDirectory.open(new File("myExample/03_index/"));
  98. writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
  99. writer.deleteAll();
  100. for(int i=0; i<myNames.length; i++){
  101. doc = new Document();
  102. doc.add(new Field("myname", myNames[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  103. doc.add(new Field("mycontent", myContents[i], Field.Store.YES, Field.Index.ANALYZED));
  104. writer.addDocument(doc);
  105. }
  106. } catch (IOException e) {
  107. e.printStackTrace();
  108. } finally {
  109. if(null != writer){
  110. try {
  111. writer.close();
  112. } catch (IOException ce) {
  113. ce.printStackTrace();
  114. }
  115. }
  116. }
  117. }
  118. /**
  119. * 获取IndexSearcher实例
  120. */
  121. private IndexSearcher getIndexSearcher(){
  122. try {
  123. if(reader == null){
  124. reader = IndexReader.open(directory);
  125. }else{
  126. //if the index was changed since the provided reader was opened, open and return a new reader; else,return null
  127. //如果当前reader在打开期间index发生改变,则打开并返回一个新的IndexReader,否则返回null
  128. IndexReader ir = IndexReader.openIfChanged(reader);
  129. if(ir != null){
  130. reader.close(); //关闭原reader
  131. reader = ir; //赋予新reader
  132. }
  133. }
  134. return new IndexSearcher(reader);
  135. }catch(Exception e) {
  136. e.printStackTrace();
  137. }
  138. return null; //发生异常则返回null
  139. }
  140. /**
  141. * 执行搜索操作
  142. * @param query 搜索的Query对象
  143. */
  144. private void doSearch(Query query){
  145. IndexSearcher searcher = this.getIndexSearcher();
  146. try {
  147. //第二个参数指定搜索后显示的最多的记录数,其与tds.totalHits没有联系
  148. TopDocs tds = searcher.search(query, 10);
  149. System.out.println("本次搜索到[" + tds.totalHits + "]条记录");
  150. for(ScoreDoc sd : tds.scoreDocs){
  151. Document doc = searcher.doc(sd.doc);
  152. System.out.print("文档编号=" + sd.doc + " 文档权值=" + doc.getBoost() + " 文档评分=" + sd.score + " ");
  153. System.out.print("id=" + doc.get("id") + " email=" + doc.get("email") + " name=" + doc.get("name") + " ");
  154. //获取多个同名域的方式
  155. String[] attachValues = doc.getValues("attach");
  156. for(String attach : attachValues){
  157. System.out.print("attach=" + attach + " ");
  158. }
  159. System.out.println();
  160. }
  161. } catch (IOException e) {
  162. e.printStackTrace();
  163. } finally {
  164. if(null != searcher){
  165. try {
  166. searcher.close(); //记得关闭IndexSearcher
  167. } catch (IOException e) {
  168. e.printStackTrace();
  169. }
  170. }
  171. }
  172. }
  173. /**
  174. * 精确匹配搜索
  175. * @param fieldName 域名(相当于表的字段名)
  176. * @param keyWords 搜索的关键字
  177. */
  178. public void searchByTerm(String fieldName, String keyWords){
  179. Query query = new TermQuery(new Term(fieldName, keyWords));
  180. this.doSearch(query);
  181. }
  182. /**
  183. * 基于范围的搜索
  184. * @param fieldName 域名(相当于表的字段名)
  185. * @param start 开始字符
  186. * @param end 结束字符
  187. */
  188. public void searchByTermRange(String fieldName, String start, String end){
  189. Query query = new TermRangeQuery(fieldName, start, end, true, true); //后面两个参数用于指定开区间或闭区间
  190. this.doSearch(query);
  191. }
  192. /**
  193. * 针对数字的搜索
  194. */
  195. public void searchByNumericRange(String fieldName, int min, int max){
  196. Query query = NumericRangeQuery.newIntRange(fieldName, min, max, true, true);
  197. this.doSearch(query);
  198. }
  199. /**
  200. * 基于前缀的搜索
  201. * @see 它是对Field分词后的结果进行前缀查找的结果
  202. */
  203. public void searchByPrefix(String fieldName, String prefix){
  204. Query query = new PrefixQuery(new Term(fieldName, prefix));
  205. this.doSearch(query);
  206. }
  207. /**
  208. * 基于通配符的搜索
  209. * @see *-->任意多个字符
  210. * @see ?-->一个字符
  211. */
  212. public void searchByWildcard(String fieldName, String wildcard){
  213. Query query = new WildcardQuery(new Term(fieldName, wildcard));
  214. this.doSearch(query);
  215. }
  216. /**
  217. * 模糊搜索
  218. * @see 与通配符搜索不同
  219. */
  220. public void searchByFuzzy(String fieldName, String fuzzy){
  221. Query query = new FuzzyQuery(new Term(fieldName, fuzzy));
  222. this.doSearch(query);
  223. }
  224. /**
  225. * 多条件搜索
  226. * @see 本例中搜索name值中以Ja开头,且content中包含am的内容
  227. * @see Occur.MUST------表示此条件必须为true
  228. * @see Occur.MUST_NOT--表示此条件必须为false
  229. * @see Occur.SHOULD----表示此条件非必须
  230. */
  231. public void searchByBoolean(){
  232. BooleanQuery query = new BooleanQuery();
  233. query.add(new WildcardQuery(new Term("name", "Ja*")), Occur.MUST);
  234. query.add(new TermQuery(new Term("content", "am")), Occur.MUST);
  235. this.doSearch(query);
  236. }
  237. /**
  238. * 短语搜索
  239. * @see 很遗憾的是短语查询对中文搜索没有太大的作用,但对英文搜索是很好用的,但它的开销比较大,尽量少用
  240. */
  241. public void searchByPhrase(){
  242. PhraseQuery query = new PhraseQuery();
  243. query.setSlop(1); //设置跳数
  244. query.add(new Term("content", "am")); //第一个Term
  245. query.add(new Term("content", "Haerbin")); //产生距离之后的第二个Term
  246. this.doSearch(query);
  247. }
  248. /**
  249. * 基于QueryParser的搜索
  250. */
  251. public void searchByQueryParse(){
  252. QueryParser parser = new QueryParser(Version.LUCENE_36, "content", new StandardAnalyzer(Version.LUCENE_36));
  253. Query query = null;
  254. try {
  255. // query = parser.parse("Haerbin"); //搜索content中包含[Haerbin]的记录
  256. // query = parser.parse("I AND Haerbin"); //搜索content中包含[I]和[Haerbin]的记录
  257. // query = parser.parse("Lucene OR Haerbin"); //搜索content中包含[Lucene]或者[Haerbin]的记录
  258. // query = parser.parse("Lucene Haerbin"); //搜索content中包含[Lucene]或者[Haerbin]的记录
  259. // parser.setDefaultOperator(Operator.AND); //将空格的默认操作OR修改为AND
  260. // //1)如果name域在索引时,不进行分词,那么无论这里写成[name:Jadyer]还是[name:jadyer],最后得到的都是0条记录
  261. // //2)由于name原值为大写[J],若索引时不对name分词,除非修改name原值为小写[j],并且搜索[name:jadyer]才能得到记录
  262. // query = parser.parse("name:Jadyer"); //修改搜索域为name=Jadyer的记录
  263. // query = parser.parse("name:Ja*"); //支持通配符
  264. // query = parser.parse("\"I am\""); //搜索content中包含[I am]的记录(注意不能使用parse("content:'I am'"))
  265. // parser.setAllowLeadingWildcard(true); //设置允许[*]或[?]出现在查询字符的第一位,即[name:*de],否则[name:*de]会报异常
  266. // query = parser.parse("name:*de"); //Lucene默认的第一个字符不允许为通配符,因为这样效率比较低
  267. // //parse("+am +name:Jade")--------------搜索content中包括[am]的,并且name=Jade的记录
  268. // //parse("am AND NOT name:Jade")--------搜索content中包括[am]的,并且nam不是Jade的记录
  269. // //parse("(blog OR am) AND name:Jade")--搜索content中包括[blog]或者[am]的,并且name=Jade的记录
  270. // query = parser.parse("-name:Jack +I"); //搜索content中包括[I]的,并且name不是Jack的记录(加减号要放到域说明的前面)
  271. // query = parser.parse("id:[1 TO 3]"); //搜索id值从1到3的记录(TO必须大写,且这种方式没有办法匹配数字)
  272. // query = parser.parse("id:{1 TO 3}"); //搜索id=2的记录
  273. query = parser.parse("name:Jadk~"); //模糊搜索
  274. } catch (ParseException e) {
  275. e.printStackTrace();
  276. }
  277. this.doSearch(query);
  278. }
  279. /**
  280. * 普通的分页搜索
  281. * @see 适用于lucene3.5之前
  282. * @param expr 搜索表达式
  283. * @param pageIndex 页码
  284. * @param pageSize 分页大小
  285. */
  286. public void searchPage(String expr, int pageIndex, int pageSize){
  287. IndexSearcher searcher = this.getIndexSearcher();
  288. QueryParser parser = new QueryParser(Version.LUCENE_36, "mycontent", new StandardAnalyzer(Version.LUCENE_36));
  289. try {
  290. Query query = parser.parse(expr);
  291. TopDocs tds = searcher.search(query, pageIndex*pageSize);
  292. ScoreDoc[] sds = tds.scoreDocs;
  293. for(int i=(pageIndex-1)*pageSize; i<pageIndex*pageSize; i++){
  294. Document doc = searcher.doc(sds[i].doc);
  295. System.out.println("文档编号:" + sds[i].doc + "-->" + doc.get("myname") + "-->" + doc.get("mycontent"));
  296. }
  297. } catch (Exception e) {
  298. e.printStackTrace();
  299. } finally {
  300. if(null != searcher){
  301. try {
  302. searcher.close();
  303. } catch (IOException e) {
  304. e.printStackTrace();
  305. }
  306. }
  307. }
  308. }
  309. /**
  310. * 基于searchAfter的分页搜索
  311. * @see 适用于Lucene3.5
  312. * @param expr 搜索表达式
  313. * @param pageIndex 页码
  314. * @param pageSize 分页大小
  315. */
  316. public void searchPageByAfter(String expr, int pageIndex, int pageSize){
  317. IndexSearcher searcher = this.getIndexSearcher();
  318. QueryParser parser = new QueryParser(Version.LUCENE_36, "mycontent", new StandardAnalyzer(Version.LUCENE_36));
  319. try {
  320. Query query = parser.parse(expr);
  321. TopDocs tds = searcher.search(query, (pageIndex-1)*pageSize);
  322. //使用IndexSearcher.searchAfter()搜索,该方法第一个参数为上一页记录中的最后一条记录
  323. if(pageIndex > 1){
  324. tds = searcher.searchAfter(tds.scoreDocs[(pageIndex-1)*pageSize-1], query, pageSize);
  325. }else{
  326. tds = searcher.searchAfter(null, query, pageSize);
  327. }
  328. for(ScoreDoc sd : tds.scoreDocs){
  329. Document doc = searcher.doc(sd.doc);
  330. System.out.println("文档编号:" + sd.doc + "-->" + doc.get("myname") + "-->" + doc.get("mycontent"));
  331. }
  332. } catch (Exception e) {
  333. e.printStackTrace();
  334. } finally {
  335. if(null != searcher){
  336. try {
  337. searcher.close();
  338. } catch (IOException e) {
  339. e.printStackTrace();
  340. }
  341. }
  342. }
  343. }
  344. }

下面是JUnit4.x编写的测试

  1. package com.jadyer.test;
  2. import java.io.File;
  3. import org.junit.Before;
  4. import org.junit.Test;
  5. import com.jadyer.lucene.HelloSearch;
  6. public class HelloSearchTest {
  7. private HelloSearch hello;
  8. @Before
  9. public void init(){
  10. hello = new HelloSearch();
  11. }
  12. @Test
  13. public void searchByTerm(){
  14. hello.searchByTerm("content", "my");
  15. }
  16. @Test
  17. public void searchByTermRange(){
  18. hello.searchByTermRange("name", "M", "o");
  19. }
  20. @Test
  21. public void searchByNumericRange(){
  22. hello.searchByNumericRange("attach", 2, 5);
  23. }
  24. @Test
  25. public void searchByPrefix(){
  26. hello.searchByPrefix("content", "b");
  27. }
  28. @Test
  29. public void searchByWildcard(){
  30. hello.searchByWildcard("name", "Ja??er");
  31. }
  32. @Test
  33. public void searchByFuzzy(){
  34. hello.searchByFuzzy("name", "Jadk");
  35. }
  36. @Test
  37. public void searchByBoolean(){
  38. hello.searchByBoolean();
  39. }
  40. @Test
  41. public void searchByPhrase(){
  42. hello.searchByPhrase();
  43. }
  44. @Test
  45. public void searchByQueryParse(){
  46. hello.searchByQueryParse();
  47. }
  48. @Test
  49. public void searchPage(){
  50. for(File file : new File("myExample/03_index/").listFiles()){
  51. file.delete();
  52. }
  53. hello = new HelloSearch(true);
  54. hello.searchPage("mycontent:javase", 2, 10);
  55. }
  56. @Test
  57. public void searchPageByAfter(){
  58. for(File file : new File("myExample/03_index/").listFiles()){
  59. file.delete();
  60. }
  61. hello = new HelloSearch(true);
  62. hello.searchPageByAfter("mycontent:javase", 3, 10);
  63. }
  64. }

相关文章