【Lucene3.6.2入门系列】第02节_针对索引文件的CRUD

x33g5p2x  于2021-12-24 转载在 其他  
字(9.7k)|赞(0)|评价(0)|浏览(429)

完整版见 https://jadyer.github.io/2013/08/18/lucene-index/

  1. package com.jadyer.lucene;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.text.SimpleDateFormat;
  5. import java.util.Date;
  6. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  7. import org.apache.lucene.document.Document;
  8. import org.apache.lucene.document.Field;
  9. import org.apache.lucene.document.NumericField;
  10. import org.apache.lucene.index.IndexReader;
  11. import org.apache.lucene.index.IndexWriter;
  12. import org.apache.lucene.index.IndexWriterConfig;
  13. import org.apache.lucene.index.Term;
  14. import org.apache.lucene.search.IndexSearcher;
  15. import org.apache.lucene.search.Query;
  16. import org.apache.lucene.search.ScoreDoc;
  17. import org.apache.lucene.search.TermQuery;
  18. import org.apache.lucene.search.TopDocs;
  19. import org.apache.lucene.store.Directory;
  20. import org.apache.lucene.store.FSDirectory;
  21. import org.apache.lucene.util.Version;
  22. /**
  23. * 【Lucene3.6.2入门系列】第02节_针对索引文件的CRUD
  24. * @see =============================================================================================================
  25. * @see Lucene官网:http://lucene.apache.org
  26. * @see Lucene下载:http://archive.apache.org/dist/lucene/java/
  27. * @see Lucene文档:http://wiki.apache.org/lucene-java/
  28. * @see =============================================================================================================
  29. * @see 使用Luke查看分词信息(http://code.google.com/p/luke/)
  30. * @see 1)引言:每一个Lucene版本都会有一个相应的Luke文件
  31. * @see 2)打开:双击或java -jar lukeall-3.5.0.jar
  32. * @see 3)选择索引的存放目录后点击OK即可
  33. * @see 7)如果我们的索引有改变,可以点击右侧的Re-open按钮重新载入索引
  34. * @see 4)Luke界面右下角的Top ranking terms窗口中显示的就是分词信息。其中Rank列表示出现频率
  35. * @see 5)Luke菜单下的Documents选项卡中显示的就是文档信息,我们可以根据文档序号来浏览(点击向左和向右的方向箭头)
  36. * @see 6)Luke菜单下的Search选项卡中可以根据我们输入的表达式来查文档内容
  37. * @see 比如在Enter search expression here:输入content:my,再在右侧点击一个黑色粗体字的Search大按钮即可
  38. * @see =============================================================================================================
  39. * @create Jun 30, 2012 4:34:09 PM
  40. * @author 玄玉<http://blog.csdn.net/jadyer>
  41. */
  42. public class HelloIndex {
  43. /*
  44. * 定义一组数据,用来演示搜索(这里有一封邮件为例)
  45. * 假设每一个变量代表一个Document,这里就定义了6个Document
  46. */
  47. //邮件编号
  48. private String[] ids = {"1", "2", "3", "4", "5", "6"};
  49. //邮件主题
  50. private String[] names = {"Michael", "Scofield", "Tbag", "Jack", "Jade", "Jadyer"};
  51. //邮件地址
  52. private String[] emails = {"aa@jadyer.us", "bb@jadyer.cn", "cc@jadyer.cc", "dd@jadyer.tw", "ee@jadyer.hk", "ff@jadyer.me"};
  53. //邮件内容
  54. private String[] contents = {"my blog", "my website", "my name", "I am JavaDeveloper", "I am from Haerbin", "I like Lucene"};
  55. //邮件附件(为数字和日期加索引,与,字符串加索引的方式不同)
  56. private int[] attachs = {9,3,5,4,1,2};
  57. //邮件日期
  58. private Date[] dates = new Date[ids.length];
  59. //它的创建是比较耗时耗资源的,所以这里只让它创建一次,此时reader处于整个生命周期中,实际应用中也可能直接放到ApplicationContext里面
  60. private static IndexReader reader = null;
  61. private Directory directory = null;
  62. public HelloIndex(){
  63. SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
  64. try {
  65. dates[0] = sdf.parse("20120601");
  66. dates[1] = sdf.parse("20120603");
  67. dates[2] = sdf.parse("20120605");
  68. dates[3] = sdf.parse("20120607");
  69. dates[4] = sdf.parse("20120609");
  70. dates[5] = sdf.parse("20120611");
  71. directory = FSDirectory.open(new File("myExample/02_index/"));
  72. } catch (Exception e) {
  73. e.printStackTrace();
  74. }
  75. }
  76. /**
  77. * 获取IndexReader实例
  78. */
  79. private IndexReader getIndexReader(){
  80. try {
  81. if(reader == null){
  82. reader = IndexReader.open(directory);
  83. }else{
  84. //if the index was changed since the provided reader was opened, open and return a new reader; else,return null
  85. //如果当前reader在打开期间index发生改变,则打开并返回一个新的IndexReader,否则返回null
  86. IndexReader ir = IndexReader.openIfChanged(reader);
  87. if(ir != null){
  88. reader.close(); //关闭原reader
  89. reader = ir; //赋予新reader
  90. }
  91. }
  92. return reader;
  93. }catch(Exception e) {
  94. e.printStackTrace();
  95. }
  96. return null; //发生异常则返回null
  97. }
  98. /**
  99. * 通过IndexReader获取文档数量
  100. */
  101. public void getDocsCount(){
  102. System.out.println("maxDocs:" + this.getIndexReader().maxDoc());
  103. System.out.println("numDocs:" + this.getIndexReader().numDocs());
  104. System.out.println("deletedDocs:" + this.getIndexReader().numDeletedDocs());
  105. }
  106. /**
  107. * 创建索引
  108. */
  109. public void createIndex(){
  110. IndexWriter writer = null;
  111. Document doc = null;
  112. try{
  113. writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
  114. writer.deleteAll(); //创建索引之前,先把文档清空掉
  115. for(int i=0; i<ids.length; i++){ //遍历ID来创建文档
  116. doc = new Document();
  117. doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  118. doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  119. doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
  120. doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
  121. doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i])); //为数字加索引(第三个参数指定是否索引)
  122. doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime())); //为日期加索引
  123. /*
  124. * 建立索引时加权
  125. * 定义排名规则,即加权,这里是为指定邮件名结尾的emails加权
  126. */
  127. if(emails[i].endsWith("jadyer.cn")){
  128. doc.setBoost(2.0f);
  129. }else if(emails[i].endsWith("jadyer.me")){
  130. doc.setBoost(1.5f); //为文档加权....默认为1.0,权值越高则排名越高,显示得就越靠前
  131. }else{
  132. doc.setBoost(0.5f); //注意它的参数类型是Float
  133. }
  134. writer.addDocument(doc);
  135. }
  136. }catch(Exception e) {
  137. e.printStackTrace();
  138. }finally{
  139. if(null != writer){
  140. try {
  141. writer.close();
  142. } catch (IOException ce) {
  143. ce.printStackTrace();
  144. }
  145. }
  146. }
  147. }
  148. /**
  149. * 搜索文件
  150. */
  151. public void searchFile(){
  152. IndexSearcher searcher = new IndexSearcher(this.getIndexReader());
  153. Query query = new TermQuery(new Term("content", "my")); //精确搜索:搜索"content"中包含"my"的文档
  154. try{
  155. TopDocs tds = searcher.search(query, 10);
  156. for(ScoreDoc sd : tds.scoreDocs){
  157. Document doc = searcher.doc(sd.doc); //sd.doc得到的是文档的序号
  158. //doc.getBoost()得到的权值与创建索引时设置的权值之间是不相搭的,创建索引时的权值的查看需要使用Luke工具
  159. // 之所以这样,是因为这里的Document对象(是获取到的)与创建索引时的Document对象,不是同一个对象
  160. //sd.score得到的是该文档的评分,该评分规则的公式是比较复杂的,它主要与文档的权值和出现次数成正比
  161. System.out.print("(" + sd.doc + "|" + doc.getBoost() + "|" + sd.score + ")" + doc.get("name") + "[" + doc.get("email") + "]-->");
  162. System.out.println(doc.get("id") + "," + doc.get("attach") + "," + new SimpleDateFormat("yyyyMMdd").format(new Date(Long.parseLong(doc.get("date")))));
  163. }
  164. }catch(Exception e){
  165. e.printStackTrace();
  166. }finally{
  167. if(null != searcher){
  168. try {
  169. searcher.close();
  170. } catch (IOException e) {
  171. e.printStackTrace();
  172. }
  173. }
  174. }
  175. }
  176. /**
  177. * 更新索引
  178. * @see Lucene其实并未提供更新索引的方法,这里的更新操作内部是先删除再添加的方式
  179. * @see 因为Lucene认为更新索引的代价,与删除后重建索引的代价,二者是差不多的
  180. */
  181. public void updateIndex(){
  182. IndexWriter writer = null;
  183. Document doc = new Document();
  184. try{
  185. writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
  186. doc.add(new Field("id", "1111", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  187. doc.add(new Field("name", names[0], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
  188. doc.add(new Field("email", emails[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
  189. doc.add(new Field("content", contents[0], Field.Store.NO, Field.Index.ANALYZED));
  190. doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[0]));
  191. doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[0].getTime()));
  192. //其实它会先删除索引文档中id为1的文档,然后再将这里的doc对象重新索引,所以即便这里的1!=1111,但它并不会报错
  193. //所以在执行完该方法后:maxDocs=7,numDocs=6,deletedDocs=1,就是因为Lucene会先删除再添加
  194. writer.updateDocument(new Term("id","1"), doc);
  195. }catch(Exception e) {
  196. e.printStackTrace();
  197. }finally{
  198. if(null != writer){
  199. try {
  200. writer.close();
  201. } catch (IOException ce) {
  202. ce.printStackTrace();
  203. }
  204. }
  205. }
  206. }
  207. /**
  208. * 删除索引
  209. * @see -----------------------------------------------------------------------------------------------------
  210. * @see 在执行完该方法后,再执行本类的searchFile()方法,得知numDocs=5,maxDocs=6,deletedDocs=1
  211. * @see 这说明此时删除的文档并没有被完全删除,而是存储在一个回收站中,它是可以恢复的
  212. * @see -----------------------------------------------------------------------------------------------------
  213. * @see 从回收站中清空索引IndexWriter
  214. * @see 对于清空索引,Lucene3.5之前叫做优化,调用的是IndexWriter.optimize()方法,但该方法已被禁用
  215. * @see 因为optimize时它会全部更新索引,这一过程所涉及到的负载是很大的,于是弃用了该方法,使用forceMerge代替
  216. * @see 使用IndexWriter.forceMergeDeletes()方法可以强制清空回收站中的内容
  217. * @see 另外IndexWriter.forceMerge(3)方法会将索引合并为3段,这3段中的被删除的数据也会被清空
  218. * @see 但其在Lucene3.5之后不建议使用,因为其会消耗大量的开销,而Lucene会根据情况自动处理的
  219. * @see -----------------------------------------------------------------------------------------------------
  220. */
  221. public void deleteIndex(){
  222. IndexWriter writer = null;
  223. try{
  224. writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
  225. //其参数可以传Query或Term....Query指的是可以查询出一系列的结果并将其全部删掉,而Term属于精确查找
  226. writer.deleteDocuments(new Term("id", "1")); //删除索引文档中id为1的文档
  227. }catch(Exception e) {
  228. e.printStackTrace();
  229. }finally{
  230. if(null != writer){
  231. try {
  232. writer.close();
  233. } catch (IOException ce) {
  234. ce.printStackTrace();
  235. }
  236. }
  237. }
  238. }
  239. /**
  240. * 恢复索引
  241. * @see 建议弃用
  242. */
  243. @Deprecated
  244. public void unDeleteIndex(){
  245. IndexReader reader = null;
  246. try {
  247. //IndexReader.open(directory)此时该IndexReader默认的readOnly=true,而在恢复索引时应该指定其为非只读的
  248. reader = IndexReader.open(directory, false);
  249. //Deprecated. Write support will be removed in Lucene 4.0. There will be no replacement for this method.
  250. reader.undeleteAll();
  251. } catch (Exception e) {
  252. e.printStackTrace();
  253. }finally{
  254. if(null != reader){
  255. try {
  256. reader.close();
  257. } catch (IOException e) {
  258. e.printStackTrace();
  259. }
  260. }
  261. }
  262. }
  263. }

下面是用JUnit4.x写的一个小测试

  1. package com.jadyer.test;
  2. import org.junit.After;
  3. import org.junit.Before;
  4. import org.junit.Test;
  5. import com.jadyer.lucene.HelloIndex;
  6. public class HelloIndexTest {
  7. private HelloIndex hello;
  8. @Before
  9. public void init(){
  10. hello = new HelloIndex();
  11. }
  12. @After
  13. public void destroy(){
  14. hello.getDocsCount();
  15. }
  16. @Test
  17. public void createIndex(){
  18. hello.createIndex();
  19. }
  20. @Test
  21. public void searchFile(){
  22. hello.searchFile();
  23. }
  24. @Test
  25. public void updateIndex(){
  26. hello.updateIndex();
  27. }
  28. @Test
  29. public void deleteIndex(){
  30. hello.deleteIndex();
  31. }
  32. @Test
  33. @SuppressWarnings("deprecation")
  34. public void unDeleteIndex(){
  35. hello.unDeleteIndex();
  36. }
  37. }

相关文章