使用lucene 4.0版本的全文檢索
所需要的jar包
網速太慢,下次有空再把jar傳上來
1.FileIndex 建立索引,查詢,刪除,更新
package com.strongit.tool.retrieval;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileFilter;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.strongit.util.BaseinfoConfigurer;

/**
 * Builds, searches, updates and deletes a Lucene full-text index over files
 * that live on an SMB share.
 *
 * <p>Each indexed document carries the stored text fields {@code filename},
 * {@code content} and {@code path}; {@link #update()} additionally writes an
 * {@code id} field, which is the field {@link #delete(String)} matches on.
 *
 * <p>Every method opens its own {@link Directory} / {@link IndexWriter} and
 * closes them before returning, so no index handle is shared between calls.
 */
public class FileIndex {

    /** Location of the Lucene index on the local disk. */
    private static final String INDEX_DIR = "D:\\index";

    /** Character set handed to {@code ReadFile.readTxt} for plain-text files. */
    private static final String TXT_CHARSET = "gb2312";

    /** File extensions (without the dot) that a text extractor exists for. */
    private static final String[] SUPPORTED_EXTENSIONS = {
        "txt", "doc", "docx", "xls", "xlsx", "ppt", "pdf"
    };

    /**
     * File re-indexed by {@link #update()}.
     * NOTE(review): this is a local Windows path handed to {@code SmbFile};
     * it looks like leftover demo data - confirm before relying on update().
     */
    private static final String UPDATE_SOURCE_PATH = "D:\\file\\file\\f1\\test2.txt";

    /**
     * Value of the {@code filename} term that {@link #update()} replaces.
     * NOTE(review): "text1" does not obviously correspond to "test2.txt" -
     * confirm which document is meant to be replaced.
     */
    private static final String UPDATE_TARGET_FILENAME = "text1";

    /**
     * Manual entry point. Intentionally does nothing; call one of
     * {@code createIndex(null)}, {@code search("keyword")},
     * {@code insert(paths)}, {@code delete(id)} or {@code update()} from here
     * when experimenting.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // No default action.
    }

    /**
     * Deletes every document whose {@code id} field contains the given term
     * and prints the elapsed time.
     *
     * @param str the id value that was assigned when the document was indexed
     * @throws Exception if the index cannot be opened or written
     */
    public static void delete(String str) throws Exception {
        long startedAt = System.currentTimeMillis();
        Directory directory = openIndexDirectory();
        try {
            IndexWriter writer = new IndexWriter(directory, newWriterConfig());
            try {
                writer.deleteDocuments(new Term("id", str));
            } finally {
                // close() commits the deletion and releases the write lock.
                writer.close();
            }
        } finally {
            directory.close();
        }
        System.out.println("刪除索引耗時:" + (System.currentTimeMillis() - startedAt) + "ms\n");
    }

    /**
     * Adds the given files to the existing index without removing anything
     * that is already indexed.
     *
     * <p>A single writer is used for the whole batch. A failure while adding
     * one document is printed and the batch continues; a failure while
     * extracting a file's text, or while opening the index, is propagated.
     * Documents added before such a failure are still committed when the
     * writer is closed.
     *
     * @param listname SMB URLs of the files to index; {@code null} or empty
     *                 is a no-op
     * @throws Exception if a path is not a valid SMB URL, text extraction
     *                   fails, or the index cannot be opened
     */
    public static void insert(List<String> listname) throws Exception {
        if (listname == null || listname.isEmpty()) {
            return;
        }
        Directory directory = openIndexDirectory();
        try {
            IndexWriter writer = new IndexWriter(directory, newWriterConfig());
            try {
                for (String path : listname) {
                    SmbFile file = new SmbFile(path);
                    String content = extractContent(file);
                    addQuietly(writer, buildDocument(file, content, null));
                }
            } finally {
                writer.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Searches the {@code content} field and returns the stored
     * {@code filename} of every hit (at most 10000). Each file name is also
     * printed to standard output.
     *
     * @param str the query string, parsed with the classic query parser
     * @return file names of the matching documents, in hit order
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public static List<String> search(String str) throws Exception {
        Directory directory = FSDirectory.open(new File(INDEX_DIR));
        try {
            DirectoryReader reader = DirectoryReader.open(directory);
            try {
                IndexSearcher searcher = new IndexSearcher(reader);
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
                QueryParser parser = new QueryParser(Version.LUCENE_30, "content", analyzer);
                Query query = parser.parse(str);
                ScoreDoc[] hits = searcher.search(query, null, 10000).scoreDocs;

                List<String> fileNames = new ArrayList<String>(hits.length);
                for (int i = 0; i < hits.length; i++) {
                    Document hitDoc = searcher.doc(hits[i].doc);
                    String fileName = hitDoc.get("filename");
                    System.out.println(fileName);
                    fileNames.add(fileName);
                }
                return fileNames;
            } finally {
                reader.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Re-indexes one fixed file, replacing the document(s) matched by a fixed
     * {@code filename} term, and prints the elapsed time. The new document
     * gets an {@code id} field holding the start timestamp in milliseconds.
     *
     * <p>Index failures are printed rather than propagated; failures while
     * reading the source file are propagated.
     *
     * @throws Exception if the source file cannot be opened or read
     */
    public static void update() throws Exception {
        SmbFile file = new SmbFile(UPDATE_SOURCE_PATH);
        long startedAt = System.currentTimeMillis();

        String content = extractContent(file);
        System.out.println("name :" + file.getName());
        System.out.println("path :" + file.getPath());
        System.out.println();

        try {
            Directory directory = openIndexDirectory();
            try {
                IndexWriter writer = new IndexWriter(directory, newWriterConfig());
                try {
                    Document document = buildDocument(file, content, String.valueOf(startedAt));
                    writer.updateDocument(new Term("filename", UPDATE_TARGET_FILENAME), document);
                } finally {
                    writer.close();
                }
            } finally {
                directory.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("更新索引耗時:" + (System.currentTimeMillis() - startedAt) + "ms\n");
    }

    /**
     * Deletes the existing index and rebuilds it from every file found under
     * the configured SMB share.
     *
     * <p>The share URL is assembled from the {@code username},
     * {@code possword}, {@code fileServerIp} and {@code sharedirectory}
     * context properties. A single writer is used for the whole rebuild; a
     * failure while adding one document is printed and the rebuild continues.
     *
     * <p>NOTE(review): the credentials are part of the SMB URL; if
     * {@code SmbFile.getPath()} returns the full URL they end up on standard
     * output and in the stored {@code path} field - confirm and consider
     * masking.
     *
     * @param path ignored; the root URL is always taken from the configuration
     *             (kept for backward compatibility)
     * @return always {@code true}
     * @throws Exception if the share URL is invalid, text extraction fails, or
     *                   the index cannot be opened
     */
    public static boolean createIndex(String path) throws Exception {
        long startedAt = System.currentTimeMillis();

        String username = (String) BaseinfoConfigurer.getContextProperty("username");
        String possword = (String) BaseinfoConfigurer.getContextProperty("possword");
        String fileServerIp = (String) BaseinfoConfigurer.getContextProperty("fileServerIp");
        String sharedirectory = (String) BaseinfoConfigurer.getContextProperty("sharedirectory");
        String rootUrl = "smb" + "://" + username + ":" + possword + "@" + fileServerIp
                + "/" + sharedirectory + "/";

        // Start from an empty index.
        ReadFile.deleteDir(new File(INDEX_DIR + "\\"));

        SmbFile[] files = searchFile(new SmbFile(rootUrl));

        Directory directory = openIndexDirectory();
        try {
            IndexWriter writer = new IndexWriter(directory, newWriterConfig());
            try {
                for (SmbFile file : files) {
                    String content = extractContent(file);
                    System.out.println("name :" + file.getName());
                    System.out.println("path :" + file.getPath());
                    System.out.println();
                    addQuietly(writer, buildDocument(file, content, null));
                }
            } finally {
                writer.close();
            }
        } finally {
            directory.close();
        }

        System.out.println("創建索引-----耗時:" + (System.currentTimeMillis() - startedAt) + "ms\n");
        return true;
    }

    /**
     * Recursively collects the files below {@code folder}.
     *
     * <p>Files that are direct children of {@code folder} are returned as-is.
     * Files coming back from a sub-folder are kept only when their extension
     * is one of the supported document types (compared case-insensitively);
     * names without an extension are skipped.
     *
     * @param folder the SMB directory to scan
     * @return the files found; empty (never {@code null}) when the folder is
     *         {@code null} or cannot be listed
     */
    public static SmbFile[] searchFile(SmbFile folder) {
        if (folder == null) {
            return new SmbFile[0];
        }

        SmbFile[] children = null;
        try {
            children = folder.listFiles(new SmbFileFilter() {
                @Override
                public boolean accept(SmbFile pathname) {
                    try {
                        return pathname.isDirectory() || pathname.isFile();
                    } catch (SmbException e) {
                        // An entry we cannot stat is skipped, but not silently.
                        e.printStackTrace();
                        return false;
                    }
                }
            });
        } catch (SmbException e) {
            e.printStackTrace();
        }
        if (children == null) {
            // Listing failed; there is nothing to iterate over.
            return new SmbFile[0];
        }

        List<SmbFile> result = new ArrayList<SmbFile>();
        for (SmbFile child : children) {
            try {
                if (child.isFile()) {
                    result.add(child);
                } else {
                    for (SmbFile nested : searchFile(child)) {
                        if (hasSupportedExtension(nested.getName())) {
                            result.add(nested);
                        }
                    }
                }
            } catch (SmbException e) {
                e.printStackTrace();
            }
        }
        return result.toArray(new SmbFile[result.size()]);
    }

    /** Ensures the index directory exists and opens it. */
    private static Directory openIndexDirectory() throws IOException {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists()) {
            indexDir.mkdirs();
        }
        return FSDirectory.open(indexDir);
    }

    /** Creates a fresh writer configuration using the standard analyzer. */
    private static IndexWriterConfig newWriterConfig() {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        return new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
    }

    /** Returns the text of {@code file}, or an empty string for unsupported types. */
    private static String extractContent(SmbFile file) throws Exception {
        String name = file.getName();
        String type = name.substring(name.lastIndexOf(".") + 1);
        String location = file.getPath();

        if ("txt".equalsIgnoreCase(type)) {
            return ReadFile.readTxt(location, TXT_CHARSET);
        } else if ("doc".equalsIgnoreCase(type)) {
            return ReadFile.readWorddoc(location);
        } else if ("xls".equalsIgnoreCase(type)) {
            return ReadFile.xls2String(location);
        } else if ("xlsx".equalsIgnoreCase(type)) {
            return ReadFile.readExcel2007(location);
        } else if ("ppt".equalsIgnoreCase(type)) {
            return ReadFile.readPowerPoint(location);
        } else if ("pdf".equalsIgnoreCase(type)) {
            return ReadFile.readPdf(location);
        } else if ("docx".equalsIgnoreCase(type)) {
            return ReadFile.readWorddocx(location);
        }
        return "";
    }

    /** Builds the index document for a file; {@code id} is omitted when {@code null}. */
    private static Document buildDocument(SmbFile file, String content, String id) {
        Document document = new Document();
        document.add(new TextField("filename", file.getName(), Store.YES));
        document.add(new TextField("content", content, Store.YES));
        document.add(new TextField("path", file.getPath(), Store.YES));
        if (id != null) {
            document.add(new TextField("id", id, Store.YES));
        }
        return document;
    }

    /** Adds one document, printing (not propagating) any failure so a batch can continue. */
    private static void addQuietly(IndexWriter writer, Document document) {
        try {
            writer.addDocument(document);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Tells whether {@code fileName} ends in one of the supported extensions. */
    private static boolean hasSupportedExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        if (dot < 0) {
            return false;
        }
        String extension = fileName.substring(dot + 1);
        for (String supported : SUPPORTED_EXTENSIONS) {
            if (supported.equalsIgnoreCase(extension)) {
                return true;
            }
        }
        return false;
    }
}
2.讀取文檔的方法類 txt,xlsx,xls,ppt,pdf,doc, docx(不能讀取圖片)
package com.strongit.tool.retrieval;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;

import jcifs.smb.SmbFileInputStream;

import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;

import org.apache.lucene.index.IndexWriter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

/**
 * Text extractors for documents stored on an SMB share (txt, doc, docx, xls,
 * xlsx, ppt, pdf), plus two small index housekeeping helpers.
 *
 * <p>Every reader takes an SMB URL, opens it with
 * {@link SmbFileInputStream} and closes the stream before returning, whether
 * or not extraction succeeded. Only text is extracted; embedded images are
 * not read.
 */
public class ReadFile {

    /** Lazily created shared instance handed out by {@link #getManager()}. */
    private static ReadFile indexManager;

    /**
     * Reads the text of a legacy Word ({@code .doc}) file.
     *
     * @param filepath SMB URL of the file
     * @return the paragraph text, trimmed; whatever was read before a failure
     *         (possibly empty) if the file cannot be processed - the failure
     *         is printed
     */
    public static String readWorddoc(String filepath) {
        StringBuilder content = new StringBuilder();
        InputStream in = null;
        try {
            in = new SmbFileInputStream(filepath);
            HWPFDocument doc = new HWPFDocument(in);
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                content.append(paragraph.text());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return content.toString().trim();
    }

    /**
     * Reads the text of a Word 2007+ ({@code .docx}) file, one line per
     * paragraph.
     *
     * @param filepath SMB URL of the file
     * @return the paragraph text, trimmed; whatever was read before a failure
     *         (possibly empty) if the file cannot be processed - the failure
     *         is printed
     */
    public static String readWorddocx(String filepath) {
        StringBuilder content = new StringBuilder();
        InputStream in = null;
        try {
            in = new SmbFileInputStream(filepath);
            XWPFDocument document = new XWPFDocument(in);
            List<XWPFParagraph> paragraphs = document.getParagraphs();
            for (XWPFParagraph paragraph : paragraphs) {
                // getText() yields the plain paragraph text; stringifying the
                // run list would embed "[", ", " and "]" from List.toString().
                content.append(paragraph.getText()).append('\n');
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return content.toString().trim();
    }

    /**
     * Reads the text of a legacy Excel ({@code .xls}) file. Each cell's
     * contents are followed by a single space.
     *
     * @param filepath SMB URL of the file
     * @return the concatenated cell contents; empty if the file cannot be
     *         processed - the failure is printed
     */
    public static String xls2String(String filepath) {
        String result = "";
        InputStream in = null;
        Workbook workbook = null;
        try {
            in = new SmbFileInputStream(filepath);
            workbook = Workbook.getWorkbook(in);
            StringBuilder sb = new StringBuilder();
            Sheet[] sheets = workbook.getSheets();
            for (int i = 0; i < sheets.length; i++) {
                Sheet sheet = sheets[i];
                for (int j = 0; j < sheet.getRows(); j++) {
                    Cell[] cells = sheet.getRow(j);
                    for (int k = 0; k < cells.length; k++) {
                        sb.append(cells[k].getContents()).append(' ');
                    }
                }
            }
            result = sb.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (workbook != null) {
                workbook.close();
            }
            closeQuietly(in);
        }
        return result;
    }

    /**
     * Reads the text of a PDF file.
     *
     * @param path SMB URL of the file
     * @return the extracted text, trimmed
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String readPdf(String path) throws Exception {
        SmbFileInputStream in = new SmbFileInputStream(path);
        try {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            PDDocument document = parser.getPDDocument();
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(document).trim();
            } finally {
                // The parsed document holds its own resources and must be
                // released independently of the input stream.
                document.close();
            }
        } finally {
            in.close();
        }
    }

    /**
     * Reads the text of an Excel 2007+ ({@code .xlsx}) file. Each cell's
     * value is followed by a single space so that neighbouring cells do not
     * fuse into one token.
     *
     * @param filepath SMB URL of the file
     * @return the concatenated cell values
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String readExcel2007(String filepath) throws IOException {
        StringBuilder content = new StringBuilder();
        InputStream in = new SmbFileInputStream(filepath);
        try {
            XSSFWorkbook workbook = new XSSFWorkbook(in);
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                if (sheet == null) {
                    continue;
                }
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    XSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is already "last index + 1", hence '<'.
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell == null) {
                            continue;
                        }
                        content.append(cellText(cell)).append(' ');
                    }
                }
            }
        } finally {
            closeQuietly(in);
        }
        return content.toString();
    }

    /**
     * Reads a plain-text file. Line breaks are normalised to {@code '\n'}
     * between lines so that the last word of one line and the first word of
     * the next stay separate.
     *
     * @param filepath SMB URL of the file
     * @param charSet  name of the character set the file is encoded in
     * @return the file's text
     * @throws IOException if the file cannot be opened or read, or the
     *                     character set is not supported
     */
    public static String readTxt(String filepath, String charSet) throws IOException {
        InputStream in = new SmbFileInputStream(filepath);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, charSet));
            StringBuilder text = new StringBuilder();
            String line;
            boolean first = true;
            while ((line = reader.readLine()) != null) {
                if (!first) {
                    text.append('\n');
                }
                text.append(line);
                first = false;
            }
            return text.toString();
        } finally {
            // Closing the underlying stream releases the SMB handle even if
            // the reader could not be constructed.
            closeQuietly(in);
        }
    }

    /**
     * Reads the text of a legacy PowerPoint ({@code .ppt}) file, one line per
     * text run.
     *
     * @param filepath SMB URL of the file
     * @return the slide text; whatever was read before a failure (possibly
     *         empty) if the file cannot be processed - the failure is printed
     */
    public static String readPowerPoint(String filepath) {
        StringBuilder content = new StringBuilder();
        InputStream in = null;
        try {
            in = new SmbFileInputStream(filepath);
            SlideShow slideShow = new SlideShow(new HSLFSlideShow(in));
            Slide[] slides = slideShow.getSlides();
            for (int i = 0; i < slides.length; i++) {
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    content.append(runs[j].getText()).append('\n');
                }
            }
        } catch (Exception ex) {
            System.out.println(ex.toString());
        } finally {
            closeQuietly(in);
        }
        return content.toString();
    }

    /**
     * Closes the given index writer if it is not {@code null}.
     *
     * @param indexWriter the writer to close, may be {@code null}
     * @throws Exception if closing fails
     */
    public static void closeWriter(IndexWriter indexWriter) throws Exception {
        if (indexWriter != null) {
            indexWriter.close();
        }
    }

    /**
     * Returns the shared {@code ReadFile} instance, creating it on first use.
     * Not synchronized.
     *
     * @return the shared instance
     */
    public ReadFile getManager() {
        if (indexManager == null) {
            indexManager = new ReadFile();
        }
        return indexManager;
    }

    /**
     * Deletes a file, or a directory together with everything below it.
     *
     * @param file the file or directory to delete
     * @return {@code true} if the path does not exist once the call returns
     *         (including when it never existed); {@code false} if something
     *         could not be removed
     */
    public static boolean deleteDir(File file) {
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            // listFiles() returns null when the directory cannot be read.
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    deleteDir(children[i]);
                }
            }
        }
        file.delete();
        return !file.exists();
    }

    /** Renders one spreadsheet cell as text without throwing on formula or error cells. */
    private static String cellText(XSSFCell cell) {
        int type = cell.getCellType();
        if (type == XSSFCell.CELL_TYPE_BOOLEAN) {
            return String.valueOf(cell.getBooleanCellValue());
        }
        if (type == XSSFCell.CELL_TYPE_NUMERIC) {
            return String.valueOf(cell.getNumericCellValue());
        }
        try {
            return cell.getStringCellValue();
        } catch (IllegalStateException e) {
            // Raised for cells with no string value (e.g. numeric formulas,
            // error cells); fall back to the cell's generic rendering.
            return cell.toString();
        }
    }

    /** Closes a resource, ignoring {@code null} and close failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close here.
        }
    }
}
整個都是基於SMB 文件服務器的lucene4.0全文檢索,如果是本地文件的話 只需要把所有的地址 類似 SmbFileInputStream 去掉 Smb 就可以了