程式師世界 >> 編程語言 >> JAVA編程 >> JAVA綜合教程 >> lucene4.0 基於smb文件服務器的全文檢索，lucene4.0smb

lucene4.0 基於smb文件服務器的全文檢索，lucene4.0smb

編輯：JAVA綜合教程

lucene4.0 基於smb文件服務器的全文檢索，lucene4.0smb

使用lucene 4.0版本的全文檢索

所需要的jar包

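網速太慢，下次有空再把jar傳上來。從下面代碼的 import 可以看出，需要的庫有：Lucene 4.0（core、analyzers-common、queryparser）、jcifs、Apache POI（含 scratchpad 與 ooxml）、PDFBox 以及 jxl。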

1.FileIndex 建立索引，查詢，刪除，更新

package com.strongit.tool.retrieval;

import java.io.File;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileFilter;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.strongit.util.BaseinfoConfigurer;

/**
 * Builds, queries and maintains a Lucene full-text index over files that live
 * on an SMB file server.
 *
 * <p>Every indexed document carries three stored text fields: {@code filename},
 * {@code content} (text extracted by {@link ReadFile}) and {@code path}.
 * Documents written by {@link #update()} additionally carry an {@code id}
 * field, which is the key {@link #delete(String)} matches on.
 *
 * <p>All methods are static and open/close the index on every call; nothing is
 * cached between calls. Only one writer can hold the index at a time, so a
 * writer is always closed in a {@code finally} block.
 */
public class FileIndex {

	/** Local directory that holds the Lucene index. */
	private static final String INDEX_DIR = "D:\\index";

	/** Upper bound on the number of hits {@link #search(String)} collects. */
	private static final int MAX_HITS = 10000;

	/** File types {@link ReadFile} can extract text from. */
	private static final String[] SUPPORTED_TYPES = { "txt", "doc", "docx",
			"xls", "xlsx", "ppt", "pdf" };

	/**
	 * Manual entry point: uncomment the operation to run.
	 */
	public static void main(String[] args) {
		try {
			// createIndex("");         // rebuild the whole index from the share
			// search("test");          // query the "content" field
			// insert(paths);           // add files, keeping existing entries
			// delete("1470817624520"); // remove documents by their "id" term
			// update();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Deletes every document whose {@code id} field contains the given term and
	 * prints the elapsed time.
	 *
	 * <p>In this class only {@link #update()} writes an {@code id} field, so
	 * documents added by {@link #insert(List)} or {@link #createIndex(String)}
	 * cannot be removed through this method.
	 *
	 * @param str the id assigned to the document when it was indexed
	 * @throws Exception if the index cannot be opened or written
	 */
	public static void delete(String str) throws Exception {
		long started = System.currentTimeMillis();
		Directory dir = openIndexDirectory();
		IndexWriter writer = null;
		try {
			writer = newWriter(dir);
			writer.deleteDocuments(new Term("id", str));
		} finally {
			closeIndex(writer, dir);
		}
		System.out.println("刪除索引耗時：" + (System.currentTimeMillis() - started)
				+ "ms\n");
	}

	/**
	 * Adds the given files to the index without touching existing entries.
	 *
	 * <p>A failure to extract a file's text or to build its {@link SmbFile}
	 * aborts the call (documents added before it are still committed); a
	 * failure to write a single document is logged and skipped.
	 *
	 * @param listname SMB URLs of the files to index, each a {@code String}
	 *            such as {@code smb://user:password@host/share/dir/file.txt};
	 *            {@code null} or empty does nothing
	 * @throws Exception if the index cannot be opened, a URL is malformed or
	 *             text extraction fails
	 */
	public static void insert(List listname) throws Exception {
		if (listname == null || listname.isEmpty()) {
			return;
		}
		Directory dir = openIndexDirectory();
		IndexWriter writer = null;
		try {
			writer = newWriter(dir);
			for (Object entry : listname) {
				// The parameter is a raw List for caller compatibility; the
				// elements are expected to be URL strings.
				SmbFile file = new SmbFile((String) entry);
				indexFile(writer, file, null, null, false);
			}
		} finally {
			closeIndex(writer, dir);
		}
	}

	/**
	 * Runs a query against the {@code content} field and returns the file
	 * names of the matching documents (at most {@value #MAX_HITS}), also
	 * printing each name to standard output.
	 *
	 * @param str the query string, in Lucene classic query syntax
	 * @return a list of {@code String} file names, in hit order; the raw
	 *         {@code List} return type is kept for caller compatibility
	 * @throws Exception if the index cannot be read or the query cannot be
	 *             parsed
	 */
	public static List search(String str) throws Exception {
		List<String> names = new ArrayList<String>();
		Directory dir = FSDirectory.open(new File(INDEX_DIR));
		try {
			DirectoryReader reader = DirectoryReader.open(dir);
			try {
				IndexSearcher searcher = new IndexSearcher(reader);
				Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
				// LUCENE_30 was chosen by the original author so that the query
				// is matched as a whole keyword instead of being split into
				// separate terms. NOTE(review): presumably this relies on the
				// pre-3.1 parser behaviour; verify before changing the version.
				QueryParser parser = new QueryParser(Version.LUCENE_30,
						"content", analyzer);
				Query query = parser.parse(str);
				ScoreDoc[] hits = searcher.search(query, null, MAX_HITS).scoreDocs;
				for (ScoreDoc hit : hits) {
					String filename = searcher.doc(hit.doc).get("filename");
					System.out.println(filename);
					names.add(filename);
				}
			} finally {
				reader.close();
			}
		} finally {
			dir.close();
		}
		return names;
	}

	/**
	 * Re-indexes one hard-coded file, replacing any document matching the
	 * hard-coded {@code filename} term, and prints the elapsed time. The new
	 * document gets an {@code id} field holding the start timestamp in
	 * milliseconds.
	 *
	 * <p>NOTE(review): both the path and the replaced term are demo values.
	 * The path is a local Windows path handed to {@link SmbFile}, which
	 * presumably expects an {@code smb://} URL, and the term {@code "text1"}
	 * does not correspond to the file name {@code test2.txt} — confirm the
	 * intended values before using this outside a demo.
	 *
	 * @throws Exception if the index cannot be opened, the path is rejected or
	 *             text extraction fails
	 */
	public static void update() throws Exception {
		String path = "D:\\file\\file\\f1\\test2.txt";
		SmbFile file = new SmbFile(path);

		long started = System.currentTimeMillis();
		Directory dir = openIndexDirectory();
		IndexWriter writer = null;
		try {
			writer = newWriter(dir);
			indexFile(writer, file, String.valueOf(started), new Term(
					"filename", "text1"), true);
		} finally {
			closeIndex(writer, dir);
		}
		System.out.println("更新索引耗時：" + (System.currentTimeMillis() - started)
				+ "ms\n");
	}

	/**
	 * Rebuilds the index from scratch: deletes the existing index directory and
	 * indexes every file found under the configured SMB share.
	 *
	 * <p>The share location and credentials are read from
	 * {@link BaseinfoConfigurer} (keys {@code username}, {@code possword},
	 * {@code fileServerIp}, {@code sharedirectory}). The share is checked and
	 * listed before the old index is removed, so an unreachable server does
	 * not leave the application without an index.
	 *
	 * @param path ignored; the URL is always rebuilt from configuration. The
	 *            parameter is kept so existing callers still compile.
	 * @return always {@code true}
	 * @throws Exception if the share does not exist or cannot be accessed, the
	 *             index cannot be opened, or text extraction fails
	 */
	public static boolean createIndex(String path) throws Exception {
		long started = System.currentTimeMillis();

		String username = (String) BaseinfoConfigurer
				.getContextProperty("username");
		String possword = (String) BaseinfoConfigurer
				.getContextProperty("possword");
		String fileServerIp = (String) BaseinfoConfigurer
				.getContextProperty("fileServerIp");
		String sharedirectory = (String) BaseinfoConfigurer
				.getContextProperty("sharedirectory");

		path = "smb" + "://" + username + ":" + possword + "@" + fileServerIp
				+ "/" + sharedirectory + "/";

		SmbFile folder = new SmbFile(path);
		if (!folder.exists()) {
			// The message deliberately omits the URL: it contains the password.
			throw new IllegalStateException("SMB share not found: "
					+ fileServerIp + "/" + sharedirectory);
		}
		SmbFile[] files = searchFile(folder);

		// Drop the previous index only once the share has been listed.
		ReadFile.deleteDir(new File(INDEX_DIR));

		Directory dir = openIndexDirectory();
		IndexWriter writer = null;
		try {
			writer = newWriter(dir);
			for (SmbFile file : files) {
				indexFile(writer, file, null, null, true);
			}
		} finally {
			closeIndex(writer, dir);
		}

		System.out.println("創建索引-----耗時："
				+ (System.currentTimeMillis() - started) + "ms\n");
		return true;
	}

	/**
	 * Recursively collects the files below a folder.
	 *
	 * <p>Files directly inside {@code folder} are returned whatever their
	 * type; files coming from sub-folders are returned only when their
	 * extension is one of the supported document types (compared
	 * case-insensitively). Listing or access errors are logged and the
	 * affected folder or entry is skipped.
	 *
	 * @param folder the folder to scan
	 * @return the files found; empty (never {@code null}) when the folder
	 *         cannot be listed
	 */
	public static SmbFile[] searchFile(SmbFile folder) {
		SmbFile[] children = null;
		try {
			children = folder.listFiles(new SmbFileFilter() {
				@Override
				public boolean accept(SmbFile pathname) {
					try {
						return pathname.isDirectory() || pathname.isFile();
					} catch (SmbException ignored) {
						// An entry that cannot be inspected is left out.
						return false;
					}
				}
			});
		} catch (SmbException e1) {
			e1.printStackTrace();
		}
		if (children == null) {
			return new SmbFile[0];
		}

		List<SmbFile> result = new ArrayList<SmbFile>();
		for (SmbFile child : children) {
			try {
				if (child.isFile()) {
					result.add(child);
				} else {
					for (SmbFile nested : searchFile(child)) {
						if (isSupportedType(extensionOf(nested.getName()))) {
							result.add(nested);
						}
					}
				}
			} catch (SmbException e) {
				e.printStackTrace();
			}
		}
		return result.toArray(new SmbFile[result.size()]);
	}

	/** Opens the index directory, creating it on disk first if necessary. */
	private static Directory openIndexDirectory() throws java.io.IOException {
		File indexFile = new File(INDEX_DIR);
		if (!indexFile.exists()) {
			indexFile.mkdirs();
		}
		return FSDirectory.open(indexFile);
	}

	/** Creates a writer on the directory using the standard analyzer. */
	private static IndexWriter newWriter(Directory dir)
			throws java.io.IOException {
		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
		return new IndexWriter(dir, new IndexWriterConfig(
				Version.LUCENE_CURRENT, analyzer));
	}

	/** Closes the writer (if one was opened) and then the directory. */
	private static void closeIndex(IndexWriter writer, Directory dir)
			throws java.io.IOException {
		try {
			if (writer != null) {
				// Closing commits pending changes and releases the write lock.
				writer.close();
			}
		} finally {
			dir.close();
		}
	}

	/**
	 * Extracts one file's text and writes it to the index.
	 *
	 * @param writer open index writer
	 * @param file file to index
	 * @param id value for the {@code id} field, or {@code null} for none
	 * @param replaced term whose matching documents are replaced, or
	 *            {@code null} to simply add the document
	 * @param verbose whether to print the file's name and path
	 * @throws Exception if text extraction fails; write failures are logged
	 *             and swallowed so the remaining files are still processed
	 */
	private static void indexFile(IndexWriter writer, SmbFile file, String id,
			Term replaced, boolean verbose) throws Exception {
		String content = extractContent(file);

		if (verbose) {
			// NOTE(review): getPath() is the URL that is fed back into
			// SmbFileInputStream, so it may carry the credentials embedded by
			// createIndex; they end up on stdout and in the stored "path" field.
			System.out.println("name :" + file.getName());
			System.out.println("path :" + file.getPath());
			System.out.println();
		}

		try {
			Document document = new Document();
			document.add(new TextField("filename", file.getName(), Store.YES));
			document.add(new TextField("content", content, Store.YES));
			document.add(new TextField("path", file.getPath(), Store.YES));
			if (id != null) {
				document.add(new TextField("id", id, Store.YES));
			}
			if (replaced == null) {
				writer.addDocument(document);
			} else {
				writer.updateDocument(replaced, document);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the text of a file, chosen by its extension; unsupported types
	 * yield an empty string.
	 */
	private static String extractContent(SmbFile file) throws Exception {
		String type = extensionOf(file.getName());
		String path = file.getPath();
		if ("txt".equalsIgnoreCase(type)) {
			return ReadFile.readTxt(path, "gb2312");
		} else if ("doc".equalsIgnoreCase(type)) {
			return ReadFile.readWorddoc(path);
		} else if ("xls".equalsIgnoreCase(type)) {
			return ReadFile.xls2String(path);
		} else if ("xlsx".equalsIgnoreCase(type)) {
			return ReadFile.readExcel2007(path);
		} else if ("ppt".equalsIgnoreCase(type)) {
			return ReadFile.readPowerPoint(path);
		} else if ("pdf".equalsIgnoreCase(type)) {
			return ReadFile.readPdf(path);
		} else if ("docx".equalsIgnoreCase(type)) {
			return ReadFile.readWorddocx(path);
		}
		return "";
	}

	/** Returns the text after the last dot, or "" when the name has no dot. */
	private static String extensionOf(String name) {
		int dot = name.lastIndexOf('.');
		return dot < 0 ? "" : name.substring(dot + 1);
	}

	/** Tells whether the extension is one {@link ReadFile} can read. */
	private static boolean isSupportedType(String type) {
		for (String supported : SUPPORTED_TYPES) {
			if (supported.equalsIgnoreCase(type)) {
				return true;
			}
		}
		return false;
	}

}

2.讀取文檔的方法類 txt,xlsx,xls,ppt,pdf,doc, docx(不能讀取圖片)

package com.strongit.tool.retrieval;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;  

import jcifs.smb.SmbFileInputStream;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;

import org.apache.lucene.index.IndexWriter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

/**
 * 讀取文檔方法
 * ClassName: ReadFile 
 * @Description: TODO
 * @date 2016年8月10日
 * @author wsx
 */
public class ReadFile {
	
	 private static ReadFile indexManager;
	
	 /**
     * 讀取doc文件內容
     * @param filepath 想要讀取的文件地址
     * @return 返回文件內容
     */
	 public static String readWorddoc(String filepath) {
			StringBuffer content = new StringBuffer("");// 文檔內容
			try {
				HWPFDocument doc = new HWPFDocument(new SmbFileInputStream(filepath));
				Range range = doc.getRange();
				int paragraphCount = range.numParagraphs();// 段落
				for (int i = 0; i < paragraphCount; i++) {// 遍歷段落讀取數據
					Paragraph pp = range.getParagraph(i);
					content.append(pp.text());
				}

			} catch (Exception e) {
				e.printStackTrace();
			}
			return content.toString().trim();
		}
	 /**
	  * docx 格式建立索引，圖片沒有讀到，只讀取的數據  
	  * @Description: TODO
	  * @param @param filepath
	  * @param @return   
	  * @return String  
	  * @date 2016年8月12日
	  */
	 public static String readWorddocx(String filepath) {
			StringBuffer content = new StringBuffer("");// 文檔內容
			try {  
				//     D://file//docx.docx     D://file//doc.doc
//				filepath = "D://file//docx.docx";
				SmbFileInputStream in = new SmbFileInputStream(filepath);//載入文檔 
				//word docx 圖片不會被讀取，只讀取數據   
                XWPFDocument xwpf = new XWPFDocument(in);//得到word文檔的信息  
              List<XWPFParagraph> listParagraphs = xwpf.getParagraphs();//得到段落信息
              
              for(int i =0;i<listParagraphs.size();i++){
            	  String cont = listParagraphs.get(i).getRuns().toString();
            	  content.append(cont);
              }
               
	        } catch (Exception e) {  
	            e.printStackTrace();  
	        }  
			return content.toString().trim();
		}
    
    /**
     * 讀取xls文件內容
     * @param filepath 想要讀取的文件對象
     * @return 返回文件內容
     */
    public static String xls2String(String filepath){
        String result = "";
        try{
            SmbFileInputStream fis = new SmbFileInputStream(filepath);   
            StringBuilder sb = new StringBuilder();   
            jxl.Workbook rwb = Workbook.getWorkbook(fis);   
            Sheet[] sheet = rwb.getSheets();   
            for (int i = 0; i < sheet.length; i++) {   
                Sheet rs = rwb.getSheet(i);   
                for (int j = 0; j < rs.getRows(); j++) {   
                   Cell[] cells = rs.getRow(j);   
                   for(int k=0;k<cells.length;k++)   
                   sb.append(cells[k].getContents() + " ");   
                }   
            }   
            fis.close();   
            result += sb.toString();
        }catch(Exception e){
            e.printStackTrace();
        }
        return result;
    }
    /**
     * PDF格式  文件創建索引
     * @Description: TODO
     * @param @param path
     * @param @return
     * @param @throws Exception   
     * @return String  
     * @date 2016年8月11日
     */
    public static String readPdf(String path) throws Exception {
        StringBuffer content = new StringBuffer("");// 文檔內容
        SmbFileInputStream fis = new SmbFileInputStream(path);
        PDFParser p = new PDFParser(fis);
        p.parse();
        PDFTextStripper ts = new PDFTextStripper();
        content.append(ts.getText(p.getPDDocument()));
        fis.close();
        return content.toString().trim();
    }
    
    /**
     * 讀取xlsx格式的excel文檔
     * @param @param filepath
     * @param @throws IOException   
     * @author wusongxiao
     * @date 2016年8月10日
     */
    public static String readExcel2007(String filepath) throws IOException {

//		System.out.println(filepath);

		StringBuffer content = new StringBuffer();

		// 構造 XSSFWorkbook 對象，strPath 傳入文件路徑 **** SmbFileInputStream SMB讀取文件 ***
		XSSFWorkbook xwb = new XSSFWorkbook(new SmbFileInputStream(filepath));
		// 循環工作表Sheet
		for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
			XSSFSheet xSheet = xwb.getSheetAt(numSheet);
			if (xSheet == null) {
				continue;
			}
			// 循環行Row
			for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
				XSSFRow xRow = xSheet.getRow(rowNum);
				if (xRow == null) {
					continue;
				}
				// 循環列Cell
				for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
					XSSFCell xCell = xRow.getCell(cellNum);
					if (xCell == null) {
						continue;
					}
					String s = null;
					if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
						content.append(xCell.getBooleanCellValue());
					} else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
						content.append(xCell.getNumericCellValue());
					} else {
						content.append(xCell.getStringCellValue()  + ""); //+ "\n"
					}
				}
			}
		}

		return content.toString();
	}
    /**
     * 讀取txt文檔
     * @param @param filepath  地址
     * @param @param charSet  編碼格式
     * @param @throws IOException   
     * @author wusongxiao
     * @date 2016年8月10日
     */
    public static String readTxt(String filepath, String charSet)
			throws IOException {
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				new SmbFileInputStream(filepath), charSet));  //reader.readLine() 讀取txt文本  String的
		String line = new String();
		String temp = new String();
		while ((line = reader.readLine()) != null) {
			temp += line;
		}
		reader.close();
		return temp;
	}
    /**
     * 讀取ppt文件
     * @Description: TODO
     * @param @param filepath
     * @param @return   
     * @return String  
     * @date 2016年8月10日
     */
    public static String readPowerPoint(String filepath) {
		StringBuffer content = new StringBuffer("");
		try {
			SlideShow ss = new SlideShow(new HSLFSlideShow(new SmbFileInputStream(filepath)));// is
			// 為文件的InputStream，建立SlideShow
			Slide[] slides = ss.getSlides();// 獲得每一張幻燈片
			for (int i = 0; i < slides.length; i++) {
				TextRun[] t = slides[i].getTextRuns();// 為了取得幻燈片的文字內容，建立TextRun
				for (int j = 0; j < t.length; j++) {
					content.append(t[j].getText());// 這裡會將文字內容加到content中去
				}
			}
		} catch (Exception ex) {
			System.out.println(ex.toString());
		}
		return content.toString();
	}
    
    public static void closeWriter(IndexWriter indexWriter) throws Exception {
        if (indexWriter != null) {
            indexWriter.close();
        }
    }
    
    /**
     * 創建索引管理器
     * @return 返回索引管理器對象
     */
    public ReadFile getManager(){
        if(indexManager == null){
            this.indexManager = new ReadFile();
        }
        return indexManager;
    }
    /**
     * 刪除目錄下的所有索引
     * @Description: TODO
     * @param @param file
     * @param @return   
     * @return boolean  
     * @throws
     * @author wusongxiao
     * @date 2016年8月10日
     */
    public static boolean deleteDir(File file){
        if(file.isDirectory()){
            File[] files = file.listFiles();
            for(int i=0; i<files.length; i++){
                deleteDir(files[i]);
            }
        }
        file.delete();
        return true;
    }

}

以上全部是基於 SMB 文件服務器的 Lucene 4.0 全文檢索。如果要檢索的是本地文件，只需把所有以 Smb 開頭的類去掉 Smb 前綴（例如 SmbFileInputStream 改為 FileInputStream、SmbFile 改為 File），並把 smb:// 地址換成本地路徑就可以了。