java應用Nagao算法完成新詞發明、熱點詞的發掘。本站提示廣大學習愛好者:(java應用Nagao算法完成新詞發明、熱點詞的發掘)文章只能為提供參考,不一定能成為您想要的結果。以下是java應用Nagao算法完成新詞發明、熱點詞的發掘正文
采取Nagao算法統計各個子字符串的頻次,然後基於這些頻次統計每一個字符串的詞頻、閣下鄰個數、閣下熵、交互信息(外部凝集度)。
名詞說明:
Nagao算法:一種疾速的統計文本裡一切子字符串頻次的算法。具體算法可見http://www.doc88.com/p-664123446503.html
詞頻:該字符串在文檔中湧現的次數。湧現次數越多越主要。
閣下鄰個數:文檔中該字符串的右邊和左邊湧現的分歧的字的個數。閣下鄰越多,解釋字符串成詞幾率越高。
閣下熵:文檔中該字符串的右邊和左邊湧現的分歧的字的數目散布的熵。相似下面的目標,有必定差別。
交互信息:每次將某字符串分紅兩部門,左半部門字符串和右半部門字符串,盤算其同時湧現的幾率除於其各自自力湧現的幾率,最初取一切的劃分外面幾率最小值。這個值越年夜,解釋字符串外部凝集度越高,越能夠成詞。
算法詳細流程:
1. 將輸出文件逐行讀入,依照非漢字([^\u4E00-\u9FA5]+)和停詞“的很了麼呢是嘛個都也比還這於不與才上用就好在和對挺去後沒說”,
分紅一個個字符串,代碼以下:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
停用詞可以修正。
2. 獲得一切切分後的字符串的左子串和右子串,分離參加左、右PTable
3. 對PTable排序,並盤算LTable。LTable記載的是,排序後的PTable中,下一個子串同上一個子串具有雷同字符的數目
4. 遍歷PTable和LTable,便可獲得一切子字符串的詞頻、閣下鄰
5. 依據一切子字符串的詞頻、閣下鄰成果,輸入字符串的詞頻、閣下鄰個數、閣下熵、交互信息
1. NagaoAlgorithm.java
package com.algo.word; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class NagaoAlgorithm { private int N; private List<String> leftPTable; private int[] leftLTable; private List<String> rightPTable; private int[] rightLTable; private double wordNumber; private Map<String, TFNeighbor> wordTFNeighbor; private final static String stopwords = "的很了麼呢是嘛個都也比還這於不與才上用就好在和對挺去後沒說"; private NagaoAlgorithm(){ //default N = 5 N = 5; leftPTable = new ArrayList<String>(); rightPTable = new ArrayList<String>(); wordTFNeighbor = new HashMap<String, TFNeighbor>(); } //reverse phrase private String reverse(String phrase) { StringBuilder reversePhrase = new StringBuilder(); for (int i = phrase.length() - 1; i >= 0; i--) reversePhrase.append(phrase.charAt(i)); return reversePhrase.toString(); } //co-prefix length of s1 and s2 private int coPrefixLength(String s1, String s2){ int coPrefixLength = 0; for(int i = 0; i < Math.min(s1.length(), s2.length()); i++){ if(s1.charAt(i) == s2.charAt(i)) coPrefixLength++; else break; } return coPrefixLength; } //add substring of line to pTable private void addToPTable(String line){ //split line according to consecutive none Chinese character String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]"); for(String phrase : phrases){ for(int i = 0; i < phrase.length(); i++) rightPTable.add(phrase.substring(i)); String reversePhrase = reverse(phrase); for(int i = 0; i < reversePhrase.length(); i++) leftPTable.add(reversePhrase.substring(i)); wordNumber += phrase.length(); } } //count lTable private void countLTable(){ Collections.sort(rightPTable); rightLTable = new int[rightPTable.size()]; for(int i = 1; i < rightPTable.size(); i++) rightLTable[i] = coPrefixLength(rightPTable.get(i-1), rightPTable.get(i)); Collections.sort(leftPTable); leftLTable = new int[leftPTable.size()]; for(int i = 1; i < leftPTable.size(); i++) leftLTable[i] = coPrefixLength(leftPTable.get(i-1), leftPTable.get(i)); System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable"); } //according to pTable and lTable, count statistical result: TF, neighbor distribution private void countTFNeighbor(){ //get TF and right neighbor for(int pIndex = 0; pIndex < rightPTable.size(); pIndex++){ String phrase = rightPTable.get(pIndex); for(int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = phrase.substring(0, length); TFNeighbor tfNeighbor = new TFNeighbor(); tfNeighbor.incrementTF(); if(phrase.length() > length) tfNeighbor.addToRightNeighbor(phrase.charAt(length)); for(int lIndex = pIndex+1; lIndex < rightLTable.length; lIndex++){ if(rightLTable[lIndex] >= length){ tfNeighbor.incrementTF(); String coPhrase = rightPTable.get(lIndex); if(coPhrase.length() > length) tfNeighbor.addToRightNeighbor(coPhrase.charAt(length)); } else break; } wordTFNeighbor.put(word, tfNeighbor); } } //get left neighbor for(int pIndex = 0; pIndex < leftPTable.size(); pIndex++){ String phrase = leftPTable.get(pIndex); for(int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = reverse(phrase.substring(0, length)); TFNeighbor tfNeighbor = wordTFNeighbor.get(word); if(phrase.length() > length) tfNeighbor.addToLeftNeighbor(phrase.charAt(length)); for(int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++){ if(leftLTable[lIndex] >= length){ String coPhrase = leftPTable.get(lIndex); if(coPhrase.length() > length) tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length)); } else break; } } } System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor"); } //according to wordTFNeighbor, count MI of word private double countMI(String word){ if(word.length() <= 1) return 0; double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber; List<Double> mi = new ArrayList<Double>(word.length()); for(int pos = 1; pos < word.length(); pos++){ String leftPart = word.substring(0, pos); String rightPart = word.substring(pos); double leftProbability = wordTFNeighbor.get(leftPart).getTF()/wordNumber; double rightProbability = wordTFNeighbor.get(rightPart).getTF()/wordNumber; mi.add(coProbability/(leftProbability*rightProbability)); } return Collections.min(mi); } //save TF, (left and right) neighbor number, neighbor entropy, mutual information private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){ try { //read stop words file Set<String> stopWords = new HashSet<String>(); BufferedReader br = new BufferedReader(new FileReader(stopList)); String line; while((line = br.readLine()) != null){ if(line.length() > 1) stopWords.add(line); } br.close(); //output words TF, neighbor info, MI BufferedWriter bw = new BufferedWriter(new FileWriter(out)); for(Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()){ if( entry.getKey().length() <= 1 || stopWords.contains(entry.getKey()) ) continue; TFNeighbor tfNeighbor = entry.getValue(); int tf, leftNeighborNumber, rightNeighborNumber; double mi; tf = tfNeighbor.getTF(); leftNeighborNumber = tfNeighbor.getLeftNeighborNumber(); rightNeighborNumber = tfNeighbor.getRightNeighborNumber(); mi = countMI(entry.getKey()); if(tf > Integer.parseInt(threshold[0]) && leftNeighborNumber > Integer.parseInt(threshold[1]) && rightNeighborNumber > Integer.parseInt(threshold[2]) && mi > Integer.parseInt(threshold[3]) ){ StringBuilder sb = new StringBuilder(); sb.append(entry.getKey()); sb.append(",").append(tf); sb.append(",").append(leftNeighborNumber); sb.append(",").append(rightNeighborNumber); sb.append(",").append(tfNeighbor.getLeftNeighborEntropy()); sb.append(",").append(tfNeighbor.getRightNeighborEntropy()); sb.append(",").append(mi).append("\n"); bw.write(sb.toString()); } } bw.close(); } catch (IOException e) { throw new RuntimeException(e); } System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file"); } //apply nagao algorithm to input file public static void applyNagao(String[] inputs, String out, String stopList){ NagaoAlgorithm nagao = new NagaoAlgorithm(); //step 1: add phrases to PTable String line; for(String in : inputs){ try { BufferedReader br = new BufferedReader(new FileReader(in)); while((line = br.readLine()) != null){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable"); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5".split(",")); } public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){ NagaoAlgorithm nagao = new NagaoAlgorithm(); nagao.setN(n); String[] threshold = filter.split(","); if(threshold.length != 4){ System.out.println("ERROR: filter must have 4 numbers, seperated with ',' "); return; } //step 1: add phrases to PTable String line; for(String in : inputs){ try { BufferedReader br = new BufferedReader(new FileReader(in)); while((line = br.readLine()) != null){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable"); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, threshold); } private void setN(int n){ N = n; } public static void main(String[] args) { String[] ins = {"E://test//ganfen.txt"}; applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt"); } }
2. TFNeighbor.java
package com.algo.word; import java.util.HashMap; import java.util.Map; public class TFNeighbor { private int tf; private Map<Character, Integer> leftNeighbor; private Map<Character, Integer> rightNeighbor; TFNeighbor(){ leftNeighbor = new HashMap<Character, Integer>(); rightNeighbor = new HashMap<Character, Integer>(); } //add word to leftNeighbor public void addToLeftNeighbor(char word){ //leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0)); Integer number = leftNeighbor.get(word); leftNeighbor.put(word, number == null? 1: 1+number); } //add word to rightNeighbor public void addToRightNeighbor(char word){ //rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0)); Integer number = rightNeighbor.get(word); rightNeighbor.put(word, number == null? 1: 1+number); } //increment tf public void incrementTF(){ tf++; } public int getLeftNeighborNumber(){ return leftNeighbor.size(); } public int getRightNeighborNumber(){ return rightNeighbor.size(); } public double getLeftNeighborEntropy(){ double entropy = 0; int sum = 0; for(int number : leftNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if(sum == 0) return 0; return Math.log(sum) - entropy/sum; } public double getRightNeighborEntropy(){ double entropy = 0; int sum = 0; for(int number : rightNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if(sum == 0) return 0; return Math.log(sum) - entropy/sum; } public int getTF(){ return tf; } }
3. Main.java
package com.algo.word; public class Main { public static void main(String[] args) { //if 3 arguments, first argument is input files splitting with ',' //second argument is output file //output 7 columns split with ',' , like below: //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information //third argument is stop words list if(args.length == 3) NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]); //if 4 arguments, forth argument is the NGram parameter N //5th argument is threshold of output words, default is "20,3,3,5" //output TF > 20 && (left | right) neighbor number > 3 && MI > 5 else if(args.length == 5) NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]); } }
以上所述就是本文的全體內容了,願望年夜家可以或許愛好。