PHP 簡單中文分詞系統。結構:首字散列表、Trie 索引樹結點。優點:分詞中,不需預知待查詢詞的長度,沿樹鏈逐字匹配。缺點:構造和維護比較複雜,單詞樹枝多,浪費了一定的空間。
PHP 教程:簡單中文分詞系統
結構:首字散列表、trie索引樹結點
優點:分詞中,不需預知待查詢詞的長度,沿樹鏈逐字匹配。
缺點:構造和維護比較複雜,單詞樹枝多,浪費了一定的空間。
/**
 * Trie (prefix tree) dictionary with a simple word finder built on top of it.
 *
 * Words are stored byte by byte, so any single-byte-compatible encoding works
 * as long as the dictionary words and the searched text use the same one.
 *
 * @version 0.1
 * @todo Build a general-purpose dictionary algorithm; a simple word segmenter is included.
 * @author [email protected]
 */
class trie
{
    /**
     * Root node of the tree.
     *
     * Every node has the shape array('children' => array, 'isword' => bool),
     * where 'children' maps a single byte to the next node.
     *
     * @var array
     */
    private $trie;

    /**
     * Creates an empty dictionary.
     */
    public function __construct()
    {
        // Must initialise the property, not a local variable.
        $this->trie = array('children' => array(), 'isword' => false);
    }

    /**
     * Adds a word to the dictionary.
     *
     * An empty word is ignored. Adding a word that is a prefix of an existing
     * word keeps the longer word intact.
     *
     * The method stays declared return-by-reference for backward
     * compatibility; it returns nothing (null).
     *
     * @param string $word word to store
     */
    public function &setword($word = '')
    {
        $word = (string) $word;
        $length = strlen($word);
        $node = &$this->trie;

        for ($i = 0; $i < $length; $i++) {
            $character = $word[$i];
            if (!isset($node['children'][$character])) {
                $node['children'][$character] = array('children' => array(), 'isword' => false);
            }
            $node = &$node['children'][$character];
        }

        // Flag the final node in place instead of replacing it, so that any
        // longer words already stored below it are not discarded.
        if ($length > 0) {
            $node['isword'] = true;
        }
    }

    /**
     * Tells whether a word is in the dictionary.
     *
     * The method stays declared return-by-reference for backward
     * compatibility, so the result is always returned through a local
     * variable; returning a literal from such a function raises a notice.
     *
     * @param string $word word to look up
     * @return bool true when the exact word was added before, false otherwise
     *              (including for the empty string)
     */
    public function &isword($word)
    {
        $found = false;
        $word = (string) $word;
        $length = strlen($word);

        if ($length === 0) {
            return $found;
        }

        // Read-only walk: plain copies are enough, no references needed.
        $node = $this->trie;
        for ($i = 0; $i < $length; $i++) {
            $character = $word[$i];
            if (!isset($node['children'][$character])) {
                return $found;
            }
            $node = $node['children'][$character];
        }

        $found = !empty($node['isword']);
        return $found;
    }

    /**
     * Finds every occurrence of every dictionary word in $text.
     *
     * Each byte offset is tried as a possible word start and the tree is
     * followed as far as the text allows, so overlapping words and words
     * that are prefixes of longer words are all reported. Results are
     * ordered by start offset, then by word length.
     *
     * @param string $text text to scan
     * @return array list of array('position' => $position, 'word' => $word),
     *               where position is the byte offset of the word in $text
     */
    public function search($text = "")
    {
        $text = (string) $text;
        $textlen = strlen($text);
        $find = array();

        for ($start = 0; $start < $textlen; $start++) {
            $node = $this->trie;
            $word = '';

            for ($i = $start; $i < $textlen; $i++) {
                $character = $text[$i];
                if (!isset($node['children'][$character])) {
                    // Dead end: the next candidate start is $start + 1, not
                    // the byte that failed, otherwise matches are skipped.
                    break;
                }
                $node = $node['children'][$character];
                $word .= $character;
                if (!empty($node['isword'])) {
                    $find[] = array('position' => $start, 'word' => $word);
                }
            }
        }

        return $find;
    }
}
1 2