You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1137 lines
37 KiB

8 years ago
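<?php
/*
 * PHP word segmenter based on a Unicode-encoded dictionary
 * 1. Only applicable to PHP 5; requires the iconv function
 * 2. Segmentation uses the RMM (reverse maximum matching) algorithm; the dictionary must be specially compiled, and this class provides the MakeDict() method for that
 * 3. Basic workflow: SetSource -> StartAnalysis -> Get***Result
 * 4. The main dictionary is stored in a special encoded format, so it does not need to be loaded into memory
 *
 * Copyright IT柏拉图 QQ: 2500875 Email: 2500875#qq.com
 *
 * @version 2.0
 *
 */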
  // Constant definitions.
  // _SP_ : two-byte separator used while splitting UCS-2 encoded add-on dictionary lines.
  // UCS2 : iconv name of the internal working encoding (big-endian UCS-2).
  define('_SP_', "\xFF\xFE");
  define('UCS2', 'ucs-2be');
  /**
   * Dictionary-based word segmenter working on big-endian UCS-2 strings.
   *
   * Typical use: SetSource() -> StartAnalysis() -> GetFinallyResult() /
   * GetFinallyIndex() / GetFinallyKeywords(). Relies on the global constants
   * _SP_ and UCS2 and on the iconv extension.
   */
  class phpAnalysis
  {
      // Modulus used by _get_index() and number of slots in the dictionary header table
      public $mask_value = 0xFFFF;

      // Input and output character sets (only utf-8, gbk/gb2312/gb18030 and big5 are recognised)
      public $sourceCharSet = 'utf-8';
      public $targetCharSet = 'utf-8';

      // Result type; GetFinallyResult()/GetFinallyIndex() drop symbol tokens (types 3 and 5) when this is 2
      public $resultType = 1;

      // Chinese runs shorter than this many bytes (and shorter than 5 bytes) are not split;
      // notSplitLen = n (Chinese characters) * 2 + 1
      public $notSplitLen = 5;

      // Convert English words to lower case
      public $toLower = false;

      // Use maximum-segmentation mode when disambiguating word pairs
      public $differMax = false;

      // Try to merge consecutive single characters into new words
      public $unitWord = true;

      // Load the dictionaries directly when an instance is constructed
      public static $loadInit = true;

      // Prefer high-frequency words when disambiguating (not read anywhere in this class)
      public $differFreq = false;

      // Source string converted to UCS-2
      private $sourceString = '';

      // Add-on dictionary: $addonDic[section letter][UCS-2 word] = byte length
      public $addonDic = array();
      public $addonDicFile = 'dict/words_addons.dic';

      // Main dictionary
      public $dicStr = '';
      public $mainDic = array();
      public $mainDicHand = false;
      public $mainDicInfos = array();
      public $mainDicFile = 'dict/base_dic_full.dic';

      // Whether to load the whole dictionary up front (stored by the constructor, not read elsewhere in this class)
      private $isLoadAll = false;

      // Maximum length, in bytes, of a main-dictionary word (characters = value / 2)
      private $dicWordMax = 14;

      // Coarse segmentation result: list of array('w' => UCS-2 text, 't' => type)
      private $simpleResult = array();

      // Final segmentation result (array; rebuilt by StartAnalysis())
      private $finallyResult = array();

      // Reset by SetSource(); declared so that assignment does not create a dynamic property
      public $finallyIndex = array();

      // Whether the dictionaries have been loaded
      public $isLoadDic = false;

      // New words recognised or merged during analysis
      public $newWords = array();
      public $foundWordStr = '';

      // Time spent loading the dictionaries, in seconds
      public $loadTime = 0;

      /**
       * Constructor.
       *
       * @param string $source_charset charset of $source
       * @param string $target_charset charset used for returned results
       * @param bool   $load_all       stored in $isLoadAll
       * @param string $source         optional text to analyse
       *
       * @return void
       */
      public function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=true, $source='')
      {
          // Dictionary paths are resolved relative to this file
          $baseDir = dirname(__FILE__).'/';
          $this->addonDicFile = $baseDir.$this->addonDicFile;
          $this->mainDicFile = $baseDir.$this->mainDicFile;
          $this->SetSource( $source, $source_charset, $target_charset );
          $this->isLoadAll = $load_all;
          if( self::$loadInit )
          {
              $this->LoadDict();
          }
      }

      /**
       * Destructor: closes the main dictionary handle if one is open.
       */
      public function __destruct()
      {
          if( is_resource($this->mainDicHand) )
          {
              fclose( $this->mainDicHand );
          }
          $this->mainDicHand = false;
      }

      /**
       * Compute the header-table slot for a dictionary key.
       *
       * The bytes of $key are folded from last to first. The arithmetic must stay
       * exactly as is: compiled dictionary files depend on the resulting values.
       *
       * @param string $key UCS-2 encoded word
       * @return int value in the range 0 .. mask_value - 1
       */
      private function _get_index( $key )
      {
          $hash = 0x238f13af;
          for($pos = strlen($key) - 1; $pos >= 0; $pos--)
          {
              $hash += ($hash << 5);
              $hash ^= ord($key[$pos]);
              $hash &= 0x7fffffff;
          }
          return ($hash % $this->mask_value);
      }

      /**
       * Look a word up in the main dictionary file.
       *
       * Each header slot is 8 bytes: data offset, data length, entry count
       * (pack format "Inn"). Buckets that were read are cached in $mainDicInfos.
       *
       * @param string $key  UCS-2 encoded word
       * @param string $type 'word' returns the entry of $key, anything else the whole bucket
       * @return array|false the entry/bucket, or false when the word is unknown
       *                     or the dictionary cannot be read
       */
      public function GetWordInfos( $key, $type='word' )
      {
          if( !$this->mainDicHand )
          {
              $this->mainDicHand = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : false;
              if( !$this->mainDicHand )
              {
                  // No dictionary available: every word is unknown
                  return false;
              }
          }
          $keynum = $this->_get_index( $key );
          if( isset($this->mainDicInfos[ $keynum ]) )
          {
              $data = $this->mainDicInfos[ $keynum ];
          }
          else
          {
              fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
              $dat = fread($this->mainDicHand, 8);
              if( $dat === false || strlen($dat) < 8 )
              {
                  // Truncated header table
                  return false;
              }
              $arr = unpack('I1s/n1l/n1c', $dat);
              if( $arr['l'] == 0 )
              {
                  return false;
              }
              fseek($this->mainDicHand, $arr['s'], SEEK_SET);
              // NOTE(review): unserialize() on file content; only use trusted dictionary files.
              // '@' is kept deliberately so a corrupt bucket is treated as "not found".
              $data = @unserialize(fread($this->mainDicHand, $arr['l']));
              $this->mainDicInfos[ $keynum ] = $data;
          }
          if( !is_array($data) || !isset($data[$key]) )
          {
              return false;
          }
          return ($type=='word' ? $data[$key] : $data);
      }

      /**
       * Set the text to analyse and reset previous results.
       *
       * @param string $source
       * @param string $source_charset utf-8, gb* or big5 (case-insensitive)
       * @param string $target_charset
       *
       * @return bool false when $source is empty, the charset is unsupported
       *              or the conversion fails
       */
      public function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
      {
          $this->sourceCharSet = strtolower($source_charset);
          $this->targetCharSet = strtolower($target_charset);
          $this->simpleResult = array();
          $this->finallyResult = array();
          $this->finallyIndex = array();
          // Never keep the text of an earlier call around when this call fails
          $this->sourceString = '';
          if( $source == '' )
          {
              return false;
          }
          // Match against the normalised name so that 'UTF-8', 'GBK', ... are accepted too
          $charset = $this->sourceCharSet;
          if( preg_match("/^utf/", $charset) )
          {
              $utf8 = $source;
          }
          else if( preg_match("/^gb/", $charset) )
          {
              $utf8 = iconv('gb18030', 'utf-8', $source);
          }
          else if( preg_match("/^big/", $charset) )
          {
              $utf8 = iconv('big5', 'utf-8', $source);
          }
          else
          {
              return false;
          }
          if( $utf8 === false )
          {
              return false;
          }
          $converted = iconv('utf-8', UCS2, $utf8);
          if( $converted === false )
          {
              return false;
          }
          $this->sourceString = $converted;
          return true;
      }

      /**
       * Set the result type (only affects the "finally" result getters).
       *
       * @param int $rstype 1 = everything, 2 = drop symbol tokens
       *
       * @return void
       */
      public function SetResultType( $rstype )
      {
          $this->resultType = $rstype;
      }

      /**
       * Open the main dictionary and load the add-on dictionary.
       *
       * Add-on file format as read here: a line whose second byte is ':' starts
       * a section named by its first byte; every other non-empty line is a
       * comma-separated word list belonging to the current section.
       *
       * @param string $maindic optional path of an alternative main dictionary
       * @return void
       */
      public function LoadDict( $maindic='' )
      {
          $startt = microtime(true);
          if( $maindic != '' && file_exists($maindic) )
          {
              $this->mainDicFile = $maindic;
          }
          // Main dictionary: only opened, entries are read on demand
          if( is_resource($this->mainDicHand) )
          {
              fclose($this->mainDicHand);
          }
          $this->mainDicHand = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : false;

          // Add-on dictionary
          $lines = is_readable($this->addonDicFile) ? file($this->addonDicFile) : false;
          if( $lines === false )
          {
              $lines = array();
          }
          // UTF-8 form of the separator, so a whole line is converted with one iconv call
          $spstr = iconv(UCS2, 'utf-8', _SP_);
          $hw = '';
          foreach($lines as $d)
          {
              $d = trim($d);
              if( $d=='' )
              {
                  continue;
              }
              if( substr($d, 1, 1)==':' )
              {
                  $hw = substr($d, 0, 1);
                  continue;
              }
              $wall = iconv('utf-8', UCS2, join($spstr, explode(',', $d)));
              if( $wall === false )
              {
                  continue;
              }
              foreach(explode(_SP_, $wall) as $estr)
              {
                  $this->addonDic[$hw][$estr] = strlen($estr);
              }
          }
          $this->loadTime = microtime(true) - $startt;
          $this->isLoadDic = true;
      }

      /**
       * Check whether a word exists in the main dictionary.
       *
       * @param string $word UCS-2 encoded word
       * @return bool
       */
      public function IsWord( $word )
      {
          return ($this->GetWordInfos( $word ) !== false);
      }

      /**
       * Build the property suffix of a word from its dictionary entry.
       *
       * @param string $word UCS-2 encoded word
       * @return string "/" followed by entry[1] and entry[0]; "/s" for words
       *                shorter than 4 bytes or without an entry
       */
      public function GetWordProperty($word)
      {
          if( strlen($word)<4 )
          {
              return '/s';
          }
          $infos = $this->GetWordInfos($word);
          return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s";
      }

      /**
       * Register information for a word (usually a newly recognised one).
       *
       * Words shorter than 4 bytes are ignored. Note that entries are stored in
       * $mainDicInfos under the word itself, next to the buckets cached by
       * GetWordInfos() under numeric slot keys.
       *
       * @param string $word  UCS-2 encoded word
       * @param array  $infos array('c' => frequency, 'm' => part of speech)
       * @return void
       */
      public function SetWordInfos($word, $infos)
      {
          if( strlen($word)<4 )
          {
              return ;
          }
          if( isset($this->mainDicInfos[$word]) )
          {
              // The counters may not exist yet; start them at 1 instead of incrementing an undefined index
              $this->newWords[$word] = isset($this->newWords[$word]) ? $this->newWords[$word] + 1 : 1;
              $this->mainDicInfos[$word]['c'] = isset($this->mainDicInfos[$word]['c']) ? $this->mainDicInfos[$word]['c'] + 1 : 1;
          }
          else
          {
              $this->newWords[$word] = 1;
              $this->mainDicInfos[$word] = $infos;
          }
      }

      /**
       * Store a pending coarse segment and run the deep analysis on it.
       *
       * An ANSI segment (type 2) without any letter or one of @ # % + is
       * re-typed as a pure number (type 4); numbers are not analysed further.
       *
       * @param string $segment  UCS-2 text of the segment
       * @param int    $type     segment type (1 = CJK text, 2 = ANSI word)
       * @param int    $pos      index in $simpleResult
       * @param bool   $optimize passed through to _deep_analysis()
       * @return int the type that was stored
       */
      private function _flush_segment( $segment, $type, $pos, $optimize )
      {
          $this->simpleResult[$pos]['w'] = $segment;
          if( $type==2 && !preg_match('/[a-z@#%\+]/i', iconv(UCS2, 'utf-8', $segment)) )
          {
              $type = 4;
          }
          $this->simpleResult[$pos]['t'] = $type;
          if( $type != 4 )
          {
              $this->_deep_analysis($segment, $type, $pos, $optimize);
          }
          return $type;
      }

      /**
       * Run the analysis on the text given to SetSource().
       *
       * Segment types: 1 CJK text, 2 ANSI word (letters, digits, '.', '@', '#',
       * '+', ...), 3 ANSI symbol, 4 pure number, 5 non-ANSI symbol or
       * unsupported character, 13 book title, 21 book title kept for further
       * splitting in maximum-segmentation mode.
       *
       * @param bool $optimize whether to optimise the result (merging, new words)
       * @return void
       */
      public function StartAnalysis($optimize=true)
      {
          if( !$this->isLoadDic )
          {
              $this->LoadDict();
          }
          $this->simpleResult = $this->finallyResult = array();
          // Work on a copy with a trailing space appended to flush the last segment;
          // the stored source is left untouched so repeated calls give the same result.
          $src = $this->sourceString.chr(0).chr(32);
          $slen = strlen($src);

          // Full-width to half-width mapping (U+FF00..U+FF5E -> 0x20..0x7E)
          $sbcArr = array();
          for($i=0xFF00, $j=0; $i < 0xFF5F; $i++, $j++)
          {
              $sbcArr[$i] = 0x20 + $j;
          }

          // Coarse segmentation
          $onstr = '';
          $lastc = 1;
          $s = 0;
          for($i=0; $i < $slen; $i++)
          {
              $c = $src[$i].$src[++$i];
              $cn = hexdec(bin2hex($c));
              $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;

              // ANSI characters
              if( $cn < 0x80 )
              {
                  if( preg_match('/[0-9a-z@#%\+\.-]/i', chr($cn)) )
                  {
                      if( $lastc != 2 && $onstr != '' )
                      {
                          $this->_flush_segment($onstr, $lastc, $s, $optimize);
                          $s++;
                          $onstr = '';
                      }
                      $lastc = 2;
                      $onstr .= chr(0).chr($cn);
                  }
                  else
                  {
                      if( $onstr != '' )
                      {
                          $this->_flush_segment($onstr, $lastc, $s, $optimize);
                          $s++;
                      }
                      $onstr = '';
                      $lastc = 3;
                      if( $cn < 31 )
                      {
                          // Control characters are dropped
                          continue;
                      }
                      $this->simpleResult[$s]['w'] = chr(0).chr($cn);
                      $this->simpleResult[$s]['t'] = 3;
                      $s++;
                  }
                  continue;
              }

              // Regular text characters (CJK ideographs, Hangul, kana, bopomofo ranges)
              if( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
                  || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
              {
                  if( $lastc != 1 && $onstr != '' )
                  {
                      $this->_flush_segment($onstr, $lastc, $s, $optimize);
                      $s++;
                      $onstr = '';
                  }
                  $lastc = 1;
                  $onstr .= $c;
                  continue;
              }

              // Special symbols
              if( $onstr != '' )
              {
                  // The (possibly re-typed) segment type is still used by the book-title branch below
                  $lastc = $this->_flush_segment($onstr, $lastc, $s, $optimize);
                  $s++;
              }

              // Book title: opening mark U+300A, closing mark U+300B
              if( $cn == 0x300A )
              {
                  $tmpw = '';
                  $n = 1;
                  $isok = false;
                  $ew = chr(0x30).chr(0x0B);
                  while(true)
                  {
                      if( !isset($src[$i+$n+1]) )
                      {
                          break;
                      }
                      $w = $src[$i+$n].$src[$i+$n+1];
                      // Strict comparison: "0\x0B" is a numeric string on PHP 8, so '=='
                      // would also match other numeric-looking byte pairs.
                      if( $w === $ew )
                      {
                          $this->simpleResult[$s]['w'] = $c;
                          $this->simpleResult[$s]['t'] = 5;
                          $s++;
                          $this->simpleResult[$s]['w'] = $tmpw;
                          // Test before registering, otherwise the title is never reported
                          if( !isset($this->newWords[$tmpw]) )
                          {
                              $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
                              $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
                          }
                          $this->newWords[$tmpw] = 1;
                          $this->simpleResult[$s]['t'] = 13;
                          $s++;
                          // Maximum-segmentation mode: also segment the title itself
                          if( $this->differMax )
                          {
                              $this->simpleResult[$s]['w'] = $tmpw;
                              $this->simpleResult[$s]['t'] = 21;
                              $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                              $s++;
                          }
                          $this->simpleResult[$s]['w'] = $ew;
                          $this->simpleResult[$s]['t'] = 5;
                          $s++;
                          $i = $i + $n + 1;
                          $isok = true;
                          $onstr = '';
                          $lastc = 5;
                          break;
                      }
                      $n = $n + 2;
                      $tmpw .= $w;
                      if( strlen($tmpw) > 60 )
                      {
                          // Too long to be a title
                          break;
                      }
                  }
                  if( !$isok )
                  {
                      $this->simpleResult[$s]['w'] = $c;
                      $this->simpleResult[$s]['t'] = 5;
                      $s++;
                      $onstr = '';
                      $lastc = 5;
                  }
                  continue;
              }

              $onstr = '';
              $lastc = 5;
              if( $cn==0x3000 )
              {
                  // Ideographic space is dropped
                  continue;
              }
              $this->simpleResult[$s]['w'] = $c;
              $this->simpleResult[$s]['t'] = 5;
              $s++;
          }

          // Merge coarse and deep results
          $this->_sort_finally_result();
      }

      /**
       * Deep analysis of one coarse segment.
       *
       * @param string $str      UCS-2 text (may be shortened when a unit word is merged)
       * @param int    $ctype    segment type; 1 is treated as Chinese text, anything else is stored as is
       * @param int    $spos     index of the segment in $simpleResult
       * @param bool   $optimize passed through to _deep_analysis_cn()
       * @return void
       */
      private function _deep_analysis( &$str, $ctype, $spos, $optimize=true )
      {
          // Non-Chinese segment: stored unchanged, optionally lower-cased
          if( $ctype != 1 )
          {
              $this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
              return ;
          }

          $slen = strlen($str);
          // Normal-length sentence: dictionary segmentation
          if( $slen >= $this->notSplitLen || $slen >= 5 )
          {
              $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
              return ;
          }

          // Short run (one or two characters)
          $lastType = 0;
          if( $spos > 0 )
          {
              $lastType = $this->simpleResult[$spos-1]['t'];
          }
          $isUnit = isset($this->addonDic['u'][$str]);
          if( $lastType==4 && ( $isUnit || isset($this->addonDic['u'][substr($str, 0, 2)]) ) )
          {
              // number + unit word: merged into the preceding number segment
              $str2 = '';
              if( !$isUnit && isset($this->addonDic['s'][substr($str, 2, 2)]) )
              {
                  $str2 = substr($str, 2, 2);
                  $str = substr($str, 0, 2);
              }
              $ww = $this->simpleResult[$spos - 1]['w'].$str;
              $this->simpleResult[$spos - 1]['w'] = $ww;
              $this->simpleResult[$spos - 1]['t'] = 4;
              if( !isset($this->newWords[$ww]) )
              {
                  $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
                  $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
              }
              $this->simpleResult[$spos]['w'] = '';
              if( $str2 != '' )
              {
                  $this->finallyResult[$spos-1][] = $ww;
                  $this->finallyResult[$spos-1][] = $str2;
              }
          }
          else
          {
              $this->finallyResult[$spos][] = $str;
          }
      }

      /**
       * Dictionary segmentation of a Chinese run, scanning from the end of the
       * string towards the start and trying the longest candidate first.
       *
       * @param string $str      UCS-2 text
       * @param int    $lastec   unused
       * @param int    $spos     index of the segment in $simpleResult
       * @param int    $slen     byte length of $str
       * @param bool   $optimize whether to run _optimize_result() on the words found
       * @return void
       */
      private function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=true )
      {
          $quote1 = chr(0x20).chr(0x1C);
          $tmparr = array();
          // A short run directly after an opening quotation mark (U+201C) is treated as one word
          if( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']===$quote1 )
          {
              $tmparr[] = $str;
              if( !isset($this->newWords[$str]) )
              {
                  $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
                  $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
              }
              if( !$this->differMax )
              {
                  $this->finallyResult[$spos][] = $str;
                  return ;
              }
          }
          // $i always points at the second byte of the character that ends the unprocessed part
          for($i=$slen-1; $i > 0; $i -= 2)
          {
              // Single character ending at $i
              $nc = $str[$i-1].$str[$i];
              // Only the first character is left
              if( $i <= 2 )
              {
                  $tmparr[] = $nc;
                  $i = 0;
                  break;
              }
              $isok = false;
              // Temporarily turn $i into the byte length of the unprocessed part
              $i = $i + 1;
              for($k=$this->dicWordMax; $k>1; $k=$k-2)
              {
                  if( $i < $k )
                  {
                      continue;
                  }
                  $w = substr($str, $i-$k, $k);
                  if( strlen($w) <= 2 )
                  {
                      $i = $i - 1;
                      break;
                  }
                  if( $this->IsWord( $w ) )
                  {
                      $tmparr[] = $w;
                      $i = $i - $k + 1;
                      $isok = true;
                      break;
                  }
              }
              // No dictionary word ends here: keep the single character
              if( !$isok )
              {
                  $tmparr[] = $nc;
              }
          }
          if( count($tmparr)==0 )
          {
              return ;
          }
          $this->finallyResult[$spos] = array_reverse($tmparr);
          // Optimise (ambiguity handling, new words, numerals, names, ...)
          if( $optimize )
          {
              $this->_optimize_result( $this->finallyResult[$spos], $spos );
          }
      }

      /**
       * Optimise the words of one segment: merges numeral + unit, surname +
       * given name, word + suffix and pairs of single characters, registering
       * every merged word as a new word.
       *
       * Add-on dictionary sections used here: 'c' numerals, 'u' units,
       * 'n' leading words (usually surnames), 'a' suffix words, and 's'/'t'
       * words that block merging.
       *
       * @param array $smarr words of the segment (replaced by the optimised list)
       * @param int   $spos  index of the segment in $simpleResult
       * @return void
       */
      private function _optimize_result( &$smarr, $spos )
      {
          $newarr = array();
          $prePos = $spos - 1;
          $arlen = count($smarr);
          $i = $j = 0;
          // Numeral in the previous segment + unit word at the start of this one
          if( $prePos > -1 && !isset($this->finallyResult[$prePos]) )
          {
              $lastw = $this->simpleResult[$prePos]['w'];
              $lastt = $this->simpleResult[$prePos]['t'];
              if( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) )
              {
                  $merged = $lastw.$smarr[0];
                  $this->simpleResult[$prePos]['w'] = $merged;
                  $this->simpleResult[$prePos]['t'] = 4;
                  if( !isset($this->newWords[$merged]) )
                  {
                      $this->foundWordStr .= $this->_out_string_encoding( $merged ).'/mu, ';
                      $this->SetWordInfos($merged, array('c'=>1, 'm'=>'mu'));
                  }
                  $smarr[0] = '';
                  $i++;
              }
          }
          for(; $i < $arlen; $i++)
          {
              if( !isset( $smarr[$i+1] ) )
              {
                  $newarr[$j] = $smarr[$i];
                  break;
              }
              $cw = $smarr[$i];
              $nw = $smarr[$i+1];
              $ischeck = false;
              // Numeral + unit
              if( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) )
              {
                  // Maximum segmentation keeps the parts, wrapped in "(" ... ")"
                  if( $this->differMax )
                  {
                      $newarr[$j] = chr(0).chr(0x28);
                      $j++;
                      $newarr[$j] = $cw;
                      $j++;
                      $newarr[$j] = $nw;
                      $j++;
                      $newarr[$j] = chr(0).chr(0x29);
                      $j++;
                  }
                  $newarr[$j] = $cw.$nw;
                  if( !isset($this->newWords[$newarr[$j]]) )
                  {
                      $this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, ';
                      $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));
                  }
                  $j++; $i++; $ischeck = true;
              }
              // Leading word (usually a surname)
              else if( isset( $this->addonDic['n'][ $smarr[$i] ] ) )
              {
                  $is_rs = false;
                  // Pronouns, conjunctions and very frequent words are not taken as given names.
                  // NOTE(review): MakeDict() writes entries as array(freq, pos) without 'm'/'c'
                  // keys, so this guard looks dead for such dictionaries - confirm intended format.
                  if( strlen($nw)==4 )
                  {
                      $winfos = $this->GetWordInfos($nw);
                      if( isset($winfos['m']) && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
                      {
                          $is_rs = true;
                      }
                  }
                  if( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs )
                  {
                      $newarr[$j] = $cw.$nw;
                      // Try to append a third single character
                      if( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) )
                      {
                          $newarr[$j] .= $smarr[$i+2];
                          $i++;
                      }
                      if( !isset($this->newWords[$newarr[$j]]) )
                      {
                          $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));
                          $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';
                      }
                      // To limit damage from wrong guesses, the unmerged parts are kept as well
                      if( strlen($nw)==4 )
                      {
                          $j++;
                          $newarr[$j] = chr(0).chr(0x28);
                          $j++;
                          $newarr[$j] = $cw;
                          $j++;
                          $newarr[$j] = $nw;
                          $j++;
                          $newarr[$j] = chr(0).chr(0x29);
                      }
                      $j++; $i++; $ischeck = true;
                  }
              }
              // Suffix word (place names etc.)
              else if( isset($this->addonDic['a'][$nw]) )
              {
                  $is_rs = false;
                  // Same guard as above for the word in front of the suffix (see NOTE(review) there)
                  if( strlen($cw)>2 )
                  {
                      $winfos = $this->GetWordInfos($cw);
                      if( isset($winfos['m']) && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
                      {
                          $is_rs = true;
                      }
                  }
                  if( !isset($this->addonDic['s'][$cw]) && !$is_rs )
                  {
                      $newarr[$j] = $cw.$nw;
                      if( !isset($this->newWords[$newarr[$j]]) )
                      {
                          $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';
                          $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));
                      }
                      $i++; $j++; $ischeck = true;
                  }
              }
              // New-word detection: two adjacent single characters
              else if( $this->unitWord )
              {
                  if( strlen($cw)==2 && strlen($nw)==2
                      && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
                      && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw]) )
                  {
                      $newarr[$j] = $cw.$nw;
                      // Try to append a third single character (suffix or unit)
                      if( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) )
                      {
                          $newarr[$j] .= $smarr[$i+2];
                          $i++;
                      }
                      if( !isset($this->newWords[$newarr[$j]]) )
                      {
                          $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';
                          $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));
                      }
                      $i++; $j++; $ischeck = true;
                  }
              }
              // No rule matched
              if( !$ischeck )
              {
                  $newarr[$j] = $cw;
                  // Pair disambiguation in maximum-segmentation mode
                  if( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7 )
                  {
                      $slen = strlen($nw);
                      for($y=2; $y <= $slen-2; $y=$y+2)
                      {
                          $nhead = substr($nw, $y-2, 2);
                          $nfont = $cw.substr($nw, 0, $y-2);
                          if( $this->IsWord( $nfont.$nhead ) )
                          {
                              if( strlen($cw) > 2 )
                              {
                                  $j++;
                              }
                              $newarr[$j] = $nfont.$nhead;
                          }
                      }
                  }
                  $j++;
              }
          }
          $smarr = $newarr;
      }

      /**
       * Flatten the coarse and deep results into $finallyResult, a list of
       * array('w' => UCS-2 word, 't' => type). Words coming from the deep
       * analysis get type 20; type-21 helper segments are dropped.
       *
       * @return void
       */
      private function _sort_finally_result()
      {
          $merged = array();
          foreach($this->simpleResult as $k=>$v)
          {
              if( empty($v['w']) )
              {
                  continue;
              }
              if( isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0 )
              {
                  foreach($this->finallyResult[$k] as $w)
                  {
                      if( !empty($w) )
                      {
                          $merged[] = array('w' => $w, 't' => 20);
                      }
                  }
              }
              else if( $v['t'] != 21 )
              {
                  $merged[] = array('w' => $v['w'], 't' => $v['t']);
              }
          }
          $this->finallyResult = $merged;
      }

      /**
       * Convert a UCS-2 string to the target charset.
       *
       * @param string $str UCS-2 text
       * @return string text in utf-8, gb18030 or (for any other target) big5
       */
      private function _out_string_encoding( &$str )
      {
          $utf8 = iconv(UCS2, 'utf-8', $str);
          switch( $this->_source_result_charset() )
          {
              case 1:
                  return $utf8;
              case 2:
                  return iconv('utf-8', 'gb18030', $utf8);
              default:
                  return iconv('utf-8', 'big5', $utf8);
          }
      }

      /**
       * Get the final result as one string.
       *
       * Every word is prefixed with $spword, so the string starts with a separator.
       *
       * @param string $spword        separator put in front of each word
       * @param bool   $word_meanings append the GetWordProperty() suffix to each word
       * @return string
       */
      public function GetFinallyResult($spword=' ', $word_meanings=false)
      {
          $rsstr = '';
          foreach($this->finallyResult as $v)
          {
              if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) )
              {
                  continue;
              }
              $m = $word_meanings ? $this->GetWordProperty($v['w']) : '';
              $w = $this->_out_string_encoding($v['w']);
              if( $w != ' ' )
              {
                  $rsstr .= $spword.$w.$m;
              }
          }
          return $rsstr;
      }

      /**
       * Get the coarse result without type information.
       *
       * @return array list of segments in the target charset
       */
      public function GetSimpleResult()
      {
          $rearr = array();
          foreach($this->simpleResult as $v)
          {
              if( empty($v['w']) )
              {
                  continue;
              }
              $w = $this->_out_string_encoding($v['w']);
              if( $w != ' ' )
              {
                  $rearr[] = $w;
              }
          }
          return $rearr;
      }

      /**
       * Get the coarse result with type information (1 CJK text, 2 ANSI word,
       * 3 ANSI symbol, 4 number, 5 non-ANSI symbol or unrecognised character;
       * full-width ANSI characters count as ANSI).
       *
       * @return array index => array('w' => segment, 't' => type)
       */
      public function GetSimpleResultAll()
      {
          $rearr = array();
          foreach($this->simpleResult as $k=>$v)
          {
              $w = $this->_out_string_encoding($v['w']);
              if( $w != ' ' )
              {
                  $rearr[$k]['w'] = $w;
                  $rearr[$k]['t'] = $v['t'];
              }
          }
          return $rearr;
      }

      /**
       * Get the word-count index of the final result.
       *
       * @return array word => count, sorted by count in descending order
       */
      public function GetFinallyIndex()
      {
          $rearr = array();
          foreach($this->finallyResult as $v)
          {
              if( $this->resultType==2 && ($v['t']==3 || $v['t']==5) )
              {
                  continue;
              }
              $w = $this->_out_string_encoding($v['w']);
              if( $w == ' ' )
              {
                  continue;
              }
              $rearr[$w] = isset($rearr[$w]) ? $rearr[$w] + 1 : 1;
          }
          arsort( $rearr );
          return $rearr;
      }

      /**
       * Get the most frequent keywords, joined with ",".
       *
       * @param int $num maximum number of keywords
       * @return string
       */
      public function GetFinallyKeywords( $num = 10 )
      {
          if( $num <= 0 )
          {
              return '';
          }
          $n = 0;
          $okstr = '';
          foreach( $this->GetFinallyIndex() as $k => $v )
          {
              // Skip words of length 1
              if( strlen($k)==1 )
              {
                  continue;
              }
              // Skip two-byte words that are not plain alphanumerics
              elseif( strlen($k)==2 && preg_match('/[^0-9a-zA-Z]/', $k) )
              {
                  continue;
              }
              // Skip single Chinese characters
              elseif( strlen($k) < 4 && !preg_match('/[a-zA-Z]/', $k) )
              {
                  continue;
              }
              $okstr .= ($okstr=='' ? $k : ','.$k);
              $n++;
              // Stop once exactly $num keywords were collected
              if( $n >= $num )
              {
                  break;
              }
          }
          return $okstr;
      }

      /**
       * Map the target charset to a numeric code.
       *
       * @return int 1 utf-8, 2 gb*, 3 big5, 4 anything else
       */
      private function _source_result_charset()
      {
          if( preg_match("/^utf/", $this->targetCharSet) )
          {
              return 1;
          }
          if( preg_match("/^gb/", $this->targetCharSet) )
          {
              return 2;
          }
          if( preg_match("/^big/", $this->targetCharSet) )
          {
              return 3;
          }
          return 4;
      }

      /**
       * Compile a text dictionary into the binary format read by GetWordInfos().
       *
       * Source format: UTF-8 text, one "word,frequency,part-of-speech" entry
       * per line; lines starting with '@' are skipped. Enough memory must be
       * available to hold the whole dictionary.
       *
       * File layout: mask_value header slots of 8 bytes (pack "Inn": offset,
       * length, count) followed by the serialized buckets. The "I" code uses
       * the machine's integer size and byte order.
       *
       * @param string $source_file path of the text dictionary
       * @param string $target_file output path; defaults to $mainDicFile
       * @return bool false when a file cannot be opened, true otherwise
       */
      public function MakeDict( $source_file, $target_file='' )
      {
          $target_file = ($target_file=='' ? $this->mainDicFile : $target_file);
          $fp = is_readable($source_file) ? fopen($source_file, 'r') : false;
          if( !$fp )
          {
              return false;
          }
          $allk = array();
          while( ($line = fgets($fp, 512)) !== false )
          {
              if( $line === '' || $line[0]=='@' )
              {
                  continue;
              }
              $parts = explode(',', $line);
              if( count($parts) < 3 )
              {
                  // Blank or malformed line
                  continue;
              }
              list($w, $r, $a) = $parts;
              $a = trim( $a );
              $w = iconv('utf-8', UCS2, $w);
              if( $w === false || $w === '' )
              {
                  continue;
              }
              $allk[ $this->_get_index( $w ) ][ $w ] = array($r, $a);
          }
          fclose( $fp );

          $fp = fopen($target_file, 'w');
          if( !$fp )
          {
              return false;
          }
          $heade_rarr = array();
          $alldat = '';
          // Bucket data starts right after the header table
          $start_pos = $this->mask_value * 8;
          foreach( $allk as $k => $v )
          {
              $dat = serialize( $v );
              $dlen = strlen($dat);
              $alldat .= $dat;
              $heade_rarr[ $k ] = array($start_pos, $dlen, count( $v ));
              $start_pos += $dlen;
          }
          unset( $allk );
          for($i=0; $i < $this->mask_value; $i++)
          {
              $slot = isset($heade_rarr[$i]) ? $heade_rarr[$i] : array(0, 0, 0);
              fwrite($fp, pack("Inn", $slot[0], $slot[1], $slot[2]));
          }
          fwrite( $fp, $alldat);
          fclose( $fp );
          return true;
      }

      /**
       * Export the entries of the main dictionary as "word,frequency,part-of-speech" lines.
       *
       * @param string $targetfile output path
       * @return bool false when a file cannot be opened, true otherwise
       */
      public function ExportDict( $targetfile )
      {
          if( !$this->mainDicHand )
          {
              $this->mainDicHand = is_readable($this->mainDicFile) ? fopen($this->mainDicFile, 'r') : false;
              if( !$this->mainDicHand )
              {
                  return false;
              }
          }
          $fp = fopen($targetfile, 'w');
          if( !$fp )
          {
              return false;
          }
          // The header table has exactly mask_value slots (0 .. mask_value - 1)
          for($i=0; $i < $this->mask_value; $i++)
          {
              fseek($this->mainDicHand, $i * 8, SEEK_SET);
              $dat = fread($this->mainDicHand, 8);
              if( $dat === false || strlen($dat) < 8 )
              {
                  // Truncated header table
                  break;
              }
              $arr = unpack('I1s/n1l/n1c', $dat);
              if( $arr['l'] == 0 )
              {
                  continue;
              }
              fseek($this->mainDicHand, $arr['s'], SEEK_SET);
              // NOTE(review): unserialize() on file content; only use trusted dictionary files
              $data = @unserialize(fread($this->mainDicHand, $arr['l']));
              if( !is_array($data) )
              {
                  continue;
              }
              foreach($data as $k => $v)
              {
                  $w = iconv(UCS2, 'utf-8', $k);
                  fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
              }
          }
          fclose( $fp );
          return true;
      }
  }
  1077. ?>