PHP基于字典的中英文数字混合分词算法RMM简易实现
生活随笔
收集整理的這篇文章主要介紹了《PHP基于字典的中英文数字混合分词算法RMM简易实现》,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
?
<?php

/**
 * Dictionary-based reverse maximum matching (RMM) segmenter for text that
 * mixes Chinese words, Latin letters and numbers.
 *
 * Segmentation runs in two passes:
 *   1. Scan the upper-cased input from its end towards its start, each time
 *      taking the longest suffix that is a dictionary word, or a single
 *      character when no dictionary word ends at that position.
 *   2. Merge neighbouring single characters of the same class (letters,
 *      numeric characters, everything else) into one token and drop
 *      whitespace.
 *
 * Algorithm reference: https://blog.csdn.net/xqhadoop/article/details/60757242
 */
class Seg
{
    /** Character class: anything that is neither a letter nor numeric (CJK, punctuation, ...). */
    const TYPE_OTHER = 0;

    /** Character class: ASCII letter A-Z (input is upper-cased before classification). */
    const TYPE_LETTER = 1;

    /** Character class: ASCII digit or one of the symbols + - * / . % */
    const TYPE_NUMBER = 2;

    /**
     * Dictionary stored as a lookup set: upper-cased word => true.
     *
     * @var array
     */
    private $dict = [];

    /**
     * Length, in characters, of the longest dictionary word.
     *
     * @var int
     */
    private $maxWordLength = 0;

    /**
     * Load the dictionary, replacing any previously loaded one.
     *
     * Words are upper-cased with strtoupper() so that matching against the
     * upper-cased input is case-insensitive for ASCII letters.
     *
     * @param array $vDict List of dictionary words.
     *
     * @return void
     */
    public function set_dict($vDict)
    {
        $lookup = [];
        $longest = 0;

        foreach ($vDict as $entry) {
            $word = strtoupper((string) $entry);
            // Array keys give constant-time, exact membership tests. A loose
            // in_array() would treat numeric strings such as '603' and '0603'
            // (or '1E3' and '1000') as the same word.
            $lookup[$word] = true;
            $longest = max($longest, mb_strlen($word));
        }

        $this->dict = $lookup;
        $this->maxWordLength = $longest;
    }

    /**
     * Segment a string into dictionary words, letter runs, number runs and
     * runs of other characters.
     *
     * The returned tokens are upper-cased (ASCII letters only), appear in
     * input order, never contain whitespace and are indexed 0..n-1.
     *
     * @param string $vStr Text to segment.
     *
     * @return array List of token strings; empty for empty input.
     */
    public function rmmseg($vStr = '')
    {
        $vStr = (string) $vStr;
        if ('' === $vStr) {
            return [];
        }

        if (empty($this->dict)) {
            // NOTE(review): terminating the process from library code is harsh,
            // but it is the behaviour existing callers see, so it is kept.
            exit('詞典為空');
        }

        // Upper-case so the comparison with the upper-cased dictionary matches.
        $text = strtoupper($vStr);

        return $this->mergeSingleChars($this->matchBackward($text));
    }

    /**
     * First pass: reverse maximum matching.
     *
     * Consumes the text from its end. At each step the longest suffix of the
     * unconsumed text that is a dictionary word is taken; when there is none,
     * the last single character is taken instead. Whitespace characters are
     * emitted like any other single character and removed in the second pass,
     * which guarantees the loop always advances.
     *
     * @param string $text Upper-cased input.
     *
     * @return array Tokens in input order.
     */
    private function matchBackward($text)
    {
        $tokens = [];
        $remaining = mb_strlen($text);

        while ($remaining > 0) {
            // A suffix longer than the longest dictionary word cannot match,
            // so start from that length. Always allow at least one character.
            $window = max(1, min($remaining, $this->maxWordLength));

            for ($length = $window; $length > 1; $length--) {
                $candidate = mb_substr($text, $remaining - $length, $length);
                if (isset($this->dict[$candidate])) {
                    break;
                }
            }

            // $length is the matched word length, or 1 when nothing matched.
            $tokens[] = mb_substr($text, $remaining - $length, $length);
            $remaining -= $length;
        }

        // Tokens were collected back-to-front.
        return array_reverse($tokens);
    }

    /**
     * Second pass: merge adjacent single-character tokens of the same class.
     *
     * Multi-character tokens (dictionary words) are passed through untouched
     * and end the current run. A whitespace character also ends the current
     * run and is itself discarded, so whitespace never reaches the output,
     * whether it is a single space, several spaces, or a tab/newline.
     *
     * @param array $tokens Tokens from the first pass.
     *
     * @return array Merged tokens, indexed 0..n-1.
     */
    private function mergeSingleChars(array $tokens)
    {
        $merged = [];
        $run = '';       // single characters accumulated so far
        $runType = null; // class of the characters in $run

        foreach ($tokens as $token) {
            $isWord = mb_strlen($token) > 1;
            $isBlank = !$isWord && '' === trim($token);

            if (!$isWord && !$isBlank) {
                $type = $this->classifyChar($token);
                if ('' !== $run && $type === $runType) {
                    $run .= $token;
                    continue;
                }
            }

            // The run ends here: flush it before handling the current token.
            if ('' !== $run) {
                $merged[] = $run;
                $run = '';
            }

            if ($isWord) {
                $merged[] = $token;
            } elseif (!$isBlank) {
                $run = $token;
                $runType = $type;
            }
        }

        if ('' !== $run) {
            $merged[] = $run;
        }

        return $merged;
    }

    /**
     * Classify a single, non-whitespace, upper-cased character.
     *
     * @param string $char Exactly one character.
     *
     * @return int One of the TYPE_* constants.
     */
    private function classifyChar($char)
    {
        if ($char >= 'A' && $char <= 'Z') {
            return self::TYPE_LETTER;
        }

        // Arithmetic symbols are grouped with digits so that values such as
        // "24.89", "1/8" or "0.5%" stay in one token.
        if (is_numeric($char) || in_array($char, ['+', '-', '*', '/', '.', '%'], true)) {
            return self::TYPE_NUMBER;
        }

        return self::TYPE_OTHER;
    }
}

//------------------------------------------------------------------
// Demo

$seg = new Seg();

// Dictionary
$dict = ['中華', '廣大', '人民', '共和國', '電阻', '電阻值', '貼片', '電壓','精度', 'RC', '功率', 'RES', 'OHM', '0603', '貼片電阻'];

$str = "貼片電阻Res0603889電阻值24.89kohm,電壓 25V 功率1/8w放";
$str .= "RC0603FR-0722kL,4.22k精度0.5%,99 88方式";
$str .= "中華人民共和國廣大";

$seg->set_dict($dict);
$res = $seg->rmmseg($str);

// Escape everything echoed into the page; tokens come from arbitrary text.
echo '原字符串=' . htmlspecialchars($str, ENT_QUOTES, 'UTF-8') . '<br>';
echo '<br/>分詞結果=';
echo "<style>.C_HIGHLIGHT{background:#ff0; border:1px solid orange;padding:1px 3px; margin-left:1px ;margin-top:2px;display:inline-block}</style>";

foreach ($res as $word) {
    echo "<span class='C_HIGHLIGHT'>" . htmlspecialchars($word, ENT_QUOTES, 'UTF-8') . "</span> ";
}
?>
參考:https://blog.csdn.net/xqhadoop/article/details/60757242
?
總結
以上是生活随笔為你收集整理的PHP基于字典的中英文数字混合分词算法RMM简易实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 分享一个自己写的取中国农历相关数据的类。
- 下一篇: PHP基本