日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

欢迎访问 生活随笔!

生活随笔

当前位置: 首页 > 编程资源 > 编程问答 > 内容正文

编程问答

java关键字匹配算法_简单关键词匹配算法

发布时间:2024/8/1 编程问答 24 豆豆
生活随笔 收集整理的这篇文章主要介绍了 java关键字匹配算法_简单关键词匹配算法,小编觉得挺不错的,现在分享给大家,帮大家做个参考。

针对微博的短篇博文,编写的简单分词和匹配算法。相对于一篇文档的复杂分词算法,能够在效率和可用性上得到较好的平衡。

package com.sina.tblog.sentiment;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileOutputStream;

import java.io.FileReader;

import java.io.IOException;

import java.io.OutputStreamWriter;

import java.util.ArrayList;

import java.util.HashSet;

import java.util.List;

import java.util.regex.Pattern;

import com.sina.tblog.sentiment.constant.Constant;

public class KeyWordFilter {

public static HashSet KeyWordsList = null;

public static HashSet letterKeyWordsList = null;

/**

* 初始化或重新導(dǎo)入關(guān)鍵詞列表

* @throws IOException

*/

static{

try {

initKeyWords(Constant.KeyWordsFiles);

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

}

public static int deleteNewWord(String word){

if(word.length()>10||word.length()<2)

return -1;

if(!KeyWordsList.contains(word))

return 0;

KeyWordsList.remove(word);

if(Pattern.compile("(?i)[a-z][A-Z]").matcher(word).find())

letterKeyWordsList.remove(word.toUpperCase());

FileOutputStream stream;

OutputStreamWriter writer;

try {

stream = new FileOutputStream(Constant.newWordsFile,true);

writer = new OutputStreamWriter(stream);

writer.write("\n"+word);

writer.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

return -1;

}

return 1;

}

public static int addWord(String word){

if(word.length()>10)

return -1;

if(KeyWordsList.contains(word))

return 0;

KeyWordsList.add(word);

if(Pattern.compile("(?i)[a-z][A-Z]").matcher(word).find())

letterKeyWordsList.add(word.toUpperCase());

FileOutputStream stream;

OutputStreamWriter writer;

try {

stream = new FileOutputStream(Constant.newWordsFile,true);

writer = new OutputStreamWriter(stream);

writer.write("\n"+word);

writer.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

return -1;

}

return 1;

}

private static void initKeyWords(String Files[]) throws IOException {

if(KeyWordsList!=null)

KeyWordsList.clear();

else

KeyWordsList = new HashSet();

if(letterKeyWordsList!=null)

letterKeyWordsList.clear();

else

letterKeyWordsList = new HashSet();

for(int i=0;i

File file = new File(Files[i]);

BufferedReader reader = null;

reader = new BufferedReader(new FileReader(file));

String tmp = reader.readLine();

while(tmp!=null){

KeyWordsList.add(tmp);

if(Pattern.compile("(?i)[a-z][A-Z]").matcher(tmp).find())

letterKeyWordsList.add(tmp.toUpperCase());

tmp = reader.readLine();

}

reader.close();

}

}

private static boolean findWord(String str,boolean ignoreCase){

if(ignoreCase == false)

return KeyWordsList.contains(str);

else{

boolean match = KeyWordsList.contains(str);

if(match == false){

match = letterKeyWordsList.contains(str.toUpperCase());

}

return match;

}

}

public static List segmentStrQuickMatch( String str_line,boolean ignoreCase)

{

String term = "";

boolean term_tag = false;

int str_size=0,left=0,len=0;

List list = new ArrayList();

str_size = str_line.length();

while(left

{

len = Constant.max_len;

while( len>=Constant.min_len )//gkm:每一詞

{

term="";

int right = left+len;

int x = 0;

if(right>str_size){

x = right-str_size;

right = str_size;

}

term=str_line.substring(left,right);

term_tag=findWord(term,ignoreCase);

if(term_tag==true)

break;

if(x>0)

len-=x+1;

else

len-=1;

}

if(term_tag==false)//gkm:詞典中沒有term,后移一個(gè)字符(以一個(gè)字符的速度后移,使得可以分出中英混合的詞,沒有判斷無(wú)效字符,有待改進(jìn)!!! )

{

left+=1;

}

else//gkm:詞典中有term,后移len個(gè)字符,term加入到terms_vct[term_tag]

{

left+=len;

list.add(term);

}

}//while(left

return list;

}

public static List segmentStrFullMatch( String str_line,boolean ignoreCase)

{

String term = "";

boolean term_tag = false;

int str_size=0,left=0,len=0;

List list = new ArrayList();

str_size = str_line.length();

while(left

{

len = Constant.max_len;

while( len>=Constant.min_len )//gkm:每一詞

{

term="";

int right = left+len;

int x = 0;

if(right>str_size){

x = right-str_size;

right = str_size;

}

term=str_line.substring(left,right);

term_tag=findWord(term,ignoreCase);

if(term_tag==true)

list.add(term);

if(x>0)

len-=x+1;

else

len-=1;

}

left+=1;

}//while(left

return list;

}

public static void main(String[] args) throws IOException {

System.out.println(segmentStrFullMatch("中華人民共和國(guó)",true));

}

}

分享到:

2012-12-18 15:17

浏览 504

评论

总结

以上是生活随笔为你收集整理的java关键字匹配算法_简单关键词匹配算法的全部内容,希望文章能够帮你解决所遇到的问题。

如果觉得生活随笔网站内容还不错,欢迎将生活随笔推荐给好友。