生活随笔
收集整理的這篇文章主要介紹了
Java 去除敏感詞
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1. DFA 方法
private CSLogger logger = Loggers.getLogger(SensitiveWordDAFService.class);

/**
 * Root node of the sensitive-word trie (the DFA). Every node is a map in which a
 * {@code Character} key leads to the child node for that character, and the String key
 * {@code "isEnd"} holds {@code "1"} when a dictionary word ends at that node and
 * {@code "0"} otherwise.
 */
private static Map<Object, Object> sensitiveWordMap = new HashMap<Object, Object>();

/** Match rule: stop at the first (shortest) reportable dictionary word. */
public static int minMatchTYpe = 1;

/** Match rule: keep scanning and report the longest dictionary word. */
public static int maxMatchType = 2;

/**
 * Matches shorter than this are never reported. This keeps the original
 * {@code matchFlag < 2} rule, which means one-character dictionary words are ignored.
 * NOTE(review): confirm that ignoring one-character words is intended.
 */
private static final int MIN_REPORTED_LENGTH = 2;

/**
 * Loads the word set supplied by {@code SensitiveWordUtil} into the trie and logs how long
 * the load took.
 */
public SensitiveWordDAFService() {
    long start = System.currentTimeMillis();
    SensitiveWordUtil sensitiveWordUtil = new SensitiveWordUtil();
    addSensitiveWordToHashMap(sensitiveWordUtil.getSensitiveWordSet());
    logger.info("敏感詞加載時間:" + (System.currentTimeMillis() - start) + "ms");
}

/**
 * Rebuilds the trie from the given dictionary.
 *
 * <p>The new trie is built in a local variable and assigned to the static field only once
 * it is complete, so readers never observe a half-built trie through the field.
 *
 * @param keyWordSet sensitive-word dictionary; {@code null} leaves the current trie
 *     untouched, and {@code null} or empty entries are skipped
 */
public static void addSensitiveWordToHashMap(Set<String> keyWordSet) {
    if (keyWordSet == null) {
        return;
    }
    // Pre-size the root to reduce rehashing while loading.
    Map<Object, Object> root = new HashMap<Object, Object>(keyWordSet.size());
    for (String key : keyWordSet) {
        if (key == null || key.length() == 0) {
            continue;
        }
        Map<Object, Object> node = root;
        for (int i = 0; i < key.length(); i++) {
            Character keyChar = Character.valueOf(key.charAt(i));
            Map<Object, Object> child = childOf(node, keyChar);
            if (child == null) {
                // New branch: not a word end until proven otherwise.
                child = new HashMap<Object, Object>();
                child.put("isEnd", "0");
                node.put(keyChar, child);
            }
            node = child;
        }
        // The node reached by the last character marks the end of a word.
        node.put("isEnd", "1");
    }
    sensitiveWordMap = root;
}

/** Returns the child trie node reached from {@code node} via {@code key}, or {@code null}. */
@SuppressWarnings("unchecked") // every value stored under a Character key is a trie node
private static Map<Object, Object> childOf(Map<Object, Object> node, Character key) {
    return (Map<Object, Object>) node.get(key);
}

/**
 * Checks whether a sensitive word starts at {@code beginIndex}.
 *
 * <p>Only the length recorded at the most recent word end is returned. Characters that
 * merely match the prefix of a longer, unfinished dictionary word are not counted.
 *
 * @param txt text to inspect; {@code null} yields 0
 * @param beginIndex index in {@code txt} at which matching starts
 * @param matchType {@link #minMatchTYpe} to stop at the shortest reportable word, any other
 *     value to look for the longest one
 * @return length of the sensitive word starting at {@code beginIndex}, or 0 when there is
 *     none (or it is shorter than two characters)
 */
public int CheckSensitiveWord(String txt, int beginIndex, int matchType) {
    if (txt == null) {
        return 0;
    }
    int walked = 0;        // characters consumed while following the trie
    int lastWordEnd = 0;   // value of 'walked' at the most recent word end
    Map<Object, Object> node = sensitiveWordMap;
    for (int i = beginIndex; i < txt.length(); i++) {
        node = childOf(node, Character.valueOf(txt.charAt(i)));
        if (node == null) {
            break; // no dictionary word continues with this character
        }
        walked++;
        if ("1".equals(node.get("isEnd"))) {
            lastWordEnd = walked;
            // A word end too short to be reported must not stop the scan, otherwise a
            // longer word sharing that prefix would be missed.
            if (minMatchTYpe == matchType && walked >= MIN_REPORTED_LENGTH) {
                break;
            }
        }
    }
    return lastWordEnd < MIN_REPORTED_LENGTH ? 0 : lastWordEnd;
}

/**
 * Tells whether the text contains at least one sensitive word.
 *
 * @param txt text to inspect; {@code null} yields {@code false}
 * @param matchType match rule, 1: shortest match, 2: longest match
 * @return {@code true} if a sensitive word is found, {@code false} otherwise
 */
public boolean isContaintSensitiveWord(String txt, int matchType) {
    if (txt == null) {
        return false;
    }
    for (int i = 0; i < txt.length(); i++) {
        if (CheckSensitiveWord(txt, i, matchType) > 0) {
            return true; // one hit is enough
        }
    }
    return false;
}

/**
 * Collects the distinct sensitive words occurring in the text.
 *
 * @param txt text to inspect; {@code null} yields an empty set
 * @param matchType match rule, 1: shortest match, 2: longest match
 * @return the sensitive words found, never {@code null}
 */
public Set<String> getSensitiveWord(String txt, int matchType) {
    Set<String> found = new HashSet<String>();
    if (txt == null) {
        return found;
    }
    int i = 0;
    while (i < txt.length()) {
        int length = CheckSensitiveWord(txt, i, matchType);
        if (length > 0) {
            found.add(txt.substring(i, i + length));
            i += length; // resume right after the matched word
        } else {
            i++;
        }
    }
    return found;
}

/**
 * Masks every sensitive word in the text, one copy of {@code replaceChar} per character.
 *
 * <p>Words are replaced literally. The dictionary word is not interpreted as a regular
 * expression and the replacement is not scanned for {@code $} or backslash references.
 *
 * @param txt text to filter; {@code null} is returned unchanged
 * @param matchType match rule, 1: shortest match, 2: longest match
 * @param replaceChar string written once per masked character, e.g. {@code "*"}
 * @return the filtered text
 */
public String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
    if (txt == null) {
        return null;
    }
    String resultTxt = txt;
    for (String word : getSensitiveWord(txt, matchType)) {
        resultTxt = resultTxt.replace(word, getReplaceChars(replaceChar, word.length()));
    }
    return resultTxt;
}

/**
 * Builds the mask for a word of the given length.
 *
 * @param replaceChar string to repeat
 * @param length number of repetitions
 * @return {@code replaceChar} repeated {@code length} times
 */
private String getReplaceChars(String replaceChar, int length) {
    StringBuilder mask = new StringBuilder();
    for (int i = 0; i < length; i++) {
        mask.append(replaceChar);
    }
    return mask.toString();
}

/** Demo: loads the dictionary, scans one sample sentence and prints the hits and timing. */
public static void main(String[] args) {
    SensitiveWordDAFService sensitiveWordService = new SensitiveWordDAFService();
    System.out.println("敏感詞數量:" + sensitiveWordMap.size());
    String query = "阿賓白黃牙簽java hadoop 開發 option";
    System.out.println("待檢測語句字數:" + query.length());
    long beginTime = System.currentTimeMillis();
    Set<String> set = sensitiveWordService.getSensitiveWord(query, 1);
    // Take the end time after the scan so the printed duration covers the actual work.
    long endTime = System.currentTimeMillis();
    System.out.println("語句中包含敏感詞的個數為:" + set.size() + "。包含:" + set);
    System.out.println("總共消耗時間為:" + (endTime - beginTime));
}
原理參考博客 https://blog.csdn.net/cdj0311/article/details/79789480
2. 雙層 for 循環去除敏感詞
/** Sensitive-word dictionary, loaded once in the constructor. */
private List<String> wordDicList = new ArrayList<String>();

/** Sensitive words found by the most recent {@link #filterInfo(String)} call. */
private List<String> sensitiveWordList = new ArrayList<String>();

/** Loads the dictionary from {@code SensitiveWordUtil} and prints its size. */
public SensitiveWordBaseService() {
    wordDicList = SensitiveWordUtil.getSensitiveWordList();
    System.out.println("敏感詞加載完成,長度:" + wordDicList.size());
}

/**
 * Masks every dictionary word occurring in {@code str} with {@code "**"}.
 *
 * <p>All matches are located on the unmodified input first, then the result is assembled
 * in a single left-to-right pass. This way a replacement whose length differs from the
 * matched word cannot shift the positions of later matches. Matches that overlap or are
 * nested inside each other are merged into one masked span. The words found are recorded
 * in {@code sensitiveWordList}; the dictionary itself is never modified.
 *
 * @param str text to filter; {@code null} or empty input is returned unchanged
 * @return the text with every sensitive span replaced by {@code "**"}
 */
public String filterInfo(String str) {
    sensitiveWordList = new ArrayList<String>();
    if (str == null || str.length() == 0) {
        return str;
    }

    // matchEnd[start] = exclusive end of the longest dictionary word starting at 'start',
    // or 0 when no word starts there.
    int[] matchEnd = new int[str.length()];
    for (String word : wordDicList) {
        if (word == null || word.length() == 0) {
            continue; // an empty word would match at every position and never advance
        }
        int from = 0;
        int start;
        while ((start = str.indexOf(word, from)) > -1) {
            int end = start + word.length();
            if (end > matchEnd[start]) {
                matchEnd[start] = end;
            }
            from = end; // continue searching after this occurrence
        }
    }

    StringBuilder masked = new StringBuilder(str.length());
    int pos = 0;
    while (pos < str.length()) {
        int spanEnd = matchEnd[pos];
        if (spanEnd == 0) {
            masked.append(str.charAt(pos));
            pos++;
            continue;
        }
        // Record every word beginning inside the span and let overlapping words extend it.
        for (int start = pos; start < spanEnd; start++) {
            int end = matchEnd[start];
            if (end == 0) {
                continue;
            }
            sensitiveWordList.add(str.substring(start, end));
            if (end > spanEnd) {
                spanEnd = end;
            }
        }
        masked.append("**");
        pos = spanEnd;
    }
    return masked.toString();
}

/** Demo: filters one sample sentence and prints the result and the elapsed time. */
public static void main(String[] args) {
    long start = System.currentTimeMillis();
    String string = "java 阿賓 阿賓 hadoop 開發 大保健 option";
    SensitiveWordBaseService sensitiveWordBaseService = new SensitiveWordBaseService();
    String s = sensitiveWordBaseService.filterInfo(string);
    System.out.println(s);
    System.out.println("用時:" + (System.currentTimeMillis() - start) + "ms");
}
3. 通過 IK 分詞器去除敏感詞,缺點是有的敏感詞會被切分,識別不出來
3.1 添加依賴
<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
3.2 代碼
/*** 敏感詞集合*/public static HashMap sensitiveWordMap=new HashMap();/*** 初始化敏感詞庫** @param sensitiveWordSet 敏感詞庫*/public static synchronized void init(Set<String> sensitiveWordSet) {//初始化敏感詞容器,減少擴容操作sensitiveWordMap = new HashMap(sensitiveWordSet.size());for (String sensitiveWord : sensitiveWordSet) {sensitiveWordMap.put(sensitiveWord, sensitiveWord);}}/*** 判斷文字是否包含敏感字符** @param txt 文字* @return 若包含返回true,否則返回false*/public static boolean contains(String txt) throws Exception {boolean flag = false;List<String> wordList = segment(txt);for (String word : wordList) {if (sensitiveWordMap.get(word) != null) {return true;}}return flag;}/*** 獲取文字中的敏感詞** @param txt 文字* @return*/public static Set<String> getSensitiveWord(String txt) throws IOException {Set<String> sensitiveWordList = new HashSet<String>();List<String> wordList = segment(txt);for (String word : wordList) {if (sensitiveWordMap.get(word) != null) {sensitiveWordList.add(word);}}return sensitiveWordList;}/*** 替換敏感字字符** @param txt 文本* @param replaceChar 替換的字符,匹配的敏感詞以字符逐個替換,如 語句:我愛中國人 敏感詞:中國人,替換字符:*, 替換結果:我愛**** @return*/public static String replaceSensitiveWord(String txt, char replaceChar) throws IOException {String resultTxt = txt;//獲取所有的敏感詞Set<String> sensitiveWordList = getSensitiveWord(txt);String replaceString;for (String sensitiveWord : sensitiveWordList) {replaceString = getReplaceChars(replaceChar, sensitiveWord.length());resultTxt = resultTxt.replaceAll(sensitiveWord, replaceString);}return resultTxt;}/*** 替換敏感字字符** @param txt 文本* @param replaceStr 替換的字符串,匹配的敏感詞以字符逐個替換,如 語句:我愛中國人 敏感詞:中國人,替換字符串:[屏蔽],替換結果:我愛[屏蔽]* @return*/public static String replaceSensitiveWord(String txt, String replaceStr) throws IOException {String resultTxt = txt;//獲取所有的敏感詞Set<String> sensitiveWordList = getSensitiveWord(txt);for (String sensitiveWord : sensitiveWordList) {resultTxt = resultTxt.replaceAll(sensitiveWord, replaceStr);}return resultTxt;}/*** 獲取替換字符串** @param replaceChar* @param length* @return*/private static String 
getReplaceChars(char replaceChar, int length) {String resultReplace = String.valueOf(replaceChar);for (int i = 1; i < length; i++) {resultReplace += replaceChar;}return resultReplace;}/*** 對語句進行分詞** @param text 語句* @return 分詞后的集合* @throws IOException*/private static List segment(String text) throws IOException {List<String> list = new ArrayList<String>();StringReader re = new StringReader(text);IKSegmenter ik = new IKSegmenter(re, true);Lexeme lex;while ((lex = ik.next()) != null) {list.add(lex.getLexemeText());}return list;}public static void main(String[] args) throws IOException {SensitiveWordUtil sensitiveWordUtil=new SensitiveWordUtil();Set<String> sensitiveWordSet = sensitiveWordUtil.getSensitiveWordSet();sensitiveWordSet.add("太多");sensitiveWordSet.add("愛戀");sensitiveWordSet.add("靜靜");sensitiveWordSet.add("哈哈");sensitiveWordSet.add("啦啦");sensitiveWordSet.add("感動");sensitiveWordSet.add("發呆");//初始化敏感詞庫init(sensitiveWordSet);/*** 需要進行處理的目標字符串*/System.out.println("敏感詞的數量:" + sensitiveWordMap.size());String string = "太多的傷感情懷也許只局限于飼養基地 熒幕中的情節。"+ "然后 我們的扮演的角色就是跟隨著主人公的喜紅客聯盟 怒哀樂而過于牽強的把自己的情感也附加于銀幕情節中,然后感動就流淚,"+ "難過就躺在某一個人的懷里盡情的闡述心扉或者手機卡復制器一個賤人一杯紅酒一部電影在夜 深人靜的晚上,關上電話靜靜的發呆著。";String string="阿賓白黃牙簽java hadoop 開發 option";System.out.println("待檢測語句字數:" + string.length());/*** 是否含有關鍵字*/try {boolean result = contains(string);System.out.println(result);} catch (Exception e) {e.printStackTrace();}/*** 獲取語句中的敏感詞*/Set<String> set = getSensitiveWord(string);System.out.println("語句中包含敏感詞的個數為:" + set.size() + "。包含:" + set);/*** 替換語句中的敏感詞*/String filterStr = replaceSensitiveWord(string, '*');System.out.println(filterStr);String filterStr2 = replaceSensitiveWord(string, "[*敏感詞*]");System.out.println(filterStr2);}
總結
以上是生活随笔為你收集整理的java 去除敏感词的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。