Java---SimHash原理与实现
生活随笔
收集整理的這篇文章主要介紹了
Java---SimHash原理与实现
小編覺得挺不錯(cuò)的,現(xiàn)在分享給大家,幫大家做個(gè)參考.
Java—SimHash原理與實(shí)現(xiàn)
SimHash 原理
原理鏈接
SimHash 實(shí)現(xiàn)
package GetSimilar;import java.io.BufferedReader; import java.io.FileInputStream; import java.io.InputStreamReader; import java.math.BigInteger; import java.util.HashMap; import java.util.Map;import org.ansj.domain.Result; import org.ansj.domain.Term; import org.ansj.recognition.impl.StopRecognition; import org.ansj.splitWord.analysis.ToAnalysis; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.safety.Whitelist;/*** 完成SimHash的計(jì)算,將每個(gè)詞向量的權(quán)重由idf來決定* * @author Ove**/ public class SimHash2 {private int hashbits = 64; // 分詞后的hash數(shù);public Map<String, Double> IDF;public SimHash2(){IDF = getIDF();}public SimHash2(int hashbits) {this.hashbits = hashbits;}// 讀取本地文件,獲得對應(yīng)詞的idf值public Map<String, Double> getIDF(){Map<String, Double> result = new HashMap<String, Double>();try {// 打開文件FileInputStream fis = new FileInputStream("D:/Document/data/Similar/idf.txt");BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));String line = "";while ((line = br.readLine()) != null) {String[] split = line.split("\t");result.put(split[0], Double.valueOf(split[1]));}// 關(guān)閉文件br.close();fis.close();} catch (Exception e) {e.printStackTrace();}return result;}/*** 清除html標(biāo)簽* * @param content* @return*/private String cleanResume(String content) {// 若輸入為HTML,下面會(huì)過濾掉所有的HTML的tagcontent = Jsoup.clean(content, Whitelist.none());content = StringUtils.lowerCase(content);String[] strings = { " ", "\n", "\r", "\t", "\\r", "\\n", "\\t", " " };for (String s : strings) {content = content.replaceAll(s, "");}return content;}/*** 這個(gè)是對整個(gè)字符串進(jìn)行hash計(jì)算* * @return*/private BigInteger simHash(String tokens) {tokens = cleanResume(tokens); // cleanResume 刪除一些特殊字符int[] v = new int[this.hashbits];Result ansjList = wordAnalyzer(tokens);// System.out.println(ansjList);// 標(biāo)識該文檔中每個(gè)詞出現(xiàn)的次數(shù)Map<String, Integer> wordCount = new HashMap<String, Integer>();Integer count = 0;for (Term term : ansjList) {count = wordCount.get(term.getName());if (count == null) {wordCount.put(term.getName(), 1);} else {wordCount.put(term.getName(), count + 1);}}int len = wordCount.size();String word = "";for (Term term : ansjList) {word = term.getName(); // 分詞字符串// 2、將每一個(gè)分詞hash為一組固定長度的數(shù)列.比如 64bit 的一個(gè)整數(shù).BigInteger t = this.hash(word);for (int i = 0; i < this.hashbits; i++) {BigInteger bitmask = new BigInteger("1").shiftLeft(i);// 3、建立一個(gè)長度為64的整數(shù)數(shù)組(假設(shè)要生成64位的數(shù)字指紋,也可以是其它數(shù)字),// 對每一個(gè)分詞hash后的數(shù)列進(jìn)行判斷,如果是1000...1,那么數(shù)組的第一位和末尾一位加1,// 中間的62位減一,也就是說,逢1加1,逢0減1.一直到把所有的分詞hash數(shù)列全部判斷完畢.double tf = (double) wordCount.get(word) / len;if(IDF.get(word)==null) continue;Double weight = 100 * tf * IDF.get(word); // 添加權(quán)重,權(quán)重應(yīng)改為出現(xiàn)次數(shù),而不是根據(jù)詞性來指定。 // if(weight==null) continue;// if (wordCount.containsKey(word)) {// weight = wordCount.get(word);// }if (t.and(bitmask).signum() != 0) {// 這里是計(jì)算整個(gè)文檔的所有特征的向量和v[i] += weight;} else {v[i] -= weight;}}}BigInteger fingerprint = new BigInteger("0");for (int i = 0; i < this.hashbits; i++) {if (v[i] >= 0) {fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));}}return fingerprint;}/*** 對單個(gè)的分詞進(jìn)行hash計(jì)算;* * @param source* @return*/private BigInteger hash(String source) {if (source == null || source.length() == 0) {return new BigInteger("0");} else {/*** 當(dāng)sourece 的長度過短,會(huì)導(dǎo)致hash算法失效,因此需要對過短的詞補(bǔ)償*/while (source.length() < 3) {source = source + source.charAt(0);}char[] sourceArray = source.toCharArray();BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);BigInteger m = new BigInteger("1000003");BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1"));for (char item : sourceArray) {BigInteger temp = BigInteger.valueOf((long) item);x = x.multiply(m).xor(temp).and(mask);}x = x.xor(new BigInteger(String.valueOf(source.length())));if (x.equals(new BigInteger("-1"))) {x = new BigInteger("-2");}return x;}}/*** 計(jì)算海明距離,海明距離越小說明越相似;* * @param one* @param two* @return*/public int hammingDistance(String s1, String s2) {BigInteger one = this.simHash(s1);BigInteger two = this.simHash(s2);BigInteger m = new BigInteger("1").shiftLeft(this.hashbits).subtract(new BigInteger("1"));BigInteger x = one.xor(two).and(m);int tot = 0;while (x.signum() != 0) {tot += 1;x = x.and(x.subtract(new BigInteger("1")));}return tot;}/*** 計(jì)算文本相似度* * @param s1* @param s2* @return*/public double getSemblance(String s1, String s2) {double i = (double) this.hammingDistance(s1, s2);return 1 - i / this.hashbits;}/*** 對單行文本進(jìn)行分詞,分詞后每個(gè)詞以空格相隔開* * @param line* @return*/public static Result wordAnalyzer(String line) {StopRecognition filter = new StopRecognition();filter.insertStopNatures("w"); // 過濾標(biāo)點(diǎn)filter.insertStopNatures("null"); // 過濾空格filter.insertStopNatures("m"); // 過濾數(shù)詞,會(huì)將 半數(shù) 該詞當(dāng)做是數(shù)詞進(jìn)行過濾String[] stopWords = { "如圖所示", "的", "中", "下列", "說法", "正確", "是", "若", "為", "則", "在" };filter.insertStopWords(stopWords); // 過濾單個(gè)詞Result fliterContent = ToAnalysis.parse(line).recognition(filter);return fliterContent;}public static void main(String[] args) {String s1 = "如圖所示,電源電壓保持不變,閉合開關(guān) S 0,滑動(dòng)變阻器R的滑片向右移動(dòng)的過程中,下列說法正確的是 A.閉合開關(guān)S,若甲、乙均為電壓表,則兩表示數(shù)均變小 B.斷開開關(guān)S,若甲、乙均為電流表,則兩表示數(shù)均變大 C.閉合開關(guān)S,若甲、乙均為電壓表,則甲示數(shù)不變,乙示數(shù)變大 D.斷開開關(guān)S,若甲、乙均為電流表,則乙示數(shù)不變,甲示數(shù)變小";String s2 = "小明今天去吃肯德基";SimHash ob = new SimHash();System.out.println(ob.getSemblance(s1, s2));// System.out.println(new BigInteger("1000003"));}}總結(jié)
以上是生活随笔為你收集整理的Java---SimHash原理与实现的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: php企业网站通讯录管理系统,EML企业
- 下一篇: java美元兑换,(Java实现) 美元