SimHash 的 Java 實現:
import com.hankcs.hanlp.seg.common.Term; import com.hankcs.hanlp.tokenizer.StandardTokenizer; import java.math.BigInteger; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.safety.Whitelist; /** * Created by Yangyang Deng on 17-9-7. */ public class SimhashAlgoService { public static void main(String[] args) { SimhashAlgoService simhashAlgoService = new SimhashAlgoService(); String string = "勞斯萊斯女神\n" + "\n" + "這個車標的設計者是英國畫家兼雕刻家查爾斯·賽克斯。20世紀初,經朋友蒙塔古邀請,賽克斯負責爲勞斯萊斯設計一尊雕塑車標。當時,已婚的蒙塔古瘋狂地愛着他的女祕書桑頓,懇請賽克斯以桑頓爲原型設計車標。因此,賽克斯的最初設計中,雕像是一尊披着長袍的女人將手指放在嘴脣上,象徵着蒙塔古與桑頓之間不能說的祕密情史。這個戀愛故事歷經重重磨難,桑頓身份地位曾是脫衣舞女郎,因此兩人根本沒法在一塊兒生活,在獲得家庭與蒙塔古妻子的諒解後,兩人最終能夠走到一塊兒,不幸的是,後來桑頓在一次乘船旅行中不幸遭遇德軍水雷,永遠沉入了冰冷的大海。\n" + "\n" + "後來,他們這段美好的愛情又略帶悽慘故事就保留在了這個車標上,羅 -羅二人也是蒙塔古的好友,他們得知這件事以後很是感動。後來,他們邀請賽克斯又把它改成雙手如羽翼般向後伸展的形象,也就是今天的「飛天女神」。 1911年,它正式成爲勞斯萊斯車的車標。今後,勞斯萊斯的飛天女神車標更是美麗的愛情象徵了!"; // 返回的指紋已經被切分紅4段,方便利用指紋做對比。具體對比方式可自行百度。 List<String> fingerPrints = simhashAlgoService.simHash(string,64); System.out.println(fingerPrints); } private StandardTokenizer hanlpService; // 待分詞的文本 private String tokens; // 十進制的指紋 private BigInteger intSimHash; // 二進制的指紋 private String strSimHash; // 二進制指紋的4個子指紋 private String strSimHashA; private String strSimHashB; private String strSimHashC; private String strSimHashD; private Map<String,Integer> wordCount; private int overCount = 5; public BigInteger getIntSimHash(){ return this.intSimHash; } public String getStrSimHash() { return this.strSimHash; } private String getStrSimHashA() { return this.strSimHashA; } private String getStrSimHashB() { return this.strSimHashB; } private String getStrSimHashC() { return this.strSimHashC; } private String getStrSimHashD() { return this.strSimHashD; } // 指紋的長度 private int hashbits = 64; // 停用的詞性 private Map<String,String> stopNatures = new HashMap<String, String>(); // 詞性的權重 private Map<String, Integer> weightOfNature = new 
HashMap<String, Integer>(); public void setTokens(String tokens) { this.tokens = tokens; } public void setHashbits(int hashbits) { this.hashbits = hashbits; } private void setMap() { // 停用詞性爲w:標點 this.stopNatures.put("w",""); // 個性化設置詞性權重,這裏將n:名詞設置爲2。(默認權重爲1) this.weightOfNature.put("n",2); } private String preProcess(String content) { // 若輸入爲HTML,下面會過濾掉全部的HTML的tag content = Jsoup.clean(content, Whitelist.none()); content = StringUtils.lowerCase(content); String[] strings = {" ","\n","\\r","\\n","\\t"," "}; for (String s:strings) { content = content.replace(s,""); } return content; } public List<String> simHash(String tokens, int hashbits) { tokens = preProcess(tokens); // cleanResume 刪除簡歷固有文字 this.tokens = cleanResume(tokens); this.hashbits = hashbits; this.wordCount = new HashMap<String, Integer>(); setMap(); // 定義特徵向量/數組 int[] v = new int[this.hashbits]; // 一、將文本去掉格式後, 分詞. List<Term> termList = StandardTokenizer.segment(this.tokens); for (Term term:termList){ String word = term.word; String nature = term.nature.toString(); // 過濾超頻詞 if (this.wordCount.containsKey(word)) { int count = this.wordCount.get(word); if (count>this.overCount) {continue;} this.wordCount.put(word,count+1); } else { this.wordCount.put(word,1); } // 過濾停用詞性 if (this.stopNatures.containsKey(nature)) {continue;} // 二、將每個分詞hash爲一組固定長度的數列.好比 64bit 的一個整數. BigInteger t = this.hash(word); for (int i = 0; i < this.hashbits; i++) { BigInteger bitmask = new BigInteger("1").shiftLeft(i); // 三、創建一個長度爲64的整數數組(假設要生成64位的數字指紋,也能夠是其它數字), // 對每個分詞hash後的數列進行判斷,若是是1000...1,那麼數組的第一位和末尾一位加1, // 中間的62位減一,也就是說,逢1加1,逢0減1.一直到把全部的分詞hash數列所有判斷完畢. 
int weight = 1; if (this.weightOfNature.containsKey(nature)) { weight = this.weightOfNature.get(nature); } if (t.and(bitmask).signum() != 0) { // 這裏是計算整個文檔的全部特徵的向量和 v[i] += weight; } else { v[i] -= weight; } } } BigInteger fingerprint = new BigInteger("0"); StringBuffer simHashBuffer = new StringBuffer(); for (int i = 0; i < this.hashbits; i++) { // 四、最後對數組進行判斷,大於0的記爲1,小於等於0的記爲0,獲得一個 64bit 的數字指紋/簽名. if (v[i] >= 0) { fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i)); simHashBuffer.append("1"); } else { simHashBuffer.append("0"); } } this.strSimHash = simHashBuffer.toString(); this.strSimHashA = simHashBuffer.substring(0,16); this.strSimHashB = simHashBuffer.substring(16,32); this.strSimHashC = simHashBuffer.substring(32,48); this.strSimHashD = simHashBuffer.substring(48,64); this.intSimHash = fingerprint; List<String> simHashList = new ArrayList<String>(); simHashList.add(this.getStrSimHashA()); simHashList.add(this.getStrSimHashB()); simHashList.add(this.getStrSimHashC()); simHashList.add(this.getStrSimHashD()); return simHashList; } private BigInteger hash(String source) { if (source == null || source.length() == 0) { return new BigInteger("0"); } else { /** * 當sourece 的長度太短,會致使hash算法失效,所以須要對太短的詞補償 */ while (source.length()<3) { source = source+source.charAt(0); } char[] sourceArray = source.toCharArray(); BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7); BigInteger m = new BigInteger("1000003"); BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1")); for (char item : sourceArray) { BigInteger temp = BigInteger.valueOf((long) item); x = x.multiply(m).xor(temp).and(mask); } x = x.xor(new BigInteger(String.valueOf(source.length()))); if (x.equals(new BigInteger("-1"))) { x = new BigInteger("-2"); } return x; } } // 用於計算十進制的hamming距離 public int hammingDistance(SimhashAlgoService other) { BigInteger x = this.intSimHash.xor(other.intSimHash); int tot = 0; // 統計x中二進制位數爲1的個數 // 
咱們想一想,一個二進制數減去1,那麼,從最後那個1(包括那個1)後面的數字全都反了,對吧,而後,n&(n-1)就至關於把後面的數字清0, // 咱們看n能作多少次這樣的操做就OK了。 while (x.signum() != 0) { tot += 1; x = x.and(x.subtract(new BigInteger("1"))); } return tot; } // 用於計算二進制的hamming距離 public int getDistance(String str1, String str2) { int distance; if (str1.length() != str2.length()) { distance = -1; } else { distance = 0; for (int i = 0; i < str1.length(); i++) { if (str1.charAt(i) != str2.charAt(i)) { distance++; } } } return distance; } public List subByDistance(SimhashAlgoService Simhash, int distance) { // 分紅幾組來檢查 int numEach = this.hashbits / (distance + 1); List characters = new ArrayList(); StringBuffer buffer = new StringBuffer(); int k = 0; for (int i = 0; i < this.intSimHash.bitLength(); i++) { // 當且僅當設置了指定的位時,返回 true boolean sr = Simhash.intSimHash.testBit(i); if (sr) { buffer.append("1"); } else { buffer.append("0"); } if ((i + 1) % numEach == 0) { // 將二進制轉爲BigInteger BigInteger eachValue = new BigInteger(buffer.toString(), 2); System.out.println("----" + eachValue); buffer.delete(0, buffer.length()); characters.add(eachValue); } } return characters; } // 過濾無關內容 private String cleanResume(String content) { String[] tobeReplace = { "\n","\r","\t","\\n","\\r","\\t" }; for (String s:tobeReplace) { content = content.replace(s,""); } return content; } }
pom 文件依賴:
<dependencies>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.3.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
</dependencies>