1. simhash 的原理，比較詳細的參考鏈接：http://my.oschina.net/leejun2005/blog/150086
2. simhash 的 PHP 實現（見下文），主要用到了 PHP 的 GMP 擴展庫。
3. PS：oschina 的代碼編輯器真心難用，大家將就一下，之後會改好。希望能改進！
4. 由於工作原因，用 PHP 重寫了 simhash 算法。
<!-- lang: php --> <?php /** @author cabing_2005@126.com **/ class Simhash { public $m_hash = null; public $hashbits = null; //構造函數 public function __construct($tokens=array(), $hashbits=128) { $this->m_hashbits = $hashbits; $this->m_hash = $this->simhash($tokens); } //to string public function __toString() { return strval($this->m_hash); } //返回hash值 public function simhash($tokens) { if(!is_array($tokens)) { throw new Exception("tokens should be array"); } $v = array_fill(0,$this->m_hashbits,0); foreach($tokens as $x) { $x = $this->stringHash($x); for($i=0;$i<$this->m_hashbits;$i++) { $bitmask = gmp_init(1); gmp_setbit($bitmask, $i); $bitmask = gmp_sub($bitmask,1); if (gmp_strval(gmp_and($x,$bitmask)) != "0") { $v[$i] += 1; } else { $v[$i] -= 1; } } } $sum = 0; for($i=0;$i<$this->m_hashbits;$i++) { if ($v[$i] >= 0) { $num = gmp_init(1); gmp_setbit($num, $i); $num = gmp_sub($num,1); $sum = gmp_add($sum,$num); } } return gmp_strval($sum); } //求海明距離 public function hammingDistance($other) { $a = gmp_init($this->m_hash); $b = gmp_init($other->m_hash); $c = gmp_init(1); gmp_setbit($c, $this->m_hashbits); $c = gmp_sub($c,2); $x = gmp_and(gmp_xor($a,$b),$c); $tot = 0; while(gmp_strval($x)) { $tot += 1; $x = gmp_and($x,gmp_sub($x,1)); } return $tot; } //求類似度 public function similarity ($other) { $a = floatval($this->m_hash); $b = floatval($other->m_hash); if($a > $b) { return $b/$a; } else { return $a/$b; } } public function stringHash($source) { if(empty($source)) { return 0; } else { $x = ord($source[0]) << 7; $m = 1000003; $mask = gmp_sub(gmp_pow("2", $this->m_hashbits),1); $len = strlen($source); for($i=0;$i<$len;$i++) { $x = gmp_and(gmp_xor(gmp_mul($x,$m),ord($source[$i])),$mask); } $x = gmp_xor($x,$len); if(intval(gmp_strval($x)) == -1) { $x = -2; } return $x; } } } $s = 'This is a test string for testing'; $hash1 = new Simhash(explode(" ",$s)); $s = 'This is a test string for testing also'; $hash2 = new Simhash(explode(" ",$s)); $s = 'nai nai ge xiong cao'; $hash3 
= new Simhash(explode(" ",$s)); var_dump($hash1->hammingDistance($hash2) , " " , $hash1->similarity($hash2)); var_dump($hash1->hammingDistance($hash3) , " " , $hash1->similarity($hash3));