用 Java 实现计算 SimHash 海明距离的代码
时间: 2023-04-03 18:01:18 浏览: 69
以下是用 Java 实现的计算两个字符串 SimHash 指纹之间海明距离的代码:
```
import java.math.BigInteger;
import java.util.StringTokenizer;
public class SimHash {
private String tokens;
private BigInteger hash;
public SimHash(String tokens) {
this.tokens = tokens;
this.hash = this.simHash();
}
public BigInteger getHash() {
return this.hash;
}
public int hammingDistance(SimHash other) {
BigInteger x = this.hash.xor(other.hash);
int distance = ;
while (x.signum() != ) {
distance += 1;
x = x.and(x.subtract(BigInteger.ONE));
}
return distance;
}
private BigInteger simHash() {
int[] v = new int[64];
StringTokenizer tokenizer = new StringTokenizer(this.tokens);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
BigInteger hash = HashUtil.hash(token);
for (int i = ; i < 64; i++) {
BigInteger bitmask = BigInteger.ONE.shiftLeft(i);
if (hash.and(bitmask).signum() != ) {
v[i] += 1;
} else {
v[i] -= 1;
}
}
}
BigInteger fingerprint = BigInteger.ZERO;
for (int i = ; i < 64; i++) {
if (v[i] >= ) {
fingerprint = fingerprint.add(BigInteger.ONE.shiftLeft(i));
}
}
return fingerprint;
}
public static void main(String[] args) {
String s1 = "This is a test string for simhash";
String s2 = "This is a test string for simhash, which is used to calculate the distance between two strings.";
SimHash hash1 = new SimHash(s1);
SimHash hash2 = new SimHash(s2);
System.out.println(hash1.hammingDistance(hash2));
}
}
/**
 * String hashing helper: a multiplicative hash whose running value is masked to
 * 31 bits after every character and finally XORed with the string length.
 */
class HashUtil {
    /** Multiplier applied to the running hash before each character is mixed in. */
    private static final long MULTIPLIER = 1000003L;

    /** Keeps the running hash within the low 31 bits. */
    private static final long MASK = (1L << 31) - 1;

    /**
     * Hashes {@code str}.
     *
     * @param str the string to hash; may be {@code null}
     * @return {@link BigInteger#ZERO} for a {@code null} or empty string, otherwise a
     *         non-negative value that fits in 31 bits
     */
    public static BigInteger hash(String str) {
        if (str == null || str.length() == 0) {
            return BigInteger.ZERO;
        }
        char[] charArray = str.toCharArray();

        // Seed with the first character shifted left by 7 bits.
        long x = ((long) charArray[0]) << 7;
        for (char c : charArray) {
            // x stays below 2^31 after the first pass (and below 2^23 before it), so
            // x * MULTIPLIER is below 2^51 and cannot overflow a long.
            x = ((x * MULTIPLIER) ^ c) & MASK;
        }
        x ^= charArray.length;

        // x is always non-negative here (a 31-bit value XORed with a positive int),
        // so no remapping of a -1 result is needed.
        return BigInteger.valueOf(x);
    }
}
```