写出用 TF-IDF 算法计算多篇中文文本相似度的 Android 代码
时间: 2024-05-13 15:15:45 浏览: 11
以下是基于TF-IDF算法计算多篇中文文本相似度的Android代码:
1. 首先,需要导入相关的依赖:
```
dependencies {
    ...
    // HanLP's self-contained "portable" build is published on Maven Central under the
    // group id "com.hankcs". StandardTokenizer is part of this artifact, so no separate
    // tokenizer dependency is required.
    implementation 'com.hankcs:hanlp:portable-1.7.8'
    ...
}
```
2. 接着,定义一个 TFIDF 工具类:先用 HanLP 对文本分词,再结合 getDocList() 提供的示例语料计算每个词的 TF-IDF 分数:
```
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Computes TF-IDF weights for Chinese text, using HanLP's {@code StandardTokenizer} for word
 * segmentation.
 *
 * <p>Term frequency is the term's count divided by the number of tokens in the text. Inverse
 * document frequency uses the smoothed form {@code ln((1 + N) / (1 + df)) + 1}, which is always
 * at least 1, so terms that occur in every corpus document still receive a positive weight
 * and an empty corpus cannot cause a division by zero.
 */
public class TFIDF {

    /**
     * Computes TF-IDF weights for {@code text} against the built-in sample corpus.
     *
     * @param text the text to analyse; {@code null} or empty yields an empty map
     * @return a mutable map from each distinct segmented word to its TF-IDF weight
     */
    public static Map<String, Float> getTFIDF(String text) {
        return getTFIDF(text, getDocList());
    }

    /**
     * Computes TF-IDF weights for {@code text} against a caller-supplied corpus.
     *
     * @param text the text to analyse; {@code null} or empty yields an empty map
     * @param corpus the documents used to derive document frequencies; {@code null} is treated
     *     as an empty corpus, and {@code null} or empty entries are ignored
     * @return a mutable map from each distinct segmented word to its TF-IDF weight
     */
    public static Map<String, Float> getTFIDF(String text, List<String> corpus) {
        Map<String, Float> tfidfMap = new HashMap<>();
        if (text == null || text.isEmpty()) {
            return tfidfMap;
        }

        // Word segmentation.
        List<Term> terms = StandardTokenizer.segment(text);
        if (terms.isEmpty()) {
            return tfidfMap;
        }

        // Raw occurrence count of every distinct word in the text.
        Map<String, Integer> termCounts = new HashMap<>();
        for (Term term : terms) {
            termCounts.merge(term.word, 1, Integer::sum);
        }

        // Segment the corpus once, up front, instead of once per distinct word.
        List<Set<String>> corpusVocabularies = segmentCorpus(corpus);
        int docCount = corpusVocabularies.size();
        int termTotal = terms.size();

        for (Map.Entry<String, Integer> entry : termCounts.entrySet()) {
            String word = entry.getKey();
            float tf = (float) entry.getValue() / termTotal;

            // Document frequency counts documents that contain the word as a segmented term;
            // a raw substring test would also match the word inside longer, unrelated words.
            int df = 0;
            for (Set<String> vocabulary : corpusVocabularies) {
                if (vocabulary.contains(word)) {
                    df++;
                }
            }

            // Smoothed IDF: never negative, never divides by zero.
            float idf = (float) (Math.log((docCount + 1.0) / (df + 1.0)) + 1.0);
            tfidfMap.put(word, tf * idf);
        }
        return tfidfMap;
    }

    /** Segments every usable corpus document into the set of distinct words it contains. */
    private static List<Set<String>> segmentCorpus(List<String> corpus) {
        List<Set<String>> vocabularies = new ArrayList<>();
        if (corpus == null) {
            return vocabularies;
        }
        for (String doc : corpus) {
            if (doc == null || doc.isEmpty()) {
                continue;
            }
            Set<String> vocabulary = new HashSet<>();
            for (Term term : StandardTokenizer.segment(doc)) {
                vocabulary.add(term.word);
            }
            vocabularies.add(vocabulary);
        }
        return vocabularies;
    }

    /** Returns the built-in sample corpus that stands in for a real document collection. */
    private static List<String> getDocList() {
        List<String> docList = new ArrayList<>();
        docList.add("这是第一篇文本,用于测试。");
        docList.add("这是第二篇文本,用于测试。");
        docList.add("这是第三篇文本,用于测试。");
        return docList;
    }
}
```
3. 最后,在 Activity 中调用该方法得到每篇文本的 TF-IDF 向量,再用余弦相似度计算多篇文本两两之间的相似度(该代码片段省略了 import 语句,需自行导入 AppCompatActivity、Bundle、Log 与 java.util.Map):
```
/**
 * Demo activity that builds TF-IDF vectors for three sample sentences and logs the cosine
 * similarity of every pair.
 */
public class MainActivity extends AppCompatActivity {

    /** Tag attached to every log line written by this activity. */
    private static final String TAG = "MainActivity";

    /**
     * Inflates the layout, computes the TF-IDF vector of each sample text and logs the three
     * pairwise similarities at debug level.
     *
     * <p>NOTE(review): all of the work runs synchronously inside this callback.
     *
     * @param savedInstanceState forwarded unchanged to the superclass
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        final String firstText = "这是第一篇文本,用于测试。";
        final String secondText = "这是第二篇文本,用于测试,但与第一篇文本不同。";
        final String thirdText = "这是第三篇文本,用于测试,与前两篇文本都不同。";

        final Map<String, Float> firstWeights = TFIDF.getTFIDF(firstText);
        final Map<String, Float> secondWeights = TFIDF.getTFIDF(secondText);
        final Map<String, Float> thirdWeights = TFIDF.getTFIDF(thirdText);

        Log.d(TAG, "similarity between text1 and text2: "
                + getSimilarity(firstWeights, secondWeights));
        Log.d(TAG, "similarity between text1 and text3: "
                + getSimilarity(firstWeights, thirdWeights));
        Log.d(TAG, "similarity between text2 and text3: "
                + getSimilarity(secondWeights, thirdWeights));
    }

    /**
     * Returns the cosine similarity of two sparse weight vectors keyed by word.
     *
     * @param tfidf1 weights of the first text
     * @param tfidf2 weights of the second text; words missing here count as weight 0
     * @return the dot product divided by the product of both vector lengths, or 0 when that
     *     product is 0
     */
    private float getSimilarity(Map<String, Float> tfidf1, Map<String, Float> tfidf2) {
        float dotProduct = 0;
        float squaredLength1 = 0;
        for (Map.Entry<String, Float> entry : tfidf1.entrySet()) {
            float left = entry.getValue();
            float right = tfidf2.getOrDefault(entry.getKey(), 0f);
            dotProduct += left * right;
            squaredLength1 += left * left;
        }

        float squaredLength2 = 0;
        for (float weight : tfidf2.values()) {
            squaredLength2 += weight * weight;
        }

        float lengthProduct = (float) (Math.sqrt(squaredLength1) * Math.sqrt(squaredLength2));
        return lengthProduct == 0 ? 0 : dotProduct / lengthProduct;
    }
}
```