TF-IDF算法在搜索模块中的Java实现
时间: 2023-12-28 22:03:52 浏览: 23
TF-IDF算法是一种常用的文本相似度计算方法,常用于信息检索领域。下面是在搜索模块中使用Java实现TF-IDF算法的基本步骤:
1. 预处理:对文本进行分词,去除停用词(如“的”、“是”、“在”等无实际意义的词)和标点符号等。
2. 计算词频(TF):对于每个文档,统计每个词出现的次数,得到该词的词频。
3. 计算逆文档频率(IDF):对于每个词,统计出现该词的文档数,计算该词的逆文档频率。
4. 计算TF-IDF值:将TF和IDF相乘,得到每个词的TF-IDF值。
5. 计算文档相似度:对于每个查询,计算查询词的TF-IDF值,然后计算查询词与每个文档的余弦相似度,得到文档与查询的相似度。
下面是Java实现的示例代码:
```java
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
public class TFIDF {

    // Stop words to drop during preprocessing. A HashSet gives O(1) lookup
    // instead of the original O(m) inner loop per token.
    private static final Set<String> STOP_WORDS =
            new HashSet<>(Arrays.asList("的", "是", "在", "等"));

    /**
     * Reads an entire text file into a single string.
     *
     * <p>Lines are joined with a single space: the original {@code content += line}
     * dropped line breaks entirely, fusing the last word of one line with the
     * first word of the next. Reads as UTF-8 explicitly instead of the
     * platform-default charset, and uses try-with-resources so the reader is
     * closed even on failure.
     *
     * @param fileName path of the file to read
     * @return file contents with newlines replaced by spaces; empty string on I/O error
     */
    public static String readFile(String fileName) {
        StringBuilder content = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (content.length() > 0) {
                    content.append(' ');
                }
                content.append(line);
            }
        } catch (IOException e) {
            // Best-effort, matching the original behavior: log and return what we have.
            e.printStackTrace();
        }
        return content.toString();
    }

    /**
     * Splits text on whitespace, strips punctuation, and lower-cases each token.
     *
     * <p>Uses {@code (?U)} so {@code \W} is Unicode-aware: the original ASCII-only
     * {@code \W} deleted every CJK character, which made the Chinese stop-word
     * list in {@link #removeStopWords(List)} unreachable. Tokens that become
     * empty after cleaning (pure punctuation) are dropped instead of kept as "".
     *
     * @param text raw document text
     * @return cleaned, lower-cased tokens in order of appearance
     */
    public static List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<>();
        for (String word : text.split("\\s+")) {
            String token = word.replaceAll("(?U)\\W", "").toLowerCase(Locale.ROOT);
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /**
     * Removes stop words from a token list.
     *
     * @param tokens tokens produced by {@link #tokenize(String)}
     * @return a new list containing only non-stop-word tokens, order preserved
     */
    public static List<String> removeStopWords(List<String> tokens) {
        List<String> filteredTokens = new ArrayList<>();
        for (String token : tokens) {
            if (!STOP_WORDS.contains(token)) {
                filteredTokens.add(token);
            }
        }
        return filteredTokens;
    }

    /**
     * Counts how many times each token occurs in one document (raw term frequency).
     *
     * @param tokens the document's tokens
     * @return map from token to occurrence count
     */
    public static Map<String, Integer> calculateTermFrequency(List<String> tokens) {
        Map<String, Integer> termFrequency = new HashMap<>();
        for (String token : tokens) {
            // merge() replaces the containsKey/get/put dance with one atomic update.
            termFrequency.merge(token, 1, Integer::sum);
        }
        return termFrequency;
    }

    /**
     * Computes IDF = ln(N / df) for every word in the corpus, where N is the
     * number of documents and df the number of documents containing the word.
     *
     * <p>Note: a word present in every document gets IDF 0, so it contributes
     * nothing to similarity — standard (unsmoothed) TF-IDF behavior.
     *
     * @param documents the corpus, one token list per document
     * @return map from word to its inverse document frequency
     */
    public static Map<String, Double> calculateInverseDocumentFrequency(List<List<String>> documents) {
        Map<String, Double> inverseDocumentFrequency = new HashMap<>();
        int numDocuments = documents.size();
        for (List<String> document : documents) {
            // Deduplicate first: document frequency counts documents, not occurrences.
            for (String word : new HashSet<>(document)) {
                inverseDocumentFrequency.merge(word, 1.0, Double::sum);
            }
        }
        // Second pass: convert document counts into IDF values in place.
        for (Map.Entry<String, Double> entry : inverseDocumentFrequency.entrySet()) {
            entry.setValue(Math.log(numDocuments / entry.getValue()));
        }
        return inverseDocumentFrequency;
    }

    /**
     * Computes the TF-IDF weight (tf × idf) for every term of one document.
     *
     * <p>Terms missing from the IDF map (e.g. query words never seen in the
     * corpus) get weight 0 instead of throwing an NPE as the original did.
     *
     * @param termFrequency            per-document term counts
     * @param inverseDocumentFrequency corpus-wide IDF values
     * @return map from term to its TF-IDF weight
     */
    public static Map<String, Double> calculateTFIDF(Map<String, Integer> termFrequency,
                                                     Map<String, Double> inverseDocumentFrequency) {
        Map<String, Double> tfidf = new HashMap<>();
        for (Map.Entry<String, Integer> entry : termFrequency.entrySet()) {
            double idf = inverseDocumentFrequency.getOrDefault(entry.getKey(), 0.0);
            tfidf.put(entry.getKey(), entry.getValue() * idf);
        }
        return tfidf;
    }

    /**
     * Cosine similarity between two sparse TF-IDF vectors.
     *
     * <p>Returns 0.0 when either vector has zero norm (empty map, or all
     * weights zero) instead of the NaN the original produced via 0/0.
     *
     * @param tfidf1 first vector, term → weight
     * @param tfidf2 second vector, term → weight
     * @return similarity in [0, 1] for non-negative weights; 0.0 for a zero vector
     */
    public static double cosineSimilarity(Map<String, Double> tfidf1, Map<String, Double> tfidf2) {
        double dotProduct = 0.0;
        double norm1 = 0.0;
        double norm2 = 0.0;
        for (Map.Entry<String, Double> entry : tfidf1.entrySet()) {
            double w = entry.getValue();
            Double other = tfidf2.get(entry.getKey());
            if (other != null) {
                dotProduct += w * other;
            }
            norm1 += w * w;
        }
        for (double w : tfidf2.values()) {
            norm2 += w * w;
        }
        double denominator = Math.sqrt(norm1) * Math.sqrt(norm2);
        // Guard the 0/0 case so callers never see NaN.
        return denominator == 0.0 ? 0.0 : dotProduct / denominator;
    }

    /**
     * Demo: reads two documents, builds TF-IDF vectors over that two-document
     * corpus, and prints their cosine similarity.
     */
    public static void main(String[] args) {
        // Read and preprocess both documents.
        List<String> filteredTokens1 = removeStopWords(tokenize(readFile("document1.txt")));
        List<String> filteredTokens2 = removeStopWords(tokenize(readFile("document2.txt")));
        // Per-document term frequencies.
        Map<String, Integer> termFrequency1 = calculateTermFrequency(filteredTokens1);
        Map<String, Integer> termFrequency2 = calculateTermFrequency(filteredTokens2);
        // Corpus-wide inverse document frequencies.
        List<List<String>> documents = new ArrayList<>();
        documents.add(filteredTokens1);
        documents.add(filteredTokens2);
        Map<String, Double> inverseDocumentFrequency = calculateInverseDocumentFrequency(documents);
        // TF-IDF vectors and their similarity.
        Map<String, Double> tfidf1 = calculateTFIDF(termFrequency1, inverseDocumentFrequency);
        Map<String, Double> tfidf2 = calculateTFIDF(termFrequency2, inverseDocumentFrequency);
        System.out.println("相似度:" + cosineSimilarity(tfidf1, tfidf2));
    }
}
```
注意:在实际使用中,需要对搜索关键词进行分词、去除停用词等预处理,然后计算关键词的TF-IDF值,再与文档进行比较,得到相似度排名。