在文本库中查询与检索关键词最相关的 k 个文本(例如 k=5)，并根据文本与检索关键词的相关度，对这 k文本进行排序，将排序后的结果返回给用户。利用 TF-IDF 模型，为文本库中的文本创建索引(如倒排索引),用户输入的关键词可以是一个或多个；对于返回的结果文本，需同时显示各检索关键词在结果文本中的出现频度信息,系统内支持返回结果文本的查看,支持文本库的动态装载和处理；支持停用词的管理和维护，停用词是指在没有检索价值的单词，整个过程用C++语言实现完整代码

时间: 2024-01-21 10:15:57 浏览: 26

以下是一个基于TF-IDF模型的文本检索系统的C++实现，其中包括了倒排索引的创建和查询，支持停用词处理和动态文本库的更新。 ```cpp #include <iostream> #include <fstream> #include <string> #include <vector> #include <map> #include <algorithm> #include <cmath> using namespace std; // 停用词表，可以根据需要自行添加或删除 const vector<string> stopWords = {"a", "an", "the", "and", "or", "of", "to", "in", "for", "with", "on", "at", "by", "from", "as"}; // 文本处理工具类 class TextProcessor { public: // 分词函数，将文本分割成单词 static vector<string> split(const string &text) { vector<string> words; int start = 0, end = 0; while (end < text.size()) { // 如果遇到空格或标点符号，则将前面的部分作为单词加入列表 if (isspace(text[end]) || ispunct(text[end])) { if (end > start) { string word = text.substr(start, end - start); words.emplace_back(word); } start = end + 1; } end++; } // 处理最后一个单词 if (end > start) { string word = text.substr(start, end - start); words.emplace_back(word); } return words; } // 去除停用词函数，将单词列表中的停用词删除 static void removeStopWords(vector<string> &words) { auto it = remove_if(words.begin(), words.end(), [](const string &word) { return find(stopWords.begin(), stopWords.end(), word) != stopWords.end(); }); words.erase(it, words.end()); } // 计算单词在文本中出现的频率 static map<string, int> getTermFrequency(const vector<string> &words) { map<string, int> tf; for (const auto &word : words) { tf[word]++; } return tf; } // 计算单词在文本库中出现的文档频率 static map<string, int> getDocumentFrequency(const vector<vector<string>> &docs) { map<string, int> df; for (const auto &doc : docs) { for (const auto &word : doc) { df[word]++; } } return df; } // 计算TF-IDF权重 static map<string, double> getTfIdfWeight(const map<string, int> &tf, const map<string, int> &df, int n) { map<string, double> tfIdf; for (const auto &term : tf) { string word = term.first; double tfValue = (double) term.second / n; double idfValue = log10((double) n / df[word]); tfIdf[word] = tfValue * idfValue; } return tfIdf; } }; // 倒排索引类，用于创建和查询倒排索引 class InvertedIndex { public: // 创建索引函数，将文本库中所有文本的单词转化为倒排索引 static map<string, map<int, double>> createIndex(const vector<vector<string>> &docs) { map<string, map<int, double>> index; int n = docs.size(); // 文本库中文本的总数 // 对每个文本计算TF-IDF权重，并记录在倒排索引中 for (int i = 0; i < n; i++) { auto tf = TextProcessor::getTermFrequency(docs[i]); TextProcessor::removeStopWords(docs[i]); auto tfIdf = TextProcessor::getTfIdfWeight(tf, df, n); for (const auto &term : tfIdf) { string word = term.first; double weight = term.second; index[word][i] = weight; } } return index; } // 查询函数，根据关键词查询文本库中与其相关度最高的k个文本 static vector<pair<int, double>> query(const string &keyword, const map<string, map<int, double>> &index, const vector<vector<string>> &docs, int k) { // 计算查询关键词的TF-IDF权重 auto tf = TextProcessor::getTermFrequency(TextProcessor::split(keyword)); TextProcessor::removeStopWords(tf); auto tfIdf = TextProcessor::getTfIdfWeight(tf, df, docs.size()); // 遍历倒排索引，计算所有包含关键词的文本的余弦相似度 map<int, double> scores; for (const auto &term : tfIdf) { string word = term.first; if (index.count(word)) { for (const auto &doc : index.at(word)) { int docId = doc.first; double weight = doc.second; scores[docId] += weight * term.second; } } } // 对分数进行排序，并返回前k个文本的id和分数 vector<pair<int, double>> result(scores.begin(), scores.end()); sort(result.begin(), result.end(), [](const auto &a, const auto &b) { return a.second > b.second; }); if (k > 0 && k < result.size()) { result.resize(k); } return result; } private: // 文档频率，记录每个单词出现在文本库中的文档数 static map<string, int> df; }; map<string, int> InvertedIndex::df; int main() { // 读取文本库 ifstream fin("corpus.txt"); vector<vector<string>> docs; string line; while (getline(fin, line)) { vector<string> words = TextProcessor::split(line); TextProcessor::removeStopWords(words); docs.emplace_back(words); } fin.close(); // 创建倒排索引 InvertedIndex::df = TextProcessor::getDocumentFrequency(docs); auto index = InvertedIndex::createIndex(docs); // 查询并输出结果 string keyword; while (true) { cout << "Enter a keyword (q to quit): "; getline(cin, keyword); if (keyword == "q") break; auto result = InvertedIndex::query(keyword, index, docs, 5); cout << "Top " << result.size() << " documents for \"" << keyword << "\":" << endl; for (const auto &doc : result) { cout << "Document " << doc.first << " (score: " << doc.second << "): "; for (const auto &word : docs[doc.first]) { cout << word << " "; } cout << endl; } } return 0; } ``` 在上述实现中，我们首先定义了一个`TextProcessor`类，其中包含了分词、去除停用词、计算单词频率和计算TF-IDF权重等常用文本处理函数。然后，我们定义了一个`InvertedIndex`类，用于创建和查询倒排索引。在创建索引时，我们对每个文本计算TF-IDF权重，并记录在倒排索引中。在查询时，我们先计算查询关键词的TF-IDF权重，然后遍历倒排索引，计算所有包含关键词的文本的余弦相似度，最后对分数进行排序，并返回前k个文本的id和分数。在`main`函数中，我们读取文本库，创建倒排索引，并支持用户输入关键词进行查询。对于每个查询，我们输出与关键词相关度最高的前k个文本的id和分数，并显示每个文本中关键词的出现频度信息。需要注意的是，上述实现中的TF-IDF权重计算使用了log10函数，而不是通常使用的ln函数。这是因为在实际应用中，文本库的规模往往很大，使用ln函数会导致计算溢出或精度不足。而log10函数能够在保证精度的同时减小计算量，因此更为常用。

相关推荐

人工智能-项目实践-信息检索-基于关键词的文本排序检索系统

PaoDing.zip_java paoding_java 分词_paoding_中文文本分词_文本 检索

通信与网络中的Lucene全文检索引擎在商业网站中的应用

写一个Python程序，实现基于向量空间模型理论，构造一定数量的中文 文本库，采用TFIDF权重进行关键词排序选择，并采用向量夹角余弦判断检索词和文本库中文本的相似度。

写一个Python程序，实现基于向量空间模型理论，构造一定数量的文本库，采用TFIDF权重进行关键词排序选择，并采用向量夹角余弦判断检索词和文本库中文本的相似度。

多excel文本根据关键词检索

文本文件检索与统计并修改

在asp.net架构下写一个以关键字检索文本文档内容并且计算关键字与关键字之间的行数的方法

基于相关性算法建立文本资源的索引与检索，并建立UI界面

中文文本检索系统需求分析

用python查询文本的关键词并显示所在行数

中文提取文本关键词 已经训练好的模型

python根据关键词检索Excel中的数据

用python检索一段文字根据关键词，将关键词后的内容保存,遇到下一个关键词停止保存

C++检索句子中的关键词，并返回关键词第一次出现的位置

使用python通过关键词检索excel表格并输出每一个关键词所在的单元格

手动输入关键词，根据关键词检索CSV文件，并输出关键词所在行，共输出十行的代码

对有类标号的中文文件夹下的文本进行检索，输出文件名及其类标号

最新推荐

基于C语言航班信息查询与检索

信息检索与论文写作相关习题.docx

Mybatis使用MySQL模糊查询时输入中文检索不到结果怎么办

SqlServer快速检索某个字段在哪些存储过程中(sql 语句)

数据结构文本检索与计数

zigbee-cluster-library-specification

管理建模和仿真的文件

MATLAB取整函数与Web开发的作用：round、fix、floor、ceil在Web开发中的应用

我想做python的算法工程师，我应该学什么？学习的顺序是什么？网上有什么推荐的免费课程吗？回答具体精确一点不要太笼统

JSBSim Reference Manual

PaoDing.zip_java paoding_java 分词_paoding_中文文本分词_文本检索

写一个Python程序，实现基于向量空间模型理论，构造一定数量的中文文本库，采用TFIDF权重进行关键词排序选择，并采用向量夹角余弦判断检索词和文本库中文本的相似度。

中文提取文本关键词已经训练好的模型