Lucene in Java: removing stop words and stemming
Sure, you can use Lucene's analyzers and token filters to remove stop words and apply stemming. Here is an example:
```
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.io.StringReader;

public class LuceneStemmingDemo {

    public static void main(String[] args) throws IOException {
        // Original text
        String text = "Lucene is a Java full-text search engine. " +
                "Lucene is not a complete application, but rather a code library " +
                "and API that can easily be used to add search capabilities to applications.";

        // Create the analyzer (recent Lucene versions no longer take a Version argument)
        Analyzer analyzer = new StandardAnalyzer();

        // Create the TokenStream
        StringReader reader = new StringReader(text);
        TokenStream tokenStream = analyzer.tokenStream("", reader);

        // Add filters
        tokenStream = new StopFilter(tokenStream, EnglishAnalyzer.getDefaultStopSet()); // remove stop words
        tokenStream = new PorterStemFilter(tokenStream);                                // apply Porter stemming

        // Term text attribute
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        // Offset attribute
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        // Token type attribute
        TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

        // Iterate over the TokenStream; each token has already been
        // stop-filtered and stemmed by the filter chain above
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.println("token: " + charTermAttribute.toString() +
                    ", start offset: " + offsetAttribute.startOffset() +
                    ", end offset: " + offsetAttribute.endOffset() +
                    ", type: " + typeAttribute.type());
        }
        tokenStream.end();
        tokenStream.close();
        analyzer.close();
    }
}
```
In the code above, `StandardAnalyzer` performs tokenization, `EnglishAnalyzer.getDefaultStopSet()` supplies the stop-word set for the `StopFilter`, and `PorterStemFilter` applies Porter stemming. Because stemming happens inside the filter chain, the terms printed in the loop are already stemmed. Finally, remember to close the `TokenStream` and the analyzer.
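If you want the same processing at index and query time, the more idiomatic approach is to wire these filters into a custom `Analyzer` rather than wrapping the token stream by hand. Below is a minimal sketch, assuming Lucene 6.x APIs; the class name `StopStemAnalyzer` is just an illustrative choice:
```
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class StopStemAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Tokenize on word boundaries
        Tokenizer tokenizer = new StandardTokenizer();
        // Lowercase -> remove English stop words -> Porter stemming
        TokenStream stream = new LowerCaseFilter(tokenizer);
        stream = new StopFilter(stream, EnglishAnalyzer.getDefaultStopSet());
        stream = new PorterStemFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
    }
}
```
An instance of this analyzer can then be passed to an `IndexWriterConfig` or used with `analyzer.tokenStream(...)` exactly like the `StandardAnalyzer` above, so indexing and searching share the same stop-word and stemming rules. Alternatively, Lucene's built-in `EnglishAnalyzer` already combines stop-word removal with Porter stemming and may cover the same need without custom code.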