java校验中文敏感词
时间: 2023-12-22 14:57:23 浏览: 145
Java做敏感词检测
在Java中校验中文敏感词可以使用正则表达式或者基于AC自动机的敏感词过滤算法。
1. 正则表达式
使用正则表达式匹配中文敏感词时,可以借助Java自带的Pattern和Matcher类:先将各个敏感词用"|"连接成一个正则表达式,再用它去匹配要校验的文本。如果敏感词中可能包含正则元字符(例如"."、"*"、"("),应先用Pattern.quote对每个词进行转义。
示例代码:
```java
// Sensitive words are kept as plain literals; each one is escaped with
// Pattern.quote before being joined, so a word that happens to contain a
// regex metacharacter (".", "*", "(", "|", ...) is matched literally instead
// of being interpreted as regex syntax or causing a PatternSyntaxException.
String[] sensitiveWords = {"敏感词1", "敏感词2", "敏感词3"};
StringBuilder regex = new StringBuilder();
for (String word : sensitiveWords) {
    if (regex.length() > 0) {
        regex.append('|');
    }
    regex.append(Pattern.quote(word));
}

String text = "这是一段包含敏感词2的文本";
// Compile once; in real code this Pattern should be cached and reused.
Pattern pattern = Pattern.compile(regex.toString());
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
    // group() is the first sensitive word found in the text.
    System.out.println("文本包含敏感词:" + matcher.group());
} else {
    System.out.println("文本不包含敏感词");
}
```
2. AC自动机
使用AC自动机算法可以实现高效的敏感词过滤。该算法的基本思路是先将所有敏感词构建成一棵Trie树,再为每个节点计算Fail指针(指向当前路径的最长可匹配后缀所对应的节点)。匹配时沿Trie树逐字符前进;如果当前字符匹配失败,则沿Fail指针跳转后继续匹配,文本指针无需回退,从而只需扫描一遍文本即可检测所有敏感词。
示例代码:
```java
/**
 * Sensitive-word detector based on an Aho-Corasick automaton.
 *
 * <p>All words are inserted into a trie, then every node receives a fail
 * pointer to the node spelling the longest proper suffix of its path that is
 * also a path in the trie. Scanning a text then needs a single left-to-right
 * pass regardless of how many words are registered.
 *
 * <p>Matching works on Java {@code char} units and is case-sensitive.
 * The automaton is built once in the constructor and not modified afterwards.
 */
public class SensitiveWordsFilter {
    /** Root of the trie; represents the empty prefix and is never terminal. */
    private final TrieNode root;

    /**
     * Builds the automaton for the given words.
     *
     * @param words sensitive words to detect; a {@code null} list is treated as
     *              empty, and {@code null} or empty entries are ignored
     */
    public SensitiveWordsFilter(List<String> words) {
        root = new TrieNode(null);
        if (words != null) {
            for (String word : words) {
                // An empty word would mark the root as terminal and make every
                // non-matching character look like a hit, so skip it.
                if (word != null && !word.isEmpty()) {
                    addWord(word);
                }
            }
        }
        buildFailPointers();
    }

    /**
     * Inserts one word into the trie, creating nodes as needed, and marks the
     * node of its last character as terminal.
     */
    private void addWord(String word) {
        TrieNode node = root;
        for (char c : word.toCharArray()) {
            TrieNode child = node.getChild(c);
            if (child == null) {
                child = new TrieNode(c);
                node.addChild(child);
            }
            node = child;
        }
        node.setEnd(true);
    }

    /**
     * Computes fail pointers breadth-first and propagates the terminal flag
     * along them, so that a node is terminal whenever any suffix of its path
     * is a registered word.
     */
    private void buildFailPointers() {
        Queue<TrieNode> queue = new LinkedList<>();
        root.setFail(null);
        queue.offer(root);
        while (!queue.isEmpty()) {
            TrieNode node = queue.poll();
            for (TrieNode child : node.getChildren()) {
                // Depth-1 nodes, and nodes with no matching suffix, fall back to root.
                TrieNode target = root;
                if (node != root) {
                    TrieNode fail = node.getFail();
                    while (fail != null) {
                        TrieNode failChild = fail.getChild(child.getChar());
                        if (failChild != null) {
                            target = failChild;
                            break;
                        }
                        fail = fail.getFail();
                    }
                }
                child.setFail(target);
                // The fail target is strictly shallower, so BFS has already
                // finalized its terminal flag; inheriting it here lets the
                // scan detect words that end inside a longer trie path
                // (e.g. "bc" while walking the path for "abcd").
                if (target.isEnd()) {
                    child.setEnd(true);
                }
                queue.offer(child);
            }
        }
    }

    /**
     * Reports whether the text contains at least one registered word.
     *
     * @param text text to scan; {@code null} or empty yields {@code false}
     * @return {@code true} as soon as any registered word is found as a
     *         substring of {@code text}, otherwise {@code false}
     */
    public boolean containsSensitiveWords(String text) {
        if (text == null || text.isEmpty()) {
            return false;
        }
        TrieNode node = root;
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            // Follow fail links until a node can consume c. Stop at the root
            // instead of walking past it: the root's fail pointer is null.
            while (node != root && !node.hasChild(c)) {
                node = node.getFail();
            }
            TrieNode next = node.getChild(c);
            // If even the root cannot consume c, restart from the root.
            node = (next != null) ? next : root;
            if (node.isEnd()) {
                return true;
            }
        }
        return false;
    }

    /** One trie node: the character on the incoming edge, children, fail link and terminal flag. */
    private static class TrieNode {
        /** Character on the edge leading to this node; {@code null} for the root. */
        private final Character c;
        private final Map<Character, TrieNode> children;
        /** Fail link; {@code null} only for the root. */
        private TrieNode fail;
        /** True when the path to this node, or some suffix of it, is a registered word. */
        private boolean end;

        public TrieNode(Character c) {
            this.c = c;
            this.children = new HashMap<>();
            this.end = false;
        }

        public Character getChar() {
            return c;
        }

        public void addChild(TrieNode child) {
            children.put(child.getChar(), child);
        }

        public TrieNode getChild(Character c) {
            return children.get(c);
        }

        public boolean hasChild(Character c) {
            return children.containsKey(c);
        }

        public void setFail(TrieNode fail) {
            this.fail = fail;
        }

        public TrieNode getFail() {
            return fail;
        }

        public void setEnd(boolean end) {
            this.end = end;
        }

        public boolean isEnd() {
            return end;
        }

        public Collection<TrieNode> getChildren() {
            return children.values();
        }
    }
}
```
使用示例:
```java
// Build the filter once from the word list, then reuse it for every text.
List<String> sensitiveWords = Arrays.asList("敏感词1", "敏感词2", "敏感词3");
SensitiveWordsFilter filter = new SensitiveWordsFilter(sensitiveWords);

String text = "这是一段包containing".isEmpty() ? "" : "这是一段包含敏感词2的文本";
// Pick the message from the scan result and print it.
boolean found = filter.containsSensitiveWords(text);
String message = found ? "文本包含敏感词" : "文本不包含敏感词";
System.out.println(message);
```
阅读全文