Java使用Apriori算法进行关联分析
时间: 2023-09-15 16:20:01 浏览: 164
Apriori算法是一种基于频繁项集的关联规则挖掘算法,它可以用来发现事物之间的相关性,并且在市场营销、销售预测、商品推荐等领域都有广泛的应用。
Java中实现Apriori算法主要分为以下步骤:
1. 数据预处理:将原始数据转化为事务集合形式,每个事务包含多个项,项之间用逗号或其他符号分隔。
2. 建立候选项集:扫描事务数据集,将出现过的每个项各自作为一个大小为1的候选项集(最小支持度阈值在下一步筛选频繁项集时才使用)。
3. 频繁项集生成:根据候选项集和事务数据集,计算每个项集的支持度,并根据最小支持度阈值筛选出频繁项集。
4. 关联规则挖掘:根据频繁项集,生成关联规则,并计算每个规则的置信度和支持度。
以下是一个简单的Java代码实现(编译时需要在文件开头导入 java.util.* 和 java.util.stream.Collectors):
```
/**
 * A small, self-contained Apriori implementation: frequent-itemset mining
 * followed by association-rule generation.
 *
 * <p>Transactions and itemsets are represented as {@code Set<String>}. Support
 * thresholds are fractions of the number of transactions (for example
 * {@code 0.4} means "appears in at least 40% of the transactions").
 *
 * <p>The class uses {@code ArrayList}, {@code Arrays}, {@code HashMap},
 * {@code HashSet}, {@code List}, {@code Map} and {@code Set} from
 * {@code java.util}; these must be imported by the enclosing file.
 */
public class Apriori {

    /**
     * Builds the candidate (k+1)-itemsets from a list of frequent k-itemsets.
     *
     * <p>Two itemsets are joined when their union has exactly one more item
     * than the first of them. Each distinct union is returned at most once,
     * and a union is dropped when any of its subsets obtained by removing a
     * single item is missing from {@code frequentItemSets} (the Apriori prune
     * step: a superset of an infrequent itemset cannot be frequent).
     *
     * @param frequentItemSets the frequent itemsets of the previous level
     * @return the distinct candidate itemsets for the next level; empty when
     *         no join is possible
     */
    public static List<Set<String>> candidateSet(List<Set<String>> frequentItemSets) {
        List<Set<String>> candidateSets = new ArrayList<>();
        // Lookup structure for the prune step.
        Set<Set<String>> knownFrequent = new HashSet<>(frequentItemSets);
        // Unions already examined; the same union is usually produced by several pairs.
        Set<Set<String>> examined = new HashSet<>();

        for (int i = 0; i < frequentItemSets.size(); i++) {
            Set<String> set1 = frequentItemSets.get(i);
            for (int j = i + 1; j < frequentItemSets.size(); j++) {
                Set<String> union = new HashSet<>(set1);
                union.addAll(frequentItemSets.get(j));
                if (union.size() != set1.size() + 1) {
                    continue;
                }
                // add() returns false for a union that was already examined,
                // which keeps duplicates out of the result.
                if (examined.add(union) && allReducedSubsetsPresent(union, knownFrequent)) {
                    candidateSets.add(union);
                }
            }
        }
        return candidateSets;
    }

    /** Returns true when every subset of {@code candidate} with one item removed is in {@code known}. */
    private static boolean allReducedSubsetsPresent(Set<String> candidate, Set<Set<String>> known) {
        for (String item : candidate) {
            Set<String> reduced = new HashSet<>(candidate);
            reduced.remove(item);
            if (!known.contains(reduced)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Counts the transactions that contain every item of {@code itemSet}.
     *
     * @param transactions the transaction database
     * @param itemSet the itemset to look for; an empty itemset is contained in
     *        every transaction
     * @return the absolute support count
     */
    public static int supportCount(List<Set<String>> transactions, Set<String> itemSet) {
        int count = 0;
        for (Set<String> transaction : transactions) {
            if (transaction.containsAll(itemSet)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Mines all frequent itemsets level by level.
     *
     * @param transactions the transaction database
     * @param minSupport minimum support as a fraction of
     *        {@code transactions.size()}
     * @return every itemset whose support is at least {@code minSupport},
     *         ordered by level (all 1-itemsets first, then 2-itemsets, ...);
     *         empty when there are no transactions
     */
    public static List<Set<String>> frequentItemSet(List<Set<String>> transactions, double minSupport) {
        List<Set<String>> frequentItemSets = new ArrayList<>();
        if (transactions.isEmpty()) {
            // Nothing to mine; also avoids dividing by zero below.
            return frequentItemSets;
        }

        // Level 1: count every single item.
        Map<Set<String>, Integer> itemSetCount = new HashMap<>();
        for (Set<String> transaction : transactions) {
            for (String item : transaction) {
                Set<String> single = new HashSet<>();
                single.add(item);
                itemSetCount.merge(single, 1, Integer::sum);
            }
        }
        List<Set<String>> lastItemSets = selectFrequent(itemSetCount, transactions.size(), minSupport);

        // Level k+1: join the previous level, count, filter; stop when a level is empty.
        while (!lastItemSets.isEmpty()) {
            frequentItemSets.addAll(lastItemSets);
            itemSetCount = new HashMap<>();
            for (Set<String> candidate : candidateSet(lastItemSets)) {
                int count = supportCount(transactions, candidate);
                // Itemsets that never occur are not recorded, so they are never
                // reported as frequent even when minSupport is 0 or negative.
                if (count > 0) {
                    itemSetCount.put(candidate, count);
                }
            }
            lastItemSets = selectFrequent(itemSetCount, transactions.size(), minSupport);
        }
        return frequentItemSets;
    }

    /** Returns the itemsets in {@code counts} whose relative support reaches {@code minSupport}. */
    private static List<Set<String>> selectFrequent(
            Map<Set<String>, Integer> counts, int transactionCount, double minSupport) {
        List<Set<String>> selected = new ArrayList<>();
        for (Map.Entry<Set<String>, Integer> entry : counts.entrySet()) {
            double support = (double) entry.getValue() / transactionCount;
            if (support >= minSupport) {
                selected.add(entry.getKey());
            }
        }
        return selected;
    }

    /**
     * Generates association rules {@code antecedent => consequent} from the
     * frequent itemsets.
     *
     * <p>For every frequent itemset with at least two items, each non-empty
     * proper subset is tried as antecedent and the remaining items form the
     * consequent. Confidence is
     * {@code supportCount(itemset) / supportCount(antecedent)}.
     *
     * @param transactions the transaction database
     * @param minSupport minimum support (fraction) for the underlying itemsets
     * @param minConfidence minimum confidence a rule needs to be returned
     * @return the rules whose confidence is at least {@code minConfidence};
     *         neither side of a returned rule is empty
     */
    public static List<Rule> associationRules(List<Set<String>> transactions, double minSupport, double minConfidence) {
        List<Rule> rules = new ArrayList<>();
        for (Set<String> frequentItemSet : frequentItemSet(transactions, minSupport)) {
            if (frequentItemSet.size() < 2) {
                continue;
            }
            // Invariant for all rules derived from this itemset.
            int itemSetSupport = supportCount(transactions, frequentItemSet);
            for (Set<String> antecedent : getSubSets(frequentItemSet)) {
                // Skip the empty set and the full itemset: they would yield the
                // degenerate rules "[] => X" and "X => []".
                if (antecedent.isEmpty() || antecedent.size() == frequentItemSet.size()) {
                    continue;
                }
                Set<String> consequent = new HashSet<>(frequentItemSet);
                consequent.removeAll(antecedent);
                double confidence = (double) itemSetSupport / supportCount(transactions, antecedent);
                if (confidence >= minConfidence) {
                    rules.add(new Rule(antecedent, consequent, confidence));
                }
            }
        }
        return rules;
    }

    /**
     * Returns all subsets of {@code itemSet}, including the empty set and the
     * set itself (2^n subsets for n items).
     *
     * @param itemSet the set to enumerate; it is not modified
     * @return a list of newly created sets, one per subset
     */
    public static List<Set<String>> getSubSets(Set<String> itemSet) {
        List<Set<String>> subSets = new ArrayList<>();
        subSets.add(new HashSet<>());
        for (String item : itemSet) {
            // Every subset found so far also exists in a variant that contains 'item'.
            int existing = subSets.size();
            for (int i = 0; i < existing; i++) {
                Set<String> extended = new HashSet<>(subSets.get(i));
                extended.add(item);
                subSets.add(extended);
            }
        }
        return subSets;
    }

    /** An association rule {@code antecedent => consequent} with its confidence. */
    public static class Rule {
        private final Set<String> antecedent;
        private final Set<String> consequent;
        private final double confidence;

        /**
         * @param antecedent the left-hand side of the rule
         * @param consequent the right-hand side of the rule
         * @param confidence the confidence of the rule
         */
        public Rule(Set<String> antecedent, Set<String> consequent, double confidence) {
            this.antecedent = antecedent;
            this.consequent = consequent;
            this.confidence = confidence;
        }

        /** Returns the left-hand side of the rule. */
        public Set<String> getAntecedent() {
            return antecedent;
        }

        /** Returns the right-hand side of the rule. */
        public Set<String> getConsequent() {
            return consequent;
        }

        /** Returns the confidence of the rule. */
        public double getConfidence() {
            return confidence;
        }

        @Override
        public String toString() {
            return antecedent + " => " + consequent + " (confidence: " + confidence + ")";
        }
    }

    /** Demo: mines rules from five sample transactions and prints them. */
    public static void main(String[] args) {
        List<Set<String>> transactions = new ArrayList<>();
        transactions.add(new HashSet<>(Arrays.asList("A", "B", "C")));
        transactions.add(new HashSet<>(Arrays.asList("A", "C", "D", "E")));
        transactions.add(new HashSet<>(Arrays.asList("A", "C", "E", "F")));
        transactions.add(new HashSet<>(Arrays.asList("B", "C", "E")));
        transactions.add(new HashSet<>(Arrays.asList("B", "D", "E", "F")));
        double minSupport = 0.4;
        double minConfidence = 0.7;
        List<Rule> rules = associationRules(transactions, minSupport, minConfidence);
        rules.forEach(System.out::println);
    }
}
```
以上代码实现了Apriori算法中的候选项集计算、支持度计算、频繁项集计算和关联规则挖掘等步骤。你可以根据自己的需求进行调整和修改。
阅读全文