Apriori算法Python实现(含数据集)
时间: 2023-11-10 13:03:00 浏览: 43
以下是Apriori算法的Python实现,包含一个数据集:
```python
# 导入必要的库
from itertools import chain, combinations
from collections import defaultdict
def generate_candidates(itemset, k):
    """Build the candidate itemsets of size ``k`` (C_k).

    Every ordered pair of members of ``itemset`` is unioned, and a union is
    kept only when it contains exactly ``k`` items.

    Args:
        itemset: Collection of frozenset-like itemsets to join with itself.
        k: Required size of each generated candidate.

    Returns:
        A set holding the distinct unions of size ``k``.
    """
    joined = set()
    for left in itemset:
        for right in itemset:
            merged = left.union(right)
            if len(merged) == k:
                joined.add(merged)
    return joined
def calculate_support(dataset, candidates, min_support):
    """Compute candidate supports and keep those meeting ``min_support``.

    A candidate is counted once for every transaction it is a subset of; its
    support is that count divided by the number of transactions.

    Args:
        dataset: Sized collection of transactions (each an iterable of items).
        candidates: Iterable of frozenset-like candidate itemsets.
        min_support: Minimum support (fraction of transactions) to be kept.

    Returns:
        A ``(frequent_items, support_data)`` tuple: the list of candidates
        whose support is at least ``min_support``, and a dict mapping each of
        those candidates to its support.
    """
    occurrences = {}
    for transaction in dataset:
        for candidate in candidates:
            if not candidate.issubset(transaction):
                continue
            occurrences[candidate] = occurrences.get(candidate, 0) + 1

    total = float(len(dataset))
    # Candidates that never occur are absent from ``occurrences``, so an empty
    # dataset never reaches the division.
    support_data = {
        candidate: hits / total
        for candidate, hits in occurrences.items()
        if hits / total >= min_support
    }
    frequent_items = list(support_data)
    return frequent_items, support_data
def generate_frequent_items(dataset, k, min_support):
    """Return the frequent itemsets of size ``k`` (L_k) and their supports.

    The frequent 1-itemsets are counted directly from the transactions. For
    ``k > 1`` the function then works level by level: candidates of the next
    size are built from the current frequent itemsets with
    ``generate_candidates`` and filtered with ``calculate_support``, stopping
    once level ``k`` is reached or a level has no frequent itemsets.

    Args:
        dataset: Sized collection of transactions, each an iterable of
            hashable items.
        k: Size of the itemsets to return.
        min_support: Minimum support (fraction of transactions) an itemset
            needs in order to count as frequent.

    Returns:
        A ``(frequent_items, support_data)`` tuple. ``frequent_items`` is a
        list of frozensets of size ``k`` (empty when there are none, or when
        ``k < 1``). ``support_data`` maps every frequent itemset found at
        levels 1 through ``k`` to its support.
    """
    if k < 1:
        return [], {}

    # Level 1. ``dict.fromkeys`` drops repeated items within one transaction
    # (keeping first-seen order), so support is the fraction of transactions
    # containing the item and can never exceed 1.0.
    item_counts = defaultdict(int)
    for transaction in dataset:
        for item in dict.fromkeys(transaction):
            item_counts[item] += 1
    num_transactions = float(len(dataset))
    frequent_items = []
    support_data = {}
    for item, count in item_counts.items():
        support = count / num_transactions
        if support >= min_support:
            single = frozenset([item])
            frequent_items.append(single)
            support_data[single] = support

    # Levels 2..k. Stop at the requested size instead of looping until the
    # frequent set is empty, which would always discard the result.
    size = 2
    while size <= k and frequent_items:
        candidates = generate_candidates(frequent_items, size)
        frequent_items, level_support = calculate_support(
            dataset, candidates, min_support
        )
        support_data.update(level_support)
        size += 1
    return frequent_items, support_data
# Sample transaction database: each inner list is one transaction.
dataset = [
    ['milk', 'bread', 'butter', 'beer'],
    ['bread', 'butter', 'cheese'],
    ['milk', 'bread', 'butter', 'cheese'],
    ['milk', 'butter', 'cheese'],
    ['milk', 'bread', 'cheese'],
]
# Thresholds: minimum support and minimum confidence.
# min_confidence is defined here but not used anywhere in this snippet.
min_support, min_confidence = 0.4, 0.7
# Mine the frequent itemsets for k = 1 together with their supports, then print both.
frequent_items, support_data = generate_frequent_items(dataset, k=1, min_support=min_support)
print("频繁项集:", frequent_items)
print("支持度:", support_data)
```
以上代码输出:
```
频繁项集: [frozenset({'milk'}), frozenset({'bread'}), frozenset({'butter'}), frozenset({'cheese'})]
支持度: {frozenset({'milk'}): 0.8, frozenset({'bread'}): 0.8, frozenset({'butter'}): 0.8, frozenset({'cheese'}): 0.8}
```
可以看到,由于调用时 k=1,代码生成的是频繁1-项集及其支持度;'beer' 只出现在 5 条事务中的 1 条,支持度为 0.2,低于最小支持度 0.4,因此被过滤掉。