python实现apriori算法,要求能够输入最小支持度和划分最大频繁集数目
时间: 2024-01-22 19:20:36 浏览: 74
由Python实现的频繁项集挖掘Apriori算法
5星 · 资源好评率100%
好的,以下是Python实现Apriori算法的代码,可以根据输入的最小支持度和划分最大频繁集数目来进行数据挖掘:
```python
def load_data_set():
    """Return the sample transaction database used for the demo.

    Each inner list is one transaction: a list of item labels.
    """
    transactions = [
        ['I1', 'I2', 'I5'],
        ['I2', 'I4'],
        ['I2', 'I3'],
        ['I1', 'I2', 'I4'],
        ['I1', 'I3'],
        ['I2', 'I3'],
        ['I1', 'I3'],
        ['I1', 'I2', 'I3', 'I5'],
        ['I1', 'I2', 'I3'],
    ]
    return transactions
def create_C1(data_set):
    """Build the candidate 1-itemsets C1.

    Returns a set of single-element frozensets, one per distinct item
    appearing anywhere in the transaction database.
    """
    return {
        frozenset([item])
        for transaction in data_set
        for item in transaction
    }
def is_apriori(Ck_item, Lk_sub1):
    """Check the Apriori property for a candidate itemset.

    A candidate k-itemset can only be frequent if every one of its
    (k-1)-subsets is already in the frequent set collection Lk_sub1.
    """
    return all(
        Ck_item - frozenset([element]) in Lk_sub1
        for element in Ck_item
    )
def create_Ck(Lk_sub1, k):
    """Generate candidate k-itemsets from the frequent (k-1)-itemsets.

    Join step: merge two (k-1)-itemsets whose first k-2 sorted items
    agree (for k=2 the prefix is empty, so every pair joins).
    Prune step: drop any candidate with an infrequent (k-1)-subset.
    """
    candidates = set()
    prev_level = list(Lk_sub1)
    for idx, first in enumerate(prev_level):
        for second in prev_level[idx + 1:]:
            a = sorted(first)
            b = sorted(second)
            if a[:k - 2] == b[:k - 2]:
                union = first | second
                # Keep the union only if it satisfies the Apriori property.
                if is_apriori(union, Lk_sub1):
                    candidates.add(union)
    return candidates
def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """Filter candidate itemsets Ck down to the frequent itemsets Lk.

    Counts how many transactions contain each candidate, keeps those whose
    relative support meets min_support, and (side effect) records the
    support of every surviving itemset in support_data.
    """
    counts = {}
    for transaction in data_set:
        for candidate in Ck:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    total = float(len(data_set))
    Lk = set()
    for candidate, occurrences in counts.items():
        support = occurrences / total
        if support >= min_support:
            Lk.add(candidate)
            support_data[candidate] = support
    return Lk
def generate_L(data_set, k, min_support):
    """Mine all frequent itemsets of size 1..k.

    Args:
        data_set: list of transactions (each a list of items).
        k: maximum itemset size to mine.
        min_support: minimum relative support threshold in [0, 1].

    Returns:
        (L, support_data) where L[i] is the set of frequent (i+1)-itemsets
        and support_data maps each frequent itemset to its support.
    """
    support_data = {}
    C1 = create_C1(data_set)
    L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)
    Lk_sub1 = L1.copy()
    L = [Lk_sub1]
    for i in range(2, k + 1):
        Ci = create_Ck(Lk_sub1, i)
        Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)
        if not Li:
            # By the Apriori property no larger itemset can be frequent once a
            # level is empty; the original kept looping and padded L with
            # empty levels, wasting work. Stop early instead.
            break
        Lk_sub1 = Li.copy()
        L.append(Lk_sub1)
    return L, support_data
def generate_rules(L, support_data, min_confidence=0.7):
    """Generate association rules from the mined frequent itemsets.

    Args:
        L: list of levels; L[i] holds the frequent (i+1)-itemsets.
        support_data: dict mapping frequent itemsets to their support.
        min_confidence: minimum confidence threshold for a rule.

    Returns:
        List of (antecedent, consequent, confidence) tuples; rules are also
        printed as they are found (via calc_confidence).
    """
    big_rules_list = []
    for i in range(1, len(L)):
        for freq_set in L[i]:
            H1 = [frozenset([item]) for item in freq_set]
            # Bug fix: always evaluate single-item consequents. The original
            # skipped calc_confidence for itemsets larger than two items
            # (rules_from_conseq only starts at 2-item consequents), so
            # valid rules like {A,B} --> {C} were silently lost.
            calc_confidence(freq_set, H1, support_data, big_rules_list, min_confidence)
            if i > 1:
                rules_from_conseq(freq_set, H1, support_data, big_rules_list, min_confidence)
    return big_rules_list
def calc_confidence(freq_set, H, support_data, big_rules_list, min_confidence):
    """Evaluate candidate consequents of freq_set against min_confidence.

    For each consequent in H computes
    conf(antecedent -> consequent) = support(freq_set) / support(antecedent),
    prints and records (side effect on big_rules_list) every rule that meets
    the threshold, and returns the list of consequents that survived.
    """
    pruned_H = []
    for conseq in H:
        antecedent = freq_set - conseq
        conf = support_data[freq_set] / support_data[antecedent]
        if conf < min_confidence:
            continue
        print(antecedent, "-->", conseq, "conf:", conf)
        big_rules_list.append((antecedent, conseq, conf))
        pruned_H.append(conseq)
    return pruned_H
def rules_from_conseq(freq_set, H, support_data, big_rules_list, min_confidence):
    """Recursively grow rule consequents for a frequent itemset.

    H holds equally-sized candidate consequents (size m). Merges them into
    (m+1)-item consequents, keeps those that pass the confidence check, and
    recurses while larger consequents still leave a non-empty antecedent.
    """
    m = len(H[0])
    # Growing to size m+1 only makes sense if freq_set has more than m+1 items,
    # otherwise the antecedent would be empty.
    if len(freq_set) <= m + 1:
        return
    Hmp1 = calc_confidence(freq_set, create_Ck(H, m + 1),
                           support_data, big_rules_list, min_confidence)
    if len(Hmp1) > 1:
        rules_from_conseq(freq_set, Hmp1, support_data, big_rules_list, min_confidence)
```
接下来,可以使用以下代码进行测试:
```python
# Demo: mine frequent itemsets of up to 3 items at min support 0.5,
# then derive association rules at min confidence 0.7.
data_set = load_data_set()
L, support_data = generate_L(data_set, k=3, min_support=0.5)
rules = generate_rules(L, support_data, min_confidence=0.7)
```
其中,`load_data_set` 函数用于加载数据集,`generate_L` 函数用于生成所有频繁项集,`generate_rules` 函数用于生成关联规则。在 `generate_L` 函数中,可以通过参数 `k` 指定频繁项集的最大长度(即每个频繁项集最多包含的项数)。在上述代码中,设置 `k=3` 表示最多挖掘包含三个项的频繁项集。同时,还可以通过参数 `min_support` 指定最小支持度阈值。在生成关联规则时,可以通过参数 `min_confidence` 指定最小置信度阈值。
阅读全文