请帮我给以下代码添加注释:def apriori_gen(Lk, k): retList = [] len_Lk = len(Lk) for i in range(len_Lk): for j in range(i+1, len_Lk): L1 = list(Lk[i])[:k-2] L2 = list(Lk[j])[:k-2] L1.sort() L2.sort() if L1 == L2: retList.append(Lk[i] | Lk[j]) return retList
时间: 2024-02-14 13:13:50 浏览: 126
这段代码是实现 Apriori 算法中的 Apriori-Gen 步骤,用于生成由频繁项集 Lk-1 产生的候选项集 Ck。下面是对代码的注释:
```python
def apriori_gen(Lk, k):
    """Join frequent itemsets into candidate k-itemsets (Apriori-Gen step).

    Two itemsets are merged when their first ``k - 2`` items, taken in
    sorted order, are identical.

    Args:
        Lk: indexable collection of frozensets; for the Apriori join each
            is expected to hold ``k - 1`` items.
        k: size of the candidate itemsets to build.

    Returns:
        A list with the union of every pair of itemsets whose sorted
        ``k - 2``-item prefixes match, in pair order (i < j).
    """
    # Sort BEFORE slicing. Set iteration order is arbitrary, so slicing the
    # raw list first (as the original code did) compares unrelated items and
    # can miss valid candidates.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    candidates = []
    for i, left in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                candidates.append(left | Lk[j])
    return candidates
```
相关问题
def create_C1(dataset): C1 = [] for transaction in dataset: for item in transaction: if not [item] in C1: C1.append([item]) C1.sort() return list(map(frozenset, C1)) def scan_D(D, Ck, min_support): ssCnt = {} for tid in D: for can in Ck: if can.issubset(tid): if not can in ssCnt: ssCnt[can] = 1 else: ssCnt[can] += 1 num_items = float(len(D)) retList = [] support_data = {} for key in ssCnt: support = ssCnt[key] / num_items if support >= min_support: retList.insert(0, key) support_data[key] = support return retList, support_data def apriori_gen(Lk, k): retList = [] len_Lk = len(Lk) for i in range(len_Lk): for j in range(i+1, len_Lk): L1 = list(Lk[i])[:k-2] L2 = list(Lk[j])[:k-2] L1.sort() L2.sort() if L1 == L2: retList.append(Lk[i] | Lk[j]) return retList def apriori(dataset, min_support=0.5): C1 = create_C1(dataset) D = list(map(set, dataset)) L1, support_data = scan_D(D, C1, min_support) L = [L1] k = 2 while len(L[k-2]) > 0: Ck = apriori_gen(L[k-2], k) Lk, support_k = scan_D(D, Ck, min_support) support_data.update(support_k) L.append(Lk) k += 1 return L, support_data 在此代码基础上增加置信度
在Apriori算法中,可以通过计算关联规则的置信度来评估规则的可靠性。置信度表示当一个规则的前提项出现时,结论项也会出现的概率。可以使用以下代码来计算关联规则的置信度:
```python
def generate_rules(L, support_data, min_confidence=0.7):
    """Build association rules from the frequent itemsets in ``L``.

    Args:
        L: list of levels; ``L[i]`` holds the frequent itemsets found at
            level ``i``. Level 0 is skipped.
        support_data: mapping from itemset to its support.
        min_confidence: threshold forwarded to the rule helpers.

    Returns:
        The list that the helpers fill with
        ``(antecedent, consequent, confidence)`` tuples.
    """
    rules = []
    for level, itemsets in enumerate(L):
        if level == 0:
            # Level 0 itemsets are not used for rule generation.
            continue
        for itemset in itemsets:
            single_consequents = [frozenset([element]) for element in itemset]
            if level == 1:
                calc_confidence(itemset, single_consequents, support_data,
                                rules, min_confidence)
            else:
                rules_from_conseq(itemset, single_consequents, support_data,
                                  rules, min_confidence)
    return rules
def calc_confidence(freq_set, H, support_data, brl, min_confidence=0.7):
    """Score each candidate consequent in ``H`` and keep the confident ones.

    The confidence of ``(freq_set - conseq) --> conseq`` is
    ``support_data[freq_set] / support_data[freq_set - conseq]``.

    Args:
        freq_set: the frequent itemset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        support_data: mapping from itemset to its support.
        brl: list that accepted ``(antecedent, consequent, confidence)``
            tuples are appended to (mutated in place).
        min_confidence: minimum confidence for a rule to be accepted.

    Returns:
        The consequents from ``H`` whose rule met ``min_confidence``.
    """
    accepted = []
    whole_support = support_data[freq_set]
    for conseq in H:
        antecedent = freq_set - conseq
        conf = whole_support / support_data[antecedent]
        if conf < min_confidence:
            continue
        print(antecedent, "-->", conseq, "conf:", conf)
        brl.append((antecedent, conseq, conf))
        accepted.append(conseq)
    return accepted
def rules_from_conseq(freq_set, H, support_data, brl, min_confidence=0.7):
    """Derive rules from ``freq_set``, growing the consequent step by step.

    The consequents in ``H`` are scored first; those that pass are merged
    into consequents one item larger and the process repeats.

    The original version merged ``H`` before scoring it, so for itemsets
    with three or more items every rule with a single-item consequent was
    silently skipped. Scoring ``H`` first fixes that. Merging only the
    survivors loses nothing: enlarging a consequent shrinks the antecedent,
    whose support can only grow, so confidence can only drop.

    Args:
        freq_set: the frequent itemset the rules are derived from.
        H: list of not-yet-scored candidate consequents, all the same size.
        support_data: mapping from itemset to its support.
        brl: list that accepted rules are appended to (mutated in place).
        min_confidence: minimum confidence for a rule to be accepted.

    Returns:
        None. Results are delivered through ``brl``.
    """
    if not H:
        return
    m = len(H[0])
    if len(freq_set) <= m:
        # The consequent would swallow the whole itemset, leaving no antecedent.
        return

    survivors = calc_confidence(freq_set, H, support_data, brl, min_confidence)

    # Need at least two survivors to merge, and room for an antecedent
    # once the consequent has m + 1 items.
    if len(survivors) > 1 and len(freq_set) > m + 1:
        larger = apriori_gen(survivors, m + 1)
        if larger:
            rules_from_conseq(freq_set, larger, support_data, brl,
                              min_confidence)
```
其中,`generate_rules`函数用于生成关联规则,`calc_confidence`函数用于计算规则的置信度,`rules_from_conseq`函数用于从H中生成更多的关联规则。在计算置信度时,用整个频繁项集的支持度除以规则前提项(前件)的支持度,即 conf(A→B) = support(A∪B) / support(A),然后与最小置信度进行比较。如果置信度大于或等于最小置信度,则输出该关联规则并将其加入规则列表。
def create_C1(dataset): C1 = [] for transaction in dataset: for item in transaction: if not [item] in C1: C1.append([item]) C1.sort() return list(map(frozenset, C1)) # print(len(create_C1(dataset))) def scan_D(D, Ck, min_support): ssCnt = {} for tid in D: for can in Ck: if can.issubset(tid): if not can in ssCnt: ssCnt[can] = 1 else: ssCnt[can] += 1 num_items = float(len(D)) retList = [] support_data = {} for key in ssCnt: support = ssCnt[key] / num_items if support >= min_support: retList.insert(0, key) support_data[key] = support return retList, support_data def apriori_gen(Lk, k): retList = [] len_Lk = len(Lk) for i in range(len_Lk): for j in range(i + 1, len_Lk): L1 = list(Lk[i])[:k - 2] L2 = list(Lk[j])[:k - 2] L1.sort() L2.sort() if L1 == L2: retList.append(Lk[i] | Lk[j]) return retList def apriori(dataset, min_support): C1 = create_C1(dataset) D = list(map(set, dataset)) L1, support_data = scan_D(D, C1, min_support) L = [L1] k = 2 while len(L[k - 2]) > 0: Ck = apriori_gen(L[k - 2], k) Lk, support_k = scan_D(D, Ck, min_support) support_data.update(support_k) L.append(Lk) k += 1 return L, support_data如果我想在这段代码中加入置信度,怎么加
在这段代码中加入置信度,实际上是要实现关联规则算法。在Apriori算法的基础上,可以通过以下步骤来实现关联规则算法:
1. 从频繁项集L中生成候选规则集合,即对于每个至少包含两个元素的频繁项集,把它拆分为前件(前提)和后件(结论)两部分,通过枚举可能的后件得到候选规则。
2. 对于每个候选规则,计算其置信度,并筛选出满足最小置信度要求的规则。
下面是修改后的代码实现:
```python
def generate_rules(L, support_data, min_conf):
    """Build association rules from the frequent itemsets in ``L``.

    Args:
        L: list of levels; ``L[i]`` holds the frequent itemsets found at
            level ``i``. Level 0 is skipped.
        support_data: mapping from itemset to its support.
        min_conf: confidence threshold forwarded to the rule helpers.

    Returns:
        The list that the helpers fill with
        ``(antecedent, consequent, confidence)`` tuples.
    """
    collected = []
    for level, itemsets in enumerate(L):
        if level == 0:
            # Level 0 itemsets are not used for rule generation.
            continue
        for itemset in itemsets:
            single_consequents = [frozenset([element]) for element in itemset]
            if level == 1:
                calc_conf(itemset, single_consequents, support_data,
                          collected, min_conf)
            else:
                rules_from_conseq(itemset, single_consequents, support_data,
                                  collected, min_conf)
    return collected
def calc_conf(freq_set, H, support_data, brl, min_conf):
    """Score each candidate consequent in ``H`` and keep the confident ones.

    The confidence of ``(freq_set - conseq) --> conseq`` is
    ``support_data[freq_set] / support_data[freq_set - conseq]``.

    Args:
        freq_set: the frequent itemset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        support_data: mapping from itemset to its support.
        brl: list that accepted ``(antecedent, consequent, confidence)``
            tuples are appended to (mutated in place).
        min_conf: minimum confidence for a rule to be accepted.

    Returns:
        The consequents from ``H`` whose rule met ``min_conf``.
    """
    accepted = []
    whole_support = support_data[freq_set]
    for conseq in H:
        antecedent = freq_set - conseq
        conf = whole_support / support_data[antecedent]
        if conf < min_conf:
            continue
        print(antecedent, '-->', conseq, 'conf:', conf)
        brl.append((antecedent, conseq, conf))
        accepted.append(conseq)
    return accepted
def rules_from_conseq(freq_set, H, support_data, brl, min_conf):
    """Derive rules from ``freq_set``, growing the consequent step by step.

    The consequents in ``H`` are scored first; those that pass are merged
    into consequents one item larger and the process repeats.

    The original version merged ``H`` before scoring it, so for itemsets
    with three or more items every rule with a single-item consequent was
    silently skipped. Scoring ``H`` first fixes that. Merging only the
    survivors loses nothing: enlarging a consequent shrinks the antecedent,
    whose support can only grow, so confidence can only drop.

    Args:
        freq_set: the frequent itemset the rules are derived from.
        H: list of not-yet-scored candidate consequents, all the same size.
        support_data: mapping from itemset to its support.
        brl: list that accepted rules are appended to (mutated in place).
        min_conf: minimum confidence for a rule to be accepted.

    Returns:
        None. Results are delivered through ``brl``.
    """
    if not H:
        return
    m = len(H[0])
    if len(freq_set) <= m:
        # The consequent would swallow the whole itemset, leaving no antecedent.
        return

    survivors = calc_conf(freq_set, H, support_data, brl, min_conf)

    # Need at least two survivors to merge, and room for an antecedent
    # once the consequent has m + 1 items.
    if len(survivors) > 1 and len(freq_set) > m + 1:
        larger = apriori_gen(survivors, m + 1)
        if larger:
            rules_from_conseq(freq_set, larger, support_data, brl, min_conf)
def apriori(dataset, min_support, min_conf):
    """Run Apriori on ``dataset`` and derive association rules.

    Args:
        dataset: collection of transactions, each an iterable of items.
        min_support: threshold passed to ``scan_D`` for every level.
        min_conf: threshold passed to ``generate_rules``.

    Returns:
        A tuple ``(L, support_data, rules_list)``. ``L`` is the list of
        per-level frequent itemsets (its last entry is the empty level that
        stopped the search), ``support_data`` accumulates the support
        mappings returned by ``scan_D``, and ``rules_list`` is the result
        of ``generate_rules``.
    """
    # Build the level-1 candidates before materialising the transactions,
    # matching the original evaluation order.
    first_candidates = create_C1(dataset)
    transactions = [set(transaction) for transaction in dataset]

    first_level, support_data = scan_D(transactions, first_candidates,
                                       min_support)
    L = [first_level]
    k = 2
    # Keep extending until the most recently found level is empty.
    while L[-1]:
        candidates = apriori_gen(L[-1], k)
        frequent, level_support = scan_D(transactions, candidates, min_support)
        support_data.update(level_support)
        L.append(frequent)
        k += 1

    return L, support_data, generate_rules(L, support_data, min_conf)
```
其中,generate_rules函数用于生成关联规则,calc_conf函数用于计算规则的置信度,rules_from_conseq函数用于从频繁项集中生成候选规则,apriori函数用于调用Apriori算法和关联规则算法,并返回频繁项集、支持度数据和关联规则列表。在调用apriori函数时,需要传入最小支持度和最小置信度阈值。
阅读全文