from apriori import *

# Implement association-rule extraction.
# NOTE(review): load_data and generate_L are expected to come from the
# project-local `apriori` module via the star import above — confirm.
dataset = load_data('data/apriori.txt')


def generate_big_rules(L, support_data, min_conf):
    """Derive association rules from the frequent itemsets in *L*.

    Every frequent itemset is compared with all itemsets visited before it;
    whenever an earlier itemset is contained in the current one, it is used
    as the rule consequent and the remaining items as the antecedent.

    Args:
        L: Sequence of levels, each an iterable of frequent itemsets.
            Itemsets must support ``issubset`` and ``-`` (presumably
            frozensets — confirm against ``generate_L``).
        support_data: Mapping from itemset to its support.
        min_conf: Minimum confidence a rule must reach to be kept.

    Returns:
        List of ``(antecedent, consequent, support, confidence)`` tuples.
    """
    big_rule_list = []
    sub_set_list = []
    for level in L:
        for freq_set in level:
            # e.g. freq_set: ('B4'), ('B4', 'C4', 'H4')
            for sub_set in sub_set_list:
                # **************** BEGIN *****************
                if sub_set.issubset(freq_set):
                    antecedent = freq_set - sub_set
                    if not antecedent:
                        # sub_set equals freq_set (duplicate entry): there is
                        # no antecedent, hence no rule to evaluate.
                        continue
                    # Confidence = support(whole itemset) / support(antecedent).
                    conf = support_data[freq_set] / support_data[antecedent]
                    # Antecedent, consequent, support, confidence.
                    big_rule = (antecedent, sub_set, support_data[freq_set], conf)
                    if conf >= min_conf and big_rule not in big_rule_list:
                        big_rule_list.append(big_rule)
                # **************** END *******************
            sub_set_list.append(freq_set)
    return big_rule_list


def task():
    """Mine frequent itemsets from ``dataset`` and return the rules found."""
    L, support_data = generate_L(dataset, k=4, min_support=0.06)
    # Search the frequent itemsets for rules with confidence of at least 0.75.
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.75)
    return big_rules_list
时间: 2023-07-19 09:55:35 浏览: 52
这段代码的功能是实现关联规则抽取,具体步骤如下:
1. 使用apriori.py中的load_data方法从文件中读入经过离散化处理后的数据集。
2. 定义generate_big_rules函数来获取关联规则。该函数的输入参数包括频繁项集L、支持度信息support_data和最小置信度min_conf。
3. 在函数中,通过两层循环遍历频繁项集L;对于每个频繁项集freq_set,再遍历sub_set_list(此前已访问过的频繁项集),判断其中的sub_set是否为freq_set的子集,如果是,则计算置信度,并保存满足最小置信度的关联规则。
4. 最后返回关联规则列表big_rule_list。
5. 在task函数中,调用generate_L方法得到频繁项集L和支持度信息support_data,然后调用generate_big_rules方法得到关联规则列表big_rules_list,并将其作为返回值。
需要注意的是,该代码中的load_data和generate_L函数并未在此处定义,它们通过第一行的 from apriori import * 从apriori模块导入,因此运行前需要确保apriori.py可用。
相关问题
从外部录入数据集不用apriori库的关联规则apriori算法代码
以下是一个简单的关联规则Apriori算法的Python实现,可以从外部读取数据集文件。这个实现仅用于学习和演示目的,对于大规模数据集,建议使用更高效的实现。
```python
import itertools
def load_dataset(filename):
    """Read a transaction file into a list of integer lists.

    Each line of *filename* is stripped, split on whitespace and every token
    is converted with ``int``; a blank line yields an empty transaction.
    """
    transactions = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            transactions.append([int(token) for token in tokens])
    return transactions
def support_count(dataset, itemset):
    """Return how many transactions in *dataset* contain every item of *itemset*.

    Args:
        dataset: Iterable of transactions (each an iterable of items).
        itemset: Iterable of hashable items; may be a one-shot iterator.

    Returns:
        The absolute number of supporting transactions (an ``int``).
    """
    # Build the lookup set once: it is loop-invariant, and materialising it
    # up front keeps the count correct even when `itemset` is a generator.
    wanted = set(itemset)
    return sum(1 for transaction in dataset if wanted.issubset(transaction))
def generate_candidate_itemsets(dataset, k):
    """Collect the distinct k-item combinations that occur in any transaction.

    Args:
        dataset: Iterable of transactions (each a sequence of hashable items).
        k: Size of the combinations to enumerate.

    Returns:
        List of candidates, each a list of ``k`` items, in first-seen order.
        Two combinations holding the same items in a different order count
        as the same itemset; only the first one encountered is kept.
    """
    candidate_itemsets = []
    # Hash-based membership instead of scanning the result list, keyed on the
    # unordered item set so that [1, 2] and [2, 1] are not both reported.
    seen = set()
    for transaction in dataset:
        for combination in itertools.combinations(transaction, k):
            key = frozenset(combination)
            if key not in seen:
                seen.add(key)
                candidate_itemsets.append(list(combination))
    return candidate_itemsets
def generate_frequent_itemsets(dataset, min_support):
    """Return every itemset whose support count reaches *min_support*.

    Candidate sizes are tried in increasing order starting at 1; the search
    stops at the first size for which no candidate is frequent. The value
    returned by ``support_count`` is compared against *min_support* directly.
    """
    collected = []
    size = 1
    while True:
        survivors = [
            candidate
            for candidate in generate_candidate_itemsets(dataset, size)
            if support_count(dataset, candidate) >= min_support
        ]
        if not survivors:
            return collected
        collected.extend(survivors)
        size += 1
def generate_association_rules(frequent_itemsets, min_confidence, dataset=None):
    """Build the rules ``antecedent => consequent`` that meet *min_confidence*.

    Args:
        frequent_itemsets: Iterable of itemsets (sequences of items).
        min_confidence: Minimum confidence for a rule to be reported.
        dataset: Transactions used to count supports. When omitted, the
            module-level name ``dataset`` is used, which is how the
            two-argument form has always obtained its data.

    Returns:
        List of ``(antecedent, consequent, confidence)`` tuples, where
        antecedent and consequent are lists.

    Raises:
        NameError: if *dataset* is omitted and no module-level ``dataset``
            exists.
    """
    if dataset is None:
        try:
            dataset = globals()['dataset']
        except KeyError:
            raise NameError("name 'dataset' is not defined") from None

    association_rules = []
    for itemset in frequent_itemsets:
        if len(itemset) < 2:
            # A rule needs a non-empty antecedent and a non-empty consequent.
            continue
        # The support of the whole itemset is the same for all of its rules.
        support_itemset = support_count(dataset, itemset)
        for i in range(1, len(itemset)):
            for antecedent in itertools.combinations(itemset, i):
                antecedent = list(antecedent)
                consequent = list(set(itemset) - set(antecedent))
                support_antecedent = support_count(dataset, antecedent)
                if not support_antecedent:
                    # Confidence is undefined when the antecedent never occurs.
                    continue
                confidence = support_itemset / support_antecedent
                if confidence >= min_confidence:
                    association_rules.append((antecedent, consequent, confidence))
    return association_rules
if __name__ == '__main__':
    # Thresholds: minimum support and minimum confidence.
    min_support = 2
    min_confidence = 0.5

    # NOTE: generate_association_rules looks this up as the module-level name
    # `dataset`, so it has to be bound under exactly this name.
    dataset = load_dataset('dataset.txt')

    # Mine and show the frequent itemsets.
    frequent_itemsets = generate_frequent_itemsets(dataset, min_support)
    print('Frequent Itemsets:', frequent_itemsets, sep='\n')

    # Derive and show the association rules.
    association_rules = generate_association_rules(frequent_itemsets, min_confidence)
    print('Association Rules:')
    for rule in association_rules:
        # Each rule is an (antecedent, consequent, confidence) triple.
        print('{} => {}: {:.2f}'.format(*rule))
```
在这个实现中,`load_dataset()`函数从外部文件中读取数据集,`support_count()`函数计算项集的支持度,`generate_candidate_itemsets()`函数生成候选项集,`generate_frequent_itemsets()`函数生成频繁项集,`generate_association_rules()`函数生成关联规则。您可以根据自己的需要调整代码。
def create_C1(dataset):
    """Return the candidate 1-itemsets of *dataset* as sorted frozensets.

    Every distinct item that occurs in any transaction becomes one
    single-element ``frozenset``; the list is ordered by item value.
    """
    unique_items = {item for transaction in dataset for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]


def scan_D(D, Ck, min_support):
    """Count the candidates *Ck* over the transactions *D*.

    Args:
        D: Sequence of transactions, each a set of items.
        Ck: Iterable of candidate itemsets (frozensets).
        min_support: Minimum relative support (fraction of ``len(D)``).

    Returns:
        ``(retList, support_data)``: the candidates whose support reaches
        *min_support* (in reverse order of first occurrence), and a dict
        mapping every candidate that occurred at least once to its support.
    """
    counts = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                counts[can] = counts.get(can, 0) + 1

    num_items = float(len(D))
    retList = []
    support_data = {}
    for key, count in counts.items():
        support = count / num_items
        if support >= min_support:
            retList.append(key)
        # NOTE(review): support is recorded for every counted candidate, as
        # in the canonical form of this algorithm — confirm against the
        # original indentation, which is not recoverable from this paste.
        support_data[key] = support
    # Same ordering as inserting each hit at the front of the list.
    retList.reverse()
    return retList, support_data


def apriori_gen(Lk, k):
    """Join frequent (k-1)-itemsets that share their first k-2 sorted items.

    Args:
        Lk: Sequence of frequent itemsets (frozensets) of size ``k - 1``.
        k: Size of the candidates to create.

    Returns:
        List of candidate k-itemsets (frozensets).
    """
    # Sort BEFORE slicing: a frozenset has no defined iteration order, so
    # slicing first would compare arbitrary items rather than the prefix.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    for i, left in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                retList.append(left | Lk[j])
    return retList


def apriori(dataset, min_support):
    """Run Apriori and return ``(L, support_data)``.

    ``L[i]`` holds the frequent itemsets of size ``i + 1``; the final entry
    of ``L`` is the empty list that terminated the search. ``support_data``
    maps itemsets to their relative support.
    """
    C1 = create_C1(dataset)
    D = list(map(set, dataset))
    L1, support_data = scan_D(D, C1, min_support)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = apriori_gen(L[k - 2], k)
        Lk, support_k = scan_D(D, Ck, min_support)
        support_data.update(support_k)
        L.append(Lk)
        k += 1
    return L, support_data

# Question: how can confidence be added to this code?
在这段代码中加入置信度,实际上是要实现关联规则算法。在Apriori算法的基础上,可以通过以下步骤来实现关联规则算法:
1. 从频繁项集L中生成候选规则集合,即对于每个至少包含两个元素的频繁项集,枚举其非空真子集作为规则后件,其余元素作为规则前件。
2. 对于每个候选规则,计算其置信度,并筛选出满足最小置信度要求的规则。
下面是修改后的代码实现:
```python
def generate_rules(L, support_data, min_conf):
    """Generate the association rules that reach *min_conf*.

    Args:
        L: List of levels of frequent itemsets; ``L[0]`` holds the
            1-itemsets and is skipped because a rule needs two items.
        support_data: Mapping from itemset (frozenset) to its support.
        min_conf: Minimum confidence for a rule to be kept.

    Returns:
        List of ``(antecedent, consequent, confidence)`` tuples.
    """
    rules_list = []
    for level in L[1:]:
        for freq_set in level:
            H1 = [frozenset([item]) for item in freq_set]
            # Always evaluate the single-item consequents first. Handing
            # larger itemsets straight to rules_from_conseq starts at
            # two-item consequents and never tests rules such as
            # {a, b} --> {c}.
            H1 = calc_conf(freq_set, H1, support_data, rules_list, min_conf)
            # Larger consequents can only pass if each of their single items
            # passed, so growing from the surviving ones loses no rule. At
            # least two survivors are needed to merge anything.
            if len(freq_set) > 2 and len(H1) > 1:
                rules_from_conseq(freq_set, H1, support_data, rules_list, min_conf)
    return rules_list
def calc_conf(freq_set, H, support_data, brl, min_conf):
    """Evaluate the rules ``(freq_set - conseq) --> conseq`` for each conseq in *H*.

    Confidence is ``support_data[freq_set] / support_data[freq_set - conseq]``.
    Every rule reaching *min_conf* is printed and appended to *brl* as an
    ``(antecedent, consequent, confidence)`` tuple.

    Returns:
        The consequents from *H* whose rule reached *min_conf*, in order.
    """
    kept = []
    for consequent in H:
        whole_support = support_data[freq_set]
        antecedent = freq_set - consequent
        confidence = whole_support / support_data[antecedent]
        if not confidence >= min_conf:
            continue
        print(antecedent, '-->', consequent, 'conf:', confidence)
        brl.append((antecedent, consequent, confidence))
        kept.append(consequent)
    return kept
def rules_from_conseq(freq_set, H, support_data, brl, min_conf):
    """Recursively grow the consequents in *H* by one item and test the rules.

    The consequents are merged with ``apriori_gen`` into candidates one item
    larger and scored with ``calc_conf`` (which appends passing rules to
    *brl*). The recursion continues while more than one consequent survives
    and *freq_set* is still large enough to leave a non-empty antecedent.
    """
    conseq_size = len(H[0])
    if len(freq_set) <= conseq_size + 1:
        # A larger consequent would leave nothing for the antecedent.
        return
    merged = apriori_gen(H, conseq_size + 1)
    survivors = calc_conf(freq_set, merged, support_data, brl, min_conf)
    if len(survivors) > 1:
        rules_from_conseq(freq_set, survivors, support_data, brl, min_conf)
def apriori(dataset, min_support, min_conf):
    """Mine frequent itemsets and association rules in one call.

    Returns:
        ``(L, support_data, rules_list)``: the levels of frequent itemsets
        (ending with the empty level that stopped the search), the support
        mapping accumulated from ``scan_D``, and the rules produced by
        ``generate_rules`` for *min_conf*.
    """
    transactions = list(map(set, dataset))
    first_level, support_data = scan_D(transactions, create_C1(dataset), min_support)
    L = [first_level]
    size = 2
    # L[-1] is always the level built last, i.e. L[size - 2].
    while L[-1]:
        candidates = apriori_gen(L[-1], size)
        level, level_support = scan_D(transactions, candidates, min_support)
        support_data.update(level_support)
        L.append(level)
        size += 1
    rules_list = generate_rules(L, support_data, min_conf)
    return L, support_data, rules_list
```
其中,generate_rules函数用于生成关联规则,calc_conf函数用于计算规则的置信度,rules_from_conseq函数用于从频繁项集中生成候选规则,apriori函数用于调用Apriori算法和关联规则算法,并返回频繁项集、支持度数据和关联规则列表。在调用apriori函数时,需要传入最小支持度和最小置信度阈值。