apriori算法输入数据集
时间: 2023-09-15 12:23:49 浏览: 74
Apriori算法的输入数据集是一个包含多个事务(transactions)的集合,每个事务都是由一些项(items)组成的。这些项可以是任何类型的,例如商品、用户、网页等等。在Apriori算法中,由若干个项组成的集合被称为“项集”(itemset);如果一个项集在数据集中出现的频率(即支持度)不低于预先设定的最小支持度(minimum support),那么它就被认为是频繁的,称为“频繁项集”(frequent itemset)。Apriori算法的目的是找出所有频繁项集,从而发现项之间的关联规则(association rules)。
相关问题
用Python实现apriori算法输入数据集
以下是使用Python实现Apriori算法的示例代码,其中输入数据集为一个列表,列表中的每个元素也是一个列表(即一条交易记录),该列表中的每个元素为一个项(即某种商品):
```python
def load_dataset():
    """Return the sample transaction database used by the demo.

    Returns:
        A list of seven transactions; each transaction is a list of item
        names (strings). A new list is built on every call, so the caller
        may modify the result without affecting later calls.
    """
    return [
        ['bread', 'milk', 'cheese'],
        ['bread', 'milk'],
        ['milk', 'cheese'],
        ['bread', 'juice'],
        ['milk', 'juice'],
        ['bread', 'milk', 'juice', 'cheese'],
        ['bread', 'juice', 'cheese'],
    ]
def create_C1(dataset):
    """Build the candidate 1-itemsets (C1) for a transaction database.

    Args:
        dataset: Iterable of transactions; each transaction is an iterable
            of hashable, mutually orderable items.

    Returns:
        A list holding one single-item ``frozenset`` per distinct item, in
        ascending item order. Frozensets are hashable, so each candidate
        can be used as a dictionary key.
    """
    # A set gives O(1) duplicate detection instead of scanning a growing
    # list once for every item of every transaction.
    distinct_items = {item for transaction in dataset for item in transaction}
    return [frozenset([item]) for item in sorted(distinct_items)]
def scan_D(dataset, candidates, min_support):
    """Measure the support of each candidate and keep the frequent ones.

    Args:
        dataset: Sized collection of transactions (``len`` is taken); each
            transaction is an iterable of items.
        candidates: Iterable of ``frozenset`` candidate itemsets.
        min_support: Minimum relative support (fraction of transactions
            containing the itemset) for a candidate to count as frequent.

    Returns:
        A ``(frequent_items, support_data)`` tuple. ``frequent_items`` lists
        the candidates whose support is at least ``min_support``, in the
        order they were first matched. ``support_data`` maps every candidate
        that occurs in at least one transaction to its relative support,
        including candidates that fall below the threshold.
    """
    # Drop repeated candidates (keeping first-seen order): a candidate that
    # is listed twice would otherwise be counted twice per transaction and
    # end up with an inflated support.
    unique_candidates = list(dict.fromkeys(candidates))

    support_counts = {}
    for transaction in dataset:
        for candidate in unique_candidates:
            if candidate.issubset(transaction):
                support_counts[candidate] = support_counts.get(candidate, 0) + 1

    # If the dataset is empty, support_counts is empty too, so the division
    # below is never reached with a zero denominator.
    num_transactions = len(dataset)
    frequent_items = []
    support_data = {}
    for candidate, count in support_counts.items():
        support = count / num_transactions
        if support >= min_support:
            frequent_items.append(candidate)
        support_data[candidate] = support
    return frequent_items, support_data
def apriori_gen(frequent_items, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are joined when their first ``k - 2`` items, taken in
    sorted order, are identical. No subset-based pruning is done here, so
    a returned candidate may contain an infrequent (k-1)-subset.

    Args:
        frequent_items: Sequence of ``frozenset`` itemsets, expected to all
            have size ``k - 1``; items must be mutually orderable.
        k: Size of the candidate itemsets to generate.

    Returns:
        A list of distinct ``frozenset`` candidates, in the order the pairs
        that produced them were visited.
    """
    # Sort BEFORE slicing: a frozenset has no defined element order, so
    # slicing its raw list would compare arbitrary "prefixes" and could
    # skip valid joins or generate the same candidate more than once.
    keyed = [(itemset, sorted(itemset)[:k - 2]) for itemset in frequent_items]

    candidates = []
    already_emitted = set()
    for i, (left, left_prefix) in enumerate(keyed):
        for right, right_prefix in keyed[i + 1:]:
            if left_prefix != right_prefix:
                continue
            merged = left | right
            # Guard against repeats (e.g. when the input itself contains
            # duplicate itemsets) so each candidate is returned once.
            if merged not in already_emitted:
                already_emitted.add(merged)
                candidates.append(merged)
    return candidates
def apriori(dataset, min_support=0.5):
    """Search for frequent itemsets level by level.

    Level 1 candidates come from ``create_C1``; each later level's
    candidates come from ``apriori_gen`` applied to the previous level's
    frequent itemsets, and every level is filtered by ``scan_D``.

    Args:
        dataset: List of transactions, each an iterable of hashable items.
        min_support: Threshold forwarded unchanged to ``scan_D`` for every
            level. Defaults to 0.5.

    Returns:
        A ``(L, support_data)`` tuple. ``L[i]`` is the list ``scan_D``
        returned for itemsets of size ``i + 1``; the final entry is always
        an empty list, marking the level at which the search stopped.
        ``support_data`` merges the support dictionaries ``scan_D``
        returned for all levels.
    """
    C1 = create_C1(dataset)
    # Convert each transaction to a set once, up front, so the repeated
    # subset tests performed while scanning operate on sets.
    D = [set(transaction) for transaction in dataset]
    L1, support_data = scan_D(D, C1, min_support)
    L = [L1]
    k = 2
    # L[k - 2] holds the frequent (k-1)-itemsets; an empty level means no
    # larger candidates can be built, so the search ends.
    while L[k - 2]:
        Ck = apriori_gen(L[k - 2], k)
        Lk, support_k = scan_D(D, Ck, min_support)
        support_data.update(support_k)
        L.append(Lk)
        k += 1
    return L, support_data
# Demo: mine the sample transactions with the default minimum support and
# print the frequent itemsets per level, then the recorded supports.
dataset = load_dataset()
L, support_data = apriori(dataset)
print(L, support_data, sep='\n')
```
输出结果为(frozenset 内部元素的显示顺序可能因运行环境而不同):
```
[[frozenset({'bread'}), frozenset({'cheese'}), frozenset({'milk'}), frozenset({'juice'})], []]
{frozenset({'bread'}): 0.7142857142857143, frozenset({'cheese'}): 0.5714285714285714, frozenset({'milk'}): 0.7142857142857143, frozenset({'juice'}): 0.5714285714285714, frozenset({'bread', 'cheese'}): 0.42857142857142855, frozenset({'bread', 'milk'}): 0.42857142857142855, frozenset({'cheese', 'milk'}): 0.42857142857142855, frozenset({'bread', 'juice'}): 0.42857142857142855, frozenset({'milk', 'juice'}): 0.2857142857142857, frozenset({'cheese', 'juice'}): 0.2857142857142857}
```
编写程序完成下列算法: 1、apriori算法 输入: 数据集d;最小支持数minsup_count;
以下是 Python 代码实现 Apriori 算法:
```python
def load_dataset():
    """Return the four-transaction sample database for this example.

    Returns:
        A list of transactions; each transaction is a list of
        single-letter item names. A new list is built on every call.
    """
    return [
        ['A', 'B', 'C', 'D'],
        ['B', 'C', 'E'],
        ['A', 'B', 'C', 'E'],
        ['B', 'E'],
    ]
def create_c1(dataset):
    """Build the candidate 1-itemsets for a transaction database.

    Args:
        dataset: Iterable of transactions; each transaction is an iterable
            of hashable, mutually orderable items.

    Returns:
        A list with one single-item ``frozenset`` per distinct item, sorted
        by item. Frozensets are hashable, so each candidate can be used as
        a dictionary key.
    """
    # Collect distinct items in a set: membership is O(1), whereas testing
    # against a growing list rescans it for every item seen.
    seen_items = set()
    for transaction in dataset:
        seen_items.update(transaction)
    return [frozenset((item,)) for item in sorted(seen_items)]
def scan_dataset(dataset, candidate_set, minsup_count):
    """Count candidate occurrences and keep those meeting the minimum count.

    Args:
        dataset: Sized collection of transactions (``len`` is taken); each
            transaction is an iterable of items.
        candidate_set: Iterable of ``frozenset`` candidate itemsets.
        minsup_count: Minimum support count, i.e. the smallest number of
            transactions that must contain a candidate for it to be
            considered frequent.

    Returns:
        A ``(freq_sets, support_data)`` tuple. ``freq_sets`` lists the
        candidates contained in at least ``minsup_count`` transactions, in
        the order they were first matched. ``support_data`` maps every
        candidate found in at least one transaction to its relative support
        (occurrence count divided by the number of transactions).
    """
    # Remove repeated candidates (first-seen order kept) so that none is
    # counted more than once per transaction.
    unique_candidates = list(dict.fromkeys(candidate_set))

    item_count = {}
    for transaction in dataset:
        for candidate in unique_candidates:
            if candidate.issubset(transaction):
                item_count[candidate] = item_count.get(candidate, 0) + 1

    num_transactions = len(dataset)
    freq_sets = []
    support_data = {}
    for candidate, count in item_count.items():
        # The threshold is an absolute count, so it is compared with the
        # raw occurrence count rather than with the support ratio (a ratio
        # never exceeds 1 and would reject everything for counts >= 2).
        if count >= minsup_count:
            freq_sets.append(candidate)
        support_data[candidate] = count / num_transactions
    return freq_sets, support_data
def apriori_gen(freq_sets, k):
    """Generate candidate k-itemsets from frequent (k-1)-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, in sorted
    order, are equal. No subset-based pruning is applied, so a candidate
    may still contain an infrequent (k-1)-subset.

    Args:
        freq_sets: Sequence of ``frozenset`` itemsets, expected to all have
            size ``k - 1``; items must be mutually orderable.
        k: Size of the candidates to produce.

    Returns:
        A list of distinct ``frozenset`` candidates in generation order.
    """
    # The prefix must be taken from the SORTED items: frozenset iteration
    # order is arbitrary, so slicing first and sorting afterwards compares
    # meaningless prefixes and can miss or duplicate candidates.
    prefixes = [sorted(itemset)[:k - 2] for itemset in freq_sets]

    new_freq_sets = []
    produced = set()
    for i in range(len(freq_sets)):
        for j in range(i + 1, len(freq_sets)):
            if prefixes[i] == prefixes[j]:
                union = freq_sets[i] | freq_sets[j]
                # Emit each candidate once even if several pairs yield it.
                if union not in produced:
                    produced.add(union)
                    new_freq_sets.append(union)
    return new_freq_sets
def apriori(dataset, minsup_count):
    """Search for frequent itemsets level by level.

    Level 1 candidates come from ``create_c1``; each later level's
    candidates come from ``apriori_gen`` applied to the previous level's
    frequent itemsets, and every level is filtered by ``scan_dataset``.

    Args:
        dataset: List of transactions, each an iterable of hashable items.
        minsup_count: Threshold forwarded unchanged to ``scan_dataset`` for
            every level.

    Returns:
        A ``(freq_sets, support_data)`` tuple. ``freq_sets[i]`` is the list
        ``scan_dataset`` returned for itemsets of size ``i + 1``; the final
        entry is always an empty list, marking where the search stopped.
        ``support_data`` merges the dictionaries ``scan_dataset`` returned
        for all levels.
    """
    c1 = create_c1(dataset)
    # Work on set copies of the transactions; the caller's list is not
    # modified (only the local name is rebound).
    dataset = [set(transaction) for transaction in dataset]
    freq_sets1, support_data = scan_dataset(dataset, c1, minsup_count)
    freq_sets = [freq_sets1]
    k = 2
    # freq_sets[k - 2] holds the frequent (k-1)-itemsets; once a level is
    # empty there is nothing left to join, so the loop ends.
    while freq_sets[k - 2]:
        ck = apriori_gen(freq_sets[k - 2], k)
        freq_sets_k, support_k = scan_dataset(dataset, ck, minsup_count)
        support_data.update(support_k)
        freq_sets.append(freq_sets_k)
        k += 1
    return freq_sets, support_data
```
其中,`load_dataset` 函数用于加载数据集,`create_c1` 函数用于生成候选 1-项集(每个候选项集只包含一个项),`scan_dataset` 函数用于扫描数据集、统计各候选项集的出现次数并筛选出频繁项集,`apriori_gen` 函数用于由频繁 (k-1)-项集生成候选 k-项集,`apriori` 函数是 Apriori 算法的主要实现。
阅读全文