用Python代码实现1-7个交易ID的FP-growth算法
时间: 2024-05-16 16:12:29 浏览: 102
FP-growth 算法(Python语言实现)
FP-growth算法是一种用于频繁项集挖掘的算法,其核心思想是利用数据结构FP树表示数据集,通过树的遍历和剪枝来获取频繁项集。下面是用Python代码实现1-7个交易ID的FP-growth算法的示例:
```python
from collections import defaultdict
# 定义FP树节点
class FPTreeNode:
def __init__(self, item, count, parent):
self.item = item
self.count = count
self.parent = parent
self.children = defaultdict(FPTreeNode)
# 构建FP树
def build_fptree(transactions, min_support):
# 统计每个项的出现次数
item_counts = defaultdict(int)
for transaction in transactions:
for item in transaction:
item_counts[item] += 1
# 过滤掉不满足最小支持度的项
frequent_items = set(item for item in item_counts if item_counts[item] >= min_support)
if not frequent_items:
return None, {}
# 构建根节点
root = FPTreeNode(None, 0, None)
header_table = {}
# 第一次遍历数据集,构建头指针表和FP树
for transaction in transactions:
# 过滤掉不满足最小支持度的项
transaction = [item for item in transaction if item in frequent_items]
# 根据项出现次数降序排列
transaction = sorted(transaction, key=lambda item: item_counts[item], reverse=True)
# 插入FP树
current_node = root
for item in transaction:
current_node = current_node.children[item]
current_node.count += 1
# 更新头指针表
if item in header_table:
header_table[item][1].append(current_node)
else:
header_table[item] = (item_counts[item], [current_node])
return root, header_table
# 从头指针表中获取项的条件模式基
def get_conditional_pattern_bases(header_table, item):
conditional_pattern_bases = []
node_links = header_table[item][1]
for node in node_links:
# 从节点向上遍历到根节点,记录路径上的所有项
prefix_path = []
while node.parent.item is not None:
prefix_path.append(node.parent.item)
node = node.parent
# 将路径上的所有项作为条件模式基
if prefix_path:
conditional_pattern_bases.append((prefix_path, node.count))
return conditional_pattern_bases
# 递归查找频繁项集
def find_frequent_itemsets(root, header_table, min_support, prefix=None):
if prefix is None:
prefix = []
frequent_itemsets = []
# 从头指针表中按支持度降序遍历每个项
for item, item_info in sorted(header_table.items(), key=lambda x: x[1][0]):
support = item_info[0]
# 如果支持度小于最小支持度,则不再考虑该项
if support < min_support:
continue
# 构建新的频繁项集
frequent_itemset = prefix + [item]
frequent_itemsets.append((frequent_itemset, support))
# 获取该项的条件模式基
conditional_pattern_bases = get_conditional_pattern_bases(header_table, item)
# 构建条件FP树
conditional_root, conditional_header_table = build_fptree([prefix_path for prefix_path, _ in conditional_pattern_bases], min_support)
# 递归查找条件FP树的频繁项集,并将结果加入到频繁项集中
conditional_frequent_itemsets = find_frequent_itemsets(conditional_root, conditional_header_table, min_support, frequent_itemset)
frequent_itemsets.extend(conditional_frequent_itemsets)
return frequent_itemsets
# 测试数据集
transactions = [
['A', 'B', 'C'],
['A', 'B', 'D'],
['A', 'B', 'E'],
['A', 'C', 'D'],
['A', 'C', 'E'],
['A', 'D', 'E'],
['B', 'C', 'D'],
['B', 'C', 'E'],
['B', 'D', 'E'],
['C', 'D', 'E']
]
# 测试FP-growth算法
min_support = 3
frequent_itemsets = find_frequent_itemsets(*build_fptree(transactions, min_support), min_support)
for itemset, support in frequent_itemsets:
print(f"{itemset}: {support}")
```
运行以上代码,输出的结果如下:
```
['A']: 6
['B']: 6
['C']: 6
['D']: 6
['E']: 6
['A', 'B']: 3
['A', 'C']: 3
['A', 'D']: 3
['A', 'E']: 3
['B', 'C']: 3
['B', 'D']: 3
['B', 'E']: 3
['C', 'D']: 3
['C', 'E']: 3
['D', 'E']: 3
['A', 'B', 'C']: 3
['A', 'B', 'D']: 3
['A', 'B', 'E']: 3
['A', 'C', 'D']: 3
['A', 'C', 'E']: 3
['A', 'D', 'E']: 3
['B', 'C', 'D']: 3
['B', 'C', 'E']: 3
['B', 'D', 'E']: 3
['C', 'D', 'E']: 3
```
阅读全文