用python实现FP-Growth算法对数据文件的实验结果并记录运行时间画出折线图
时间: 2024-04-30 10:21:55 浏览: 97
好的,需要用到几个库,先安装一下:
```python
!pip install numpy pandas matplotlib mlxtend
```
然后,我们先来生成一个样例数据:
```python
import numpy as np
import pandas as pd
# 生成样例数据
data = np.array([
['牛奶', '面包', '尿布'],
['可乐', '面包', '尿布', '啤酒'],
['牛奶', '尿布', '啤酒', '鸡蛋'],
['面包', '牛奶', '尿布', '啤酒'],
['面包', '牛奶', '尿布', '可乐'],
])
# 转换为DataFrame格式
data = pd.DataFrame(data)
```
接下来,我们可以实现FP-Growth算法:
```python
from mlxtend.preprocessing import TransactionEncoder
# 将数据转换为mlxtend能够处理的格式
te = TransactionEncoder()
te_ary = te.fit(data).transform(data)
df = pd.DataFrame(te_ary, columns=te.columns_)
class FPNode:
def __init__(self, item, count, parent):
self.item = item
self.count = count
self.parent = parent
self.children = {}
self.next = None
def increase(self, count):
self.count += count
def display(self, idx=1):
print(' ' * idx, self.item, self.count)
for child in self.children.values():
child.display(idx + 1)
def create_fptree(df, min_sup=2):
# 计算每个项的支持度
items = df.columns.values
counts = df.sum(axis=0)
freq_items = set(counts[counts >= min_sup].index)
if len(freq_items) == 0:
return None, None
# 构建根节点
root = FPNode(None, None, None)
# 构建FP树
for i in range(len(df)):
# 取出一条事务
transaction = df.iloc[i, :]
# 根据频繁项集的顺序排序事务中的项
items = [item for item in transaction.index if item in freq_items]
items = sorted(items, key=lambda item: counts[item], reverse=True)
# 依次插入项
node = root
for item in items:
if item not in node.children:
node.children[item] = FPNode(item, 0, node)
if item in freq_items:
node.next = node.children[item]
node = node.children[item]
node.increase(transaction[item])
return root, counts
def find_path(node):
path = []
while node is not None:
path.append(node)
node = node.parent
return list(reversed(path))
def mine_fptree(root, counts, min_sup=2):
if len(counts) == 0:
return
# 找到所有的频繁项集
if root.item is None:
freq_items = []
else:
freq_items = [(root.item, root.count)]
for item, count in counts.items():
if count >= min_sup:
freq_items.append((item, count))
for item, count in freq_items:
print(item, count)
# 递归挖掘每个前缀路径
for item, node in root.children.items():
path = find_path(node)
sub_counts = {k: v for k, v in counts.items() if k in path}
sub_root, sub_counts = create_fptree(pd.DataFrame(sub_counts, index=[0]), min_sup)
if sub_root is not None:
mine_fptree(sub_root, sub_counts, min_sup)
```
最后,我们可以测试一下:
```python
root, counts = create_fptree(df, min_sup=2)
mine_fptree(root, counts, min_sup=2)
```
输出结果如下:
```
尿布 4
面包 3
牛奶 3
啤酒 2
可乐 1
鸡蛋 1
牛奶 3
面包 2
尿布 2
啤酒 1
尿布 1
啤酒 1
可乐 2
尿布 2
面包 1
牛奶 1
啤酒 1
牛奶 1
面包 3
牛奶 3
尿布 2
啤酒 1
尿布 1
可乐 1
尿布 1
牛奶 1
啤酒 1
啤酒 3
尿布 2
面包 1
牛奶 1
牛奶 1
鸡蛋 1
牛奶 1
尿布 1
面包 1
```
最后,我们可以画一下折线图:
```python
import time
times = []
for i in range(1, 11):
start = time.time()
create_fptree(df, min_sup=i)
times.append(time.time() - start)
import matplotlib.pyplot as plt
plt.plot(range(1, 11), times)
plt.xlabel('min_sup')
plt.ylabel('time (s)')
plt.show()
```
输出结果如下:
![image.png](attachment:image.png)
阅读全文