Implementing the CART Classification and Regression Algorithm in Python on the Iris Dataset
Below is a Python implementation of the CART classification and regression algorithm, using the Iris dataset:
1. Import the required libraries and the dataset
```python
import pandas as pd
from sklearn.datasets import load_iris
# Load the Iris dataset
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.DataFrame(iris.target, columns=['target'])
```
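Not part of the original walkthrough, but a quick sanity check of what was loaded can be helpful before building any trees; this only relies on standard `load_iris` attributes:
```python
# Quick look at the loaded data (optional)
print(X.shape)                      # (150, 4): 150 samples, 4 numeric features
print(iris.target_names)            # ['setosa' 'versicolor' 'virginica']
print(y['target'].value_counts())   # 50 samples per class
```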
2. Define the CART classification-tree functions
```python
def find_best_split(df, target):
    """Find the best (feature, threshold) split by weighted Gini impurity."""
    # Only accept a split if it reduces the impurity of the current node
    best_gini = calc_gini(df[target])
    best_feature = None
    best_value = None
    # Try every feature (except the target column) and every observed value as a threshold
    for feature in df.columns:
        if feature == target:
            continue
        for value in set(df[feature]):
            left = df[df[feature] < value]
            right = df[df[feature] >= value]
            if len(left) == 0 or len(right) == 0:
                continue
            # Weighted Gini impurity of the two child nodes
            cur_gini = ((len(left) / len(df)) * calc_gini(left[target])
                        + (len(right) / len(df)) * calc_gini(right[target]))
            if cur_gini < best_gini:
                best_gini = cur_gini
                best_feature = feature
                best_value = value
    return best_feature, best_value


def build_tree(df, target):
    """Recursively build a CART classification tree as a nested dict."""
    # If only one class remains, return that class as a leaf
    if len(set(df[target])) == 1:
        return df[target].iloc[0]
    # Find the best split point
    best_feature, best_value = find_best_split(df, target)
    # If no split reduces the impurity (e.g. all feature values are identical),
    # return the most frequent class as a leaf
    if best_feature is None:
        return df[target].mode()[0]
    # Build the two subtrees
    left = df[df[best_feature] < best_value]
    right = df[df[best_feature] >= best_value]
    tree = {best_feature: {}}
    tree[best_feature]['<' + str(best_value)] = build_tree(left, target)
    tree[best_feature]['>=' + str(best_value)] = build_tree(right, target)
    return tree
```
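The build functions above return the tree as a nested dict: each inner node maps a feature name to two branches keyed `'<value'` and `'>=value'`, and each leaf is a plain label. The original article does not include a prediction step, so here is a minimal traversal helper along those lines (the name `predict_one` is not from the original); it is used after training in step 4 below:
```python
def predict_one(tree, sample):
    """Walk the nested-dict tree for a single sample (a pandas Series or dict)."""
    while isinstance(tree, dict):
        feature = next(iter(tree))              # feature this node splits on
        branches = tree[feature]
        lt_key = next(k for k in branches if k.startswith('<'))
        ge_key = next(k for k in branches if k.startswith('>='))
        threshold = float(lt_key[1:])           # recover the threshold from the branch key
        tree = branches[lt_key] if sample[feature] < threshold else branches[ge_key]
    return tree                                 # leaf: a class label (or a mean value)
```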
3. Define the function for computing the Gini index
```python
def calc_gini(y):
    """Gini impurity of a label Series: 1 - sum of squared class proportions."""
    n = len(y)
    counts = y.value_counts()
    gini = 1
    for count in counts:
        p = count / n
        gini -= p ** 2
    return gini
```
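As a quick check of `calc_gini`: a perfectly balanced binary label set has impurity 1 - (0.5² + 0.5²) = 0.5, and a pure node has impurity 0:
```python
print(calc_gini(pd.Series([0, 0, 1, 1])))   # 0.5
print(calc_gini(pd.Series([1, 1, 1, 1])))   # 0.0
```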
4. Train the CART classification tree
```python
# Merge the features and the label into a single DataFrame
df = pd.concat([X, y], axis=1)
# Train the CART classification tree
tree = build_tree(df, 'target')
```
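With the tree trained, the `predict_one` helper sketched after step 2 can be used to check how well it reproduces the training labels; an unpruned tree on its own training data is expected to fit almost perfectly:
```python
# Predict every training sample by walking the tree
preds = [predict_one(tree, row) for _, row in X.iterrows()]
train_acc = (pd.Series(preds).values == y['target'].values).mean()
print(f"Training accuracy: {train_acc:.3f}")
```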
5. Define the CART regression-tree functions
```python
def find_best_split_regression(df, target):
    """Find the best (feature, threshold) split by weighted MSE (regression tree)."""
    # Only accept a split if it reduces the MSE of the current node
    best_mse = calc_mse(df[target])
    best_feature = None
    best_value = None
    # Try every feature (except the target column) and every observed value as a threshold
    for feature in df.columns:
        if feature == target:
            continue
        for value in set(df[feature]):
            left = df[df[feature] < value]
            right = df[df[feature] >= value]
            if len(left) == 0 or len(right) == 0:
                continue
            # Weighted MSE of the two child nodes
            cur_mse = ((len(left) / len(df)) * calc_mse(left[target])
                       + (len(right) / len(df)) * calc_mse(right[target]))
            if cur_mse < best_mse:
                best_mse = cur_mse
                best_feature = feature
                best_value = value
    return best_feature, best_value


def build_tree_regression(df, target):
    """Recursively build a CART regression tree as a nested dict."""
    # If all samples share the same target value, return it as a leaf
    if len(set(df[target])) == 1:
        return df[target].iloc[0]
    # Find the best split point
    best_feature, best_value = find_best_split_regression(df, target)
    # If no split reduces the MSE, return the mean target value as a leaf
    if best_feature is None:
        return df[target].mean()
    # Build the two subtrees
    left = df[df[best_feature] < best_value]
    right = df[df[best_feature] >= best_value]
    tree = {best_feature: {}}
    tree[best_feature]['<' + str(best_value)] = build_tree_regression(left, target)
    tree[best_feature]['>=' + str(best_value)] = build_tree_regression(right, target)
    return tree
```
6. Define the function for computing the mean squared error
```python
def calc_mse(y):
    """Mean squared error of a target Series around its own mean (node variance)."""
    n = len(y)
    mse = ((y - y.mean()) ** 2).sum() / n
    return mse
```
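A quick check of `calc_mse`: for the values [1, 2, 3] the mean is 2, so the MSE around it is (1 + 0 + 1) / 3 ≈ 0.667:
```python
print(calc_mse(pd.Series([1, 2, 3])))   # 0.6666...
```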
7. Train the CART regression tree
```python
# Note: the original example used the Boston housing dataset via load_boston,
# which was removed in scikit-learn 1.2; the California housing dataset is used instead
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Convert the dataset into a DataFrame
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.DataFrame(housing.target, columns=['target'])
# Train the CART regression tree on a subsample of 500 rows (about the size of the
# original Boston dataset) to keep this pure-Python implementation tractable
df = pd.concat([X, y], axis=1).sample(n=500, random_state=0)
tree = build_tree_regression(df, 'target')
```
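Because the regression tree uses the same nested-dict layout, the `predict_one` helper sketched after step 2 applies unchanged; a training-set MSE check on the same subsample used for training here might look like this:
```python
# Evaluate the regression tree on its own training subsample
X_train = df.drop(columns=['target'])
preds = pd.Series([predict_one(tree, row) for _, row in X_train.iterrows()],
                  index=X_train.index)
print("Training MSE:", ((preds - df['target']) ** 2).mean())
```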