自己构建随机森林算法代码
时间: 2023-06-30 18:04:00 浏览: 90
随机森林算法是一种集成学习算法,它通过集成多个决策树的分类结果来提高预测准确率。下面是一个简单的随机森林算法的实现代码示例:
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Every tree is trained on a bootstrap sample of the rows (drawn with
    replacement) restricted to a random subset of the columns (drawn without
    replacement, once per tree). The forest remembers which columns each tree
    saw so that ``predict`` can hand every tree the matching slice of the
    input.

    Randomness comes from the global ``np.random`` state; call
    ``np.random.seed`` beforehand for reproducible forests.

    Args:
        n_trees: Number of trees to train.
        max_depth: Depth limit passed to every tree.
        max_features: Number of columns each tree is trained on. ``None``
            means all columns; values larger than the number of columns are
            clamped.
    """

    def __init__(self, n_trees=100, max_depth=10, max_features=None):
        self.n_trees = n_trees  # number of trees in the forest
        self.max_depth = max_depth  # maximum depth of each tree
        self.max_features = max_features  # columns per tree (None = all)
        self.trees = []
        # Column subset each tree was trained on, parallel to ``self.trees``.
        self.feature_indices = []

    def fit(self, X, y):
        """Train ``n_trees`` trees on bootstrap samples of ``(X, y)``.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.
            y: 1-D array-like of class labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` has no rows, or if no feature column can be
                selected (``max_features < 1`` or ``X`` has no columns).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit a forest on an empty training set")

        # Resolve the per-tree column count up front: passing size=None to
        # np.random.choice would return a scalar instead of an index array.
        if self.max_features is None:
            n_selected = n_features
        else:
            n_selected = min(int(self.max_features), n_features)
        if n_selected < 1:
            raise ValueError(
                "at least one feature column must be selected per tree"
            )

        # Start from scratch so that refitting does not keep stale trees.
        self.trees = []
        self.feature_indices = []
        for _ in range(self.n_trees):
            sample_idx = np.random.choice(n_samples, size=n_samples, replace=True)
            feature_idx = np.random.choice(n_features, size=n_selected, replace=False)
            tree = DecisionTree(max_depth=self.max_depth, max_features=n_selected)
            tree.fit(X[sample_idx][:, feature_idx], y[sample_idx])
            self.trees.append(tree)
            self.feature_indices.append(feature_idx)

    def predict(self, X):
        """Return the majority-vote class label for every row of ``X``.

        Ties are resolved in favour of the smallest label. The result has the
        dtype produced by the trees' own ``predict``.

        Args:
            X: 2-D array-like with the same column layout used in ``fit``.

        Raises:
            ValueError: If the forest has not been fitted.
        """
        if not self.trees:
            raise ValueError("this RandomForest has not been fitted yet")
        X = np.asarray(X)

        # One row of votes per tree; each tree only sees its own columns.
        votes = np.stack([
            tree.predict(X[:, columns])
            for tree, columns in zip(self.trees, self.feature_indices)
        ])
        if votes.shape[1] == 0:
            return np.zeros(0)

        # Count votes per label. Averaging labels (as a regressor would) is
        # wrong for classification: votes of 0 and 2 must not become class 1.
        labels = np.unique(votes)
        counts = np.stack([(votes == label).sum(axis=0) for label in labels])
        return labels[counts.argmax(axis=0)]
class DecisionTree:
    """Binary classification tree grown by minimising weighted Gini impurity.

    Internal nodes are dicts with the keys ``'feature'`` (column index into
    the matrix passed to ``fit``), ``'threshold'``, ``'left'`` and
    ``'right'``; rows with ``x[feature] <= threshold`` go left. Leaves are
    plain class labels.

    Class labels must be non-negative integers because ``np.bincount`` is
    used to count them. Candidate features are drawn from the global
    ``np.random`` state.

    Args:
        max_depth: Maximum depth of the tree; ``None`` grows until every leaf
            is pure or cannot be split.
        max_features: Number of columns sampled as split candidates at each
            node. ``None`` means all columns; larger values are clamped.
    """

    def __init__(self, max_depth=None, max_features=None):
        self.max_depth = max_depth  # maximum depth
        self.max_features = max_features  # candidate columns drawn per node
        self.tree = None
        # Column indices of the training matrix, set by ``fit``.
        self.feature_idx = None

    def fit(self, X, y):
        """Grow the tree on ``X`` (2-D array-like) and integer labels ``y``.

        Raises:
            ValueError: If the training set is empty or ``max_features < 1``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty training set")
        self.feature_idx = np.arange(X.shape[1])
        self.tree = self.build_tree(X, y, depth=0)

    def _n_candidates(self, n_features):
        """Return how many columns to sample as split candidates at a node."""
        if self.max_features is None:
            return n_features
        n_candidates = min(int(self.max_features), n_features)
        if n_candidates < 1:
            raise ValueError("max_features must be at least 1")
        return n_candidates

    def build_tree(self, X, y, depth):
        """Recursively build the subtree for the rows ``(X, y)``.

        Returns:
            A class label (leaf) or a node dict as described on the class.
        """
        n_features = X.shape[1]
        majority = np.bincount(y).argmax()

        # Stop at the depth limit, on a pure node, or when nothing can be split.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_features == 0 or len(np.unique(y)) == 1:
            return majority

        # Draw this node's candidate columns. They are kept in a local so the
        # recursive calls below cannot overwrite them.
        candidates = np.random.choice(
            n_features, size=self._n_candidates(n_features), replace=False
        )
        best_feature, best_threshold = self.get_best_split(X[:, candidates], y)
        if best_feature is None or best_threshold is None:
            return majority

        # ``best_feature`` indexes the candidate subset; map it back to a
        # column of X before partitioning and before storing it in the node.
        column = candidates[best_feature]
        goes_left = X[:, column] <= best_threshold
        return {
            'feature': column,
            'threshold': best_threshold,
            'left': self.build_tree(X[goes_left], y[goes_left], depth + 1),
            'right': self.build_tree(X[~goes_left], y[~goes_left], depth + 1),
        }

    def get_best_split(self, X, y):
        """Find the (column, threshold) pair with the lowest weighted Gini.

        Only thresholds that leave both children non-empty are considered.

        Returns:
            ``(column_index, threshold)`` relative to the columns of ``X``, or
            ``(None, None)`` when no column has two distinct values.
        """
        best_score = np.inf
        best_feature = None
        best_threshold = None
        for i in range(X.shape[1]):
            column = X[:, i]
            # Skip the largest value: using it as threshold would send every
            # row left and leave the right child empty.
            for threshold in np.unique(column)[:-1]:
                score = self.gini_impurity(y, column, threshold)
                if score < best_score:
                    best_score = score
                    best_feature = i
                    best_threshold = threshold
        return best_feature, best_threshold

    def gini_impurity(self, y, feature, threshold):
        """Return the size-weighted Gini impurity of splitting at ``threshold``.

        Rows with ``feature <= threshold`` form the left child, all others the
        right child. An empty child contributes zero.
        """
        n_total = len(y)
        if n_total == 0:
            return 0.0
        goes_left = feature <= threshold
        weighted = 0.0
        for side in (y[goes_left], y[~goes_left]):
            n_side = len(side)
            if n_side > 0:
                proportions = np.bincount(side) / n_side
                weighted += (n_side / n_total) * (1.0 - np.sum(proportions ** 2))
        return weighted

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as a float array.

        Raises:
            ValueError: If the tree has not been fitted.
        """
        if self.tree is None:
            raise ValueError("this DecisionTree has not been fitted yet")
        X = np.asarray(X)
        y_pred = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            node = self.tree
            # Walk down until a leaf (a non-dict label) is reached.
            while isinstance(node, dict):
                if x[node['feature']] <= node['threshold']:
                    node = node['left']
                else:
                    node = node['right']
            y_pred[i] = node
        return y_pred
```
上述代码先实现了一个简单的决策树,再通过对样本的自助采样(bootstrap)和随机特征选择构建出随机森林分类器。你可以按如下方式使用它:
```python
from sklearn.datasets import load_iris

# Load the Iris data set and pull out the feature matrix and the labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and train the random forest.
rf = RandomForest(n_trees=100, max_depth=10, max_features=3)
rf.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = rf.predict(X_test)

# Accuracy is the fraction of predictions that match the true labels.
accuracy = np.mean(y_pred == y_test)
print('Accuracy:', accuracy)
```
阅读全文