随机森林的代码,不要调sklearn包
时间: 2023-07-03 07:03:33 浏览: 92
好的,以下是一个简单的随机森林实现的代码,不使用sklearn包:
```python
import numpy as np
import pandas as pd
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Every tree is trained on its own bootstrap sample: indices are drawn with
    replacement and the sample has the same size as the training set.

    Args:
        n_estimators: Number of trees to train.
        max_depth: Depth limit forwarded to each ``DecisionTree``.
        min_samples_split: Minimum node size forwarded to each ``DecisionTree``.
    """

    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on a bootstrap sample of (X, y).

        Args:
            X: Feature matrix, one row per sample (anything ``np.asarray``
                accepts, including a pandas DataFrame).
            y: Class labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        # Convert once so positional fancy indexing below is well defined
        # for lists and pandas objects as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        if n_samples != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if n_samples == 0:
            raise ValueError("cannot fit a forest on an empty dataset")

        # Start from scratch so that refitting does not stack new trees on
        # top of the ones left over from a previous call.
        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
            )
            sample_indices = np.random.choice(n_samples, n_samples, replace=True)
            tree.fit(X[sample_indices], y[sample_indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote class label for every row of ``X``.

        Ties are resolved in favour of the smallest label, because
        ``np.unique`` returns labels sorted and ``np.argmax`` picks the first
        maximum.

        Raises:
            RuntimeError: If the forest holds no trees (``fit`` not called).
        """
        if not self.trees:
            raise RuntimeError("RandomForest.predict called before fit")
        # Shape (n_trees, n_samples): one row of votes per tree.
        votes = np.array([tree.predict(X) for tree in self.trees])
        return np.array(
            [self._majority_vote(votes[:, col]) for col in range(votes.shape[1])]
        )

    @staticmethod
    def _majority_vote(votes):
        """Return the most frequent value in the 1-D array ``votes``."""
        labels, counts = np.unique(votes, return_counts=True)
        return labels[np.argmax(counts)]
class DecisionTree:
    """Classification tree grown by maximising entropy-based information gain.

    At every internal node a random subset of ``int(sqrt(n_features))``
    features is examined, which is what makes the tree usable as a member of
    a random forest.

    Args:
        max_depth: Nodes at this depth become leaves; ``None`` never matches
            a depth, so the tree is then limited only by the other criteria.
        min_samples_split: Nodes with fewer samples than this become leaves.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        # Root ``Node`` of the fitted tree; ``None`` until ``fit`` is called.
        self.tree = None

    def fit(self, X, y):
        """Grow the tree on feature matrix ``X`` and label vector ``y``.

        Raises:
            ValueError: If ``X`` is not 2-D, the lengths differ, or the data
                are empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        self.tree = self.build_tree(X, y, depth=0)

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises:
            RuntimeError: If the tree has not been fitted.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict called before fit")
        return np.array([self.tree.predict(row) for row in np.asarray(X)])

    def build_tree(self, X, y, depth):
        """Recursively build and return the subtree for the samples (X, y)."""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))
        if (
            depth == self.max_depth
            or n_samples < self.min_samples_split
            or n_labels == 1
        ):
            return Node(value=self.get_leaf_value(y))

        feature_indices = np.random.choice(
            n_features, int(np.sqrt(n_features)), replace=False
        )
        best_feature, best_threshold = self.get_best_split(X, y, feature_indices)
        if best_feature is None:
            # Every candidate feature is constant here: nothing to split on.
            return Node(value=self.get_leaf_value(y))

        left_indices, right_indices = self.split(X[:, best_feature], best_threshold)
        if len(left_indices) == 0 or len(right_indices) == 0:
            # Defensive: an empty child would have no labels to vote with.
            return Node(value=self.get_leaf_value(y))

        left_subtree = self.build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self.build_tree(X[right_indices], y[right_indices], depth + 1)
        return Node(best_feature, best_threshold, left_subtree, right_subtree)

    def get_best_split(self, X, y, feature_indices):
        """Return ``(feature_index, threshold)`` with the highest information gain.

        Returns ``(None, None)`` when none of the given features has at least
        two distinct values. Ties keep the first candidate encountered.
        """
        best_gain = -np.inf
        split_index, split_threshold = None, None
        for feature_index in feature_indices:
            X_feature = X[:, feature_index]
            # The largest value is not a candidate: ``<= max`` would send
            # every sample to the left and leave the right child empty.
            thresholds = np.unique(X_feature)[:-1]
            for threshold in thresholds:
                gain = self.get_information_gain(y, X_feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_threshold = threshold
        return split_index, split_threshold

    def get_information_gain(self, y, X_feature, threshold):
        """Return the entropy reduction from splitting ``y`` at ``threshold``.

        A split that leaves one side empty yields a gain of 0.
        """
        parent_entropy = self.get_entropy(y)
        left_indices, right_indices = self.split(X_feature, threshold)
        n_left, n_right = len(left_indices), len(right_indices)
        if n_left == 0 or n_right == 0:
            return 0
        left_entropy = self.get_entropy(y[left_indices])
        right_entropy = self.get_entropy(y[right_indices])
        n_total = len(y)
        child_entropy = (
            (n_left / n_total) * left_entropy + (n_right / n_total) * right_entropy
        )
        return parent_entropy - child_entropy

    def get_entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def split(self, X_feature, threshold):
        """Return index arrays for ``X_feature <= threshold`` and ``> threshold``."""
        left_indices = np.flatnonzero(X_feature <= threshold)
        right_indices = np.flatnonzero(X_feature > threshold)
        return left_indices, right_indices

    def get_leaf_value(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        # Index back into ``labels``: ``argmax`` alone is only a position in
        # the sorted unique labels, not the label itself.
        return labels[np.argmax(counts)]
class Node:
    """A single node of a binary decision tree.

    An internal node stores the feature index and threshold it tests plus its
    two children; a leaf stores only ``value``. A node counts as a leaf
    whenever ``value`` is not ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        # Leaf payload first, then the split description, then the children.
        self.value = value
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right

    def is_leaf_node(self):
        """Return True when this node carries a prediction value."""
        return not (self.value is None)

    def predict(self, x):
        """Return the leaf value reached by routing sample ``x`` down the tree.

        Samples whose tested feature is ``<= threshold`` go to the left
        child, all others to the right child.
        """
        if not self.is_leaf_node():
            goes_left = x[self.feature_index] <= self.threshold
            branch = self.left if goes_left else self.right
            return branch.predict(x)
        return self.value
```
注意,这个实现只包含随机森林(分类)的基本功能:对训练集做有放回的自助采样(bootstrap)、在每个节点随机选取 √n 个特征、并按信息增益(熵)选择划分;它并没有包括sklearn中的其他功能,例如特征重要性评估、超参数调整等。
阅读全文