1. 利用随机森林求出Titanic数据集的训练集及测试集预测准确率。 2. 跑出影响因素的重要性排序图。 3. 跑出随机森林收敛图。 4. 将代码列出。
时间: 2024-03-01 09:53:42 浏览: 35
1. 利用随机森林求出Titanic数据集的训练集及测试集预测准确率的代码如下:
```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Load the Titanic passenger table from the current working directory.
data = pd.read_csv('titanic.csv')

# Select the model inputs and the label.
# The explicit .copy() makes `features` an independent DataFrame, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
features = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked']].copy()
target = data['Survived']

# Feature preparation: fill missing numeric values with the column mean and
# encode the two categorical columns as integers.
# NOTE: the means are computed over the whole table, i.e. before the
# train/test split below.
features['Age'] = features['Age'].fillna(features['Age'].mean())
features['Fare'] = features['Fare'].fillna(features['Fare'].mean())
features['Sex'] = features['Sex'].map({'male': 0, 'female': 1})
# Missing ports are filled with 'S' before the integer encoding.
features['Embarked'] = features['Embarked'].fillna('S')
features['Embarked'] = features['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit a random forest of 100 trees, each limited to depth 5.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(x_train, y_train)

# Predict on both splits.
y_train_pred = rf.predict(x_train)
y_test_pred = rf.predict(x_test)

# Report the accuracy on the training and test sets.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
print("训练集准确率:", train_acc)
print("测试集准确率:", test_acc)
```
2. 跑出影响因素的重要性排序图的代码如下:
```python
import matplotlib.pyplot as plt
import numpy as np  # np.argsort is used below to rank the importances

# Importances of the fitted forest `rf`, plus the column order that sorts
# them from most to least important.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Bar chart of the ranked importances, one bar per feature column,
# labelled with the column names.
plt.figure()
plt.title("Feature importances")
plt.bar(range(features.shape[1]), importances[indices], color="r", align="center")
plt.xticks(range(features.shape[1]), features.columns[indices], rotation=90)
plt.xlim([-1, features.shape[1]])
plt.show()
```
3. 跑出随机森林收敛图(训练集和测试集准确率随树的数量 n_estimators 的变化)的代码如下:
```python
import numpy as np
# Convergence curve: train/test accuracy as a function of the number of trees.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_accs = []
test_accs = []
for n_estimator in n_estimators:
    # Refit a forest of the given size with the same depth limit and seed.
    # oob_score is not requested: the out-of-bag estimate is never read here,
    # and for very small forests scikit-learn warns that some samples have
    # no OOB prediction.
    rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=5, random_state=42)
    rf.fit(x_train, y_train)
    # For a classifier, score() returns the mean accuracy on the given data.
    train_accs.append(rf.score(x_train, y_train))
    test_accs.append(rf.score(x_test, y_test))

# Plot both accuracy curves against the forest size.
plt.figure()
plt.plot(n_estimators, train_accs, '-o', label='train')
plt.plot(n_estimators, test_accs, '-o', label='test')
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.legend()
plt.show()
```
4. 完整代码如下:
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Load the Titanic passenger table from the current working directory.
data = pd.read_csv('titanic.csv')

# Select the model inputs and the label.
# The explicit .copy() makes `features` an independent DataFrame, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
features = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked']].copy()
target = data['Survived']

# Feature preparation: fill missing numeric values with the column mean and
# encode the two categorical columns as integers.
# NOTE: the means are computed over the whole table, i.e. before the
# train/test split below.
features['Age'] = features['Age'].fillna(features['Age'].mean())
features['Fare'] = features['Fare'].fillna(features['Fare'].mean())
features['Sex'] = features['Sex'].map({'male': 0, 'female': 1})
# Missing ports are filled with 'S' before the integer encoding.
features['Embarked'] = features['Embarked'].fillna('S')
features['Embarked'] = features['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit a random forest of 100 trees, each limited to depth 5.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(x_train, y_train)

# Predict on both splits.
y_train_pred = rf.predict(x_train)
y_test_pred = rf.predict(x_test)

# Report the accuracy on the training and test sets.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
print("训练集准确率:", train_acc)
print("测试集准确率:", test_acc)
# Rank the features of the fitted forest from most to least important.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
n_features = features.shape[1]
bar_positions = range(n_features)

# Draw the ranking as a red bar chart: one bar per feature column, with the
# column names as rotated tick labels.
plt.figure()
plt.bar(bar_positions, importances[indices], color="r", align="center")
plt.xticks(bar_positions, features.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.title("Feature importances")
plt.show()
# Convergence curve: train/test accuracy as a function of the number of trees.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_accs = []
test_accs = []
for n_estimator in n_estimators:
    # Refit a forest of the given size with the same depth limit and seed.
    # oob_score is not requested: the out-of-bag estimate is never read here,
    # and for very small forests scikit-learn warns that some samples have
    # no OOB prediction.
    rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=5, random_state=42)
    rf.fit(x_train, y_train)
    # For a classifier, score() returns the mean accuracy on the given data.
    train_accs.append(rf.score(x_train, y_train))
    test_accs.append(rf.score(x_test, y_test))

# Plot both accuracy curves against the forest size.
plt.figure()
plt.plot(n_estimators, train_accs, '-o', label='train')
plt.plot(n_estimators, test_accs, '-o', label='test')
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.legend()
plt.show()
```
相关推荐
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)