import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from decision_tree_classifier import DecisionTreeClassifier
from random_forest_classifier import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the adult-income data set (GBK-encoded CSV).
df = pd.read_csv('adult.csv', encoding='gbk')
df.head()
col_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
             'marital-status', 'occupation', 'relationship', 'race', 'gender',
             'capital-gain', 'capital-loss', 'hours-per-week',
             'native-country', 'income']
df.columns = col_names
# Columns that need one-hot encoding.
categorical = ['workclass', 'education', 'marital-status', 'occupation',
               'relationship', 'race', 'gender', 'native-country', 'income']

# Missing-value handling: the data uses '?' as a placeholder.
# Fixes: np.NaN was removed in NumPy 2.0 (use np.nan), and chained
# Series.replace(..., inplace=True) is deprecated in pandas — assign back instead.
for col in ('occupation', 'workclass', 'native-country'):
    df[col] = df[col].replace('?', np.nan)
df.isnull().sum()
df['income'].value_counts()
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # CJK-capable plot font
# Fill the remaining gaps with each column's mode (most frequent value).
for col in ('workclass', 'occupation', 'native-country'):
    df[col] = df[col].fillna(df[col].mode()[0])

# One-hot encode the categorical columns; drop_first avoids collinear dummies.
df = pd.get_dummies(df, columns=categorical, drop_first=True)
print(df.head())

# Target and feature matrix. To change the feature vector, edit this column list.
y = np.array(df.loc[:, 'income_>50K'])
X = np.array(df.loc[:, ['age', 'educational-num', 'hours-per-week']])
y = y.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# NOTE: this import shadows the custom RandomForestClassifier imported above;
# the scikit-learn implementation is the one actually used below.
from sklearn.ensemble import RandomForestClassifier
rtree = RandomForestClassifier(n_estimators=100, max_depth=5, max_features=0.2,
                               max_samples=50, random_state=1234)
# ravel() passes a 1-D target, avoiding sklearn's DataConversionWarning.
rtree.fit(X_train, y_train.ravel())
y_pred = rtree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("accuracy={}".format(accuracy))
我这个代码如何更换特征向量
时间: 2023-08-04 07:08:11 浏览: 57
如果想更换特征向量,只需要修改以下代码段:
```
X = np.array(df.loc[:,['age', 'educational-num', 'hours-per-week']])
y = np.array(y)
```
将 `['age', 'educational-num', 'hours-per-week']` 替换为你想要的特征向量即可。注意要保证特征向量的格式正确,即为一个列表形式。另外,如果更换特征向量后发现模型表现不佳,可以再考虑增加或减少特征,或者对特征进行预处理等操作。
相关问题
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fetch the Iris data set directly from the UCI repository.
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
headernames = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv(path, names=headernames)
dataset.head()

# Features are the four measurements; the label is the 'Class' column.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# 60/40 train/test split, then a 6-nearest-neighbour classifier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
classifier = KNeighborsClassifier(n_neighbors=6)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluate: confusion matrix, per-class report, and overall accuracy.
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print(result1)
result2 = accuracy_score(y_test, y_pred)
print("Accuracy:", result2)
这是一个基于鸢尾花数据集的KNN分类器的Python代码,其目的是将数据集分为三个不同的类别。代码首先导入了必要的库,然后读取鸢尾花数据集并将其存储在一个Pandas DataFrame中。接下来,将数据集分成训练集和测试集,并使用KNN分类器对训练集进行训练。最后,使用测试集对训练好的分类器进行测试,并计算分类器的准确性。输出包括混淆矩阵、分类报告和准确性的评估。
学生成绩预测模型_逻辑回归实战练习——根据学生成绩预测是否被录取,使用sigmoid函数
本实战练习的目的是通过逻辑回归模型预测一个学生是否被大学录取,使用的数据集是成绩单和录取情况。使用sigmoid函数将预测值转换为0到1之间的概率值,概率越高表示被录取的可能性越大。
步骤如下:
1. 导入所需的库
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
```
2. 导入数据集
```python
# Load the admissions data set (exam scores plus admission outcome).
data = pd.read_csv('admission.csv')
data.head()  # quick sanity check of the first rows
```
3. 数据预处理
将数据集分为特征和目标变量,将目标变量“Admitted”转换为0和1。
```python
# Features are every column but the last; the last column is the target.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
# Encode the label as 1 for 'Admitted' and 0 otherwise.
y = (y == 'Admitted').astype(int)
```
将数据集拆分成训练集和测试集。
```python
# Hold out 25% of the rows for testing; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
```
特征缩放:
```python
# Standardise features: fit mean/std on the training set only, then apply
# the same transform to the test set (prevents train/test leakage).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
```
4. 训练逻辑回归模型
```python
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression model on the scaled training data
# (fit() returns the estimator itself, so the call can be chained).
classifier = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)
```
5. 预测测试集结果
```python
y_pred = classifier.predict(X_test)  # 0/1 class predictions for the held-out set
```
6. 评估模型精度
```python
from sklearn.metrics import accuracy_score, confusion_matrix
# Fix: the original evaluated both metrics as bare expressions, so in a plain
# script (and for all but the last cell expression in a notebook) the results
# were silently discarded. Print both explicitly.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
```
7. 绘制决策边界
```python
from matplotlib.colors import ListedColormap
# Fix: the loop-body and continuation indentation was lost in transcription
# (the plt.scatter lines sat at column 0 under the for, an IndentationError).
# Plot the decision boundary in the ORIGINAL feature space: undo the scaling
# for the axes, but re-apply it before querying the model.
X_set, y_set = sc.inverse_transform(X_train), y_train
# Dense grid covering the training range (with a margin) on both axes.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=0.1),
    np.arange(start=X_set[:, 1].min() - 100, stop=X_set[:, 1].max() + 100, step=0.1))
# Colour each grid cell by the predicted class (red = 0, green = 1).
plt.contourf(X1, X2,
             classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# Overlay the training points, coloured by their true label.
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend()
plt.show()
```
8. 生成一个新的输入数据,并预测它的录取概率
```python
# Score four new applicants; they must be scaled with the SAME fitted scaler.
samples = [[45, 85], [30, 45], [85, 90], [70, 70]]
new_data = sc.transform(samples)
new_pred = classifier.predict_proba(new_data)  # columns: P(rejected), P(admitted)
print(new_pred)
```
输出为:
```
[[0.75930324 0.24069676]
[0.9770395 0.0229605 ]
[0.01242469 0.98757531]
[0.16356001 0.83643999]]
```
以上输出表示:对于每组新的输入数据,第一个数字表示未录取的概率,第二个数字表示录取的概率。