#根据编程要求,补充下面Begin-End区间的代码
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz # 导入决策树模型
from sklearn.model_selection import train_test_split # 导入数据集划分模块
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
# 数据的读入与处理
data_path ='/data/bigfiles/7db918ff-d514-49ea-8f6b-ea968df742e9'
df = pd.read_csv(data_path,header=None,names=['age', 'workclass', 'fnlwgt', 'education', 'education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary'])
######Begin ######
# 去除字符串数值前面的空格
# 去除fnlwgt, capital-gain, capital-loss,特征属性
# 将特征采用哑变量进行编码,字符型特征经过转化可以进行训练
# 将label编码
###### End ######
######Begin ######
# 按4:1的比例划分训练和测试集
# 构建模型
#对训练集X_train训练
#对于测试集x_test进行预测
# 其他指标计算
print(classification_report(y_test, x_pre_test))
# 预测测试集概率值
#计算验证集的auc值,参数为预测值和概率估计
###### End ######
print("auc的值:{}".format(auc))
时间: 2024-01-17 20:05:20 浏览: 104
######Begin ######
# Strip surrounding whitespace from every string (object-dtype) column so
# that later comparisons such as == '>50K' match the stored values.
df_obj = df.select_dtypes(['object'])
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
# Drop the fnlwgt, capital-gain and capital-loss features.
df = df.drop(['fnlwgt', 'capital-gain', 'capital-loss'], axis=1)
# One-hot (dummy) encode the string-valued FEATURE columns only.
# Calling pd.get_dummies(df) without `columns` would also expand the string
# label 'salary' into indicator columns and remove df['salary'], so the
# label-encoding step below would raise KeyError: 'salary'.
categorical_features = [
    col for col in df.select_dtypes(['object']).columns if col != 'salary'
]
df = pd.get_dummies(df, columns=categorical_features)
# Encode the label: 1 when salary is '>50K', 0 for any other value.
df['salary'] = df['salary'].apply(lambda x: 1 if x == '>50K' else 0)
###### End ######
######Begin ######
# Split into training and test sets at a 4:1 ratio (20% held out for testing).
y = df['salary']
X = df.drop(columns=['salary'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Build the decision-tree model and train it on the training set X_train
# (fit() returns the fitted estimator, so construction and training chain).
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
# Predict class labels for the test set.
x_pre_test = dt.predict(X_test)
# Report the other metrics computed by classification_report.
print(classification_report(y_true=y_test, y_pred=x_pre_test))
# Predict class probabilities for the test set; column index 1 is used as
# the score passed to the AUC computation.
test_proba = dt.predict_proba(X_test)
y_score = test_proba[:, 1]
# Compute the AUC on the test set from the true labels and the scores.
auc = roc_auc_score(y_true=y_test, y_score=y_score)
###### End ######
print("auc的值:{}".format(auc))
阅读全文