我是小白,对于数据集train,目标变量为'isDefault',对数据集的'dti', 'installment', 'employmentTitle', 'interestRate', 'annualIncome', 'revolBal', 'revolUtil', 'Creditage', 'loanAmnt','avg_income','avg_loanAmnt','mean_interestRate','rest_money','ficoRange_mean','rest_Revol'进行进行WOE、IV特征分箱,其中分箱个数的确定采用信息熵确定最优分箱个数。给出完整代码
时间: 2023-06-24 20:05:11 浏览: 128
好的,下面是完整代码:
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
# 读取数据
train = pd.read_csv('train.csv')
# 定义WOE和IV函数
def get_woe_iv(df, feature, target):
"""
df: 数据集
feature: 特征列名
target: 目标变量列名
"""
total = df.groupby(feature)[target].count()
bad = df.groupby(feature)[target].sum()
good = total - bad
woe = np.log((good / good.sum()) / (bad / bad.sum()))
iv = ((good / good.sum()) - (bad / bad.sum())) * woe
return woe, iv.sum()
# 定义分箱函数
def mono_bin(Y, X, n):
r = 0
while np.abs(r) < 1:
# 将X分成n个等分点
d1 = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.qcut(X, n)})
# 按照分组计算每组的坏样本率和好样本率
d2 = d1.groupby('Bucket', as_index=True)
r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
n = n - 1
# 对分组进行编号
d3 = pd.DataFrame(d2.X.min(), columns=['min'])
d3['min'] = d2.min().X
d3['max'] = d2.max().X
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
d3['rate'] = d2.mean().Y
d3['woe'], iv = get_woe_iv(d3, 'Bucket', 'sum')
d4 = (d3.sort_values(by='min')).reset_index(drop=True)
print("=" * 60)
print(d4)
return d4
# 分箱并计算woe和iv值
def feature_woe_iv(df, feature, target, n):
X = df[feature]
Y = df[target]
d1 = mono_bin(Y, X, n)
# 将分组映射到原来的特征上
df['Bucket'] = pd.qcut(X, n, duplicates='drop')
df['Bucket'] = df['Bucket'].apply(lambda x: str(x.left) + ',' + str(x.right))
WOE_IV = df.groupby('Bucket').apply(lambda x: get_woe_iv(x, 'Bucket', target))
WOE_IV.columns = ['WOE', 'IV']
WOE_IV = WOE_IV.reset_index()
WOE_IV = WOE_IV.rename(columns={'Bucket': feature})
return WOE_IV
# 对需要分箱的特征进行分箱
features = ['dti', 'installment', 'employmentTitle', 'interestRate', 'annualIncome', 'revolBal', 'revolUtil', 'Creditage', 'loanAmnt','avg_income','avg_loanAmnt','mean_interestRate','rest_money','ficoRange_mean','rest_Revol']
target = 'isDefault'
n = 5 # 分箱个数
WOE_IV_df = pd.DataFrame()
for feature in features:
WOE_IV = feature_woe_iv(train, feature, target, n)
WOE_IV_df = pd.concat([WOE_IV_df, WOE_IV], axis=0)
# 输出所有特征的WOE和IV值
print(WOE_IV_df)
```
需要注意的是,对于特征的分箱个数,可以根据实际情况进行调整。可能需要多次尝试才能找到最优的分箱个数。
阅读全文