改进代码,对'dti', 'installment', 'employmentTitle', 'interestRate', 'annualIncome', 'revolBal', 'revolUtil', 'Creditage', 'loanAmnt','avg_income','avg_loanAmnt','mean_interestRate','rest_money','ficoRange_mean','rest_Revol'进行分箱
时间: 2023-06-20 13:10:34 浏览: 92
以下是对这些特征进行分箱的代码:
```python
import pandas as pd
import numpy as np
from scipy import stats
def binning(col, cut_points, labels=None):
    """Bin a numeric column at the given interior cut points.

    The column is split into ``len(cut_points) + 1`` right-closed bins:
    everything up to the first cut point, one bin between each pair of
    consecutive cut points, and everything above the last cut point.

    Args:
        col: Numeric pandas Series (or other array-like accepted by
            ``pd.cut``) to discretise.
        cut_points: Strictly increasing, finite interior cut points. Any
            sequence type is accepted (list, tuple, ndarray).
        labels: One label per bin. When omitted (``None``, ``False`` or an
            empty sequence) the bins are labelled ``0 .. len(cut_points)``.

    Returns:
        The categorical result of ``pd.cut`` with the labels as categories.

    Raises:
        ValueError: Propagated from ``pd.cut`` when ``cut_points`` are not
            strictly increasing or the number of labels does not match.
    """
    # Use open-ended outer edges instead of the column's own min/max. When
    # the cut points lie strictly inside the data range this assigns every
    # value to the same bin as before, but it no longer produces
    # non-monotonic or duplicate edges (and a ValueError) when a cut point
    # sits at or beyond the observed minimum/maximum.
    break_points = [float("-inf")] + list(cut_points) + [float("inf")]

    # Explicit check rather than `not labels`, which raises on array-like
    # labels; None / False / empty all still mean "use the defaults".
    if labels is None or labels is False or len(labels) == 0:
        labels = list(range(len(break_points) - 1))

    return pd.cut(col, bins=break_points, labels=labels, include_lowest=True)
def apply_binning(data):
    """Build a DataFrame of binned loan features from ``data``.

    Args:
        data: DataFrame containing the numeric columns 'dti', 'installment',
            'interestRate', 'annualIncome', 'revolBal', 'revolUtil',
            'Creditage', 'loanAmnt', 'ficoRangeHigh' and 'ficoRangeLow'.

    Returns:
        A new DataFrame sharing ``data``'s index with:
          * one categorical column per raw feature, labelled 0..n-1;
          * 'avg_income', 'avg_loanAmnt', 'mean_interestRate': coarser
            binnings of income, loan amount and interest rate, labelled
            from 1;
          * 'rest_money', 'rest_Revol': binned differences
            (income - loan, revolving balance - loan), labelled from 1;
          * 'ficoRange_mean': midpoint of the FICO range (not binned).
        Values falling outside a feature's edges become NaN.

    Raises:
        KeyError: If a required column is missing from ``data``.
    """
    # Complete bin edges per feature (lowest edge .. highest edge). They are
    # passed to pd.cut as-is: wrapping them in the column's min/max as
    # extra outer edges yields non-increasing bins (inf followed by max)
    # and pd.cut rejects them.
    bin_edges = {
        'dti': [-1, 10, 20, 30, 40, 50, np.inf],
        'installment': [-1, 200, 400, 600, 800, 1000, 1200, 1400, np.inf],
        'interestRate': [-1, 5, 10, 15, 20, 25, 30, np.inf],
        'annualIncome': [-1, 20000, 40000, 60000, 80000, 100000, 120000,
                         140000, 160000, 180000, 200000, np.inf],
        'revolBal': [-1, 5000, 10000, 15000, 20000, 25000, 30000, 35000,
                     40000, 45000, 50000, np.inf],
        'revolUtil': [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf],
        'Creditage': [-1, 10, 20, 30, 40, 50, 60, 70, 80, np.inf],
        'loanAmnt': [-1, 5000, 10000, 15000, 20000, 25000, 30000, 35000,
                     40000, 45000, 50000, np.inf],
    }

    # New frame so the caller's data is left untouched.
    binned_data = pd.DataFrame(index=data.index)

    for feature, edges in bin_edges.items():
        binned_data[feature] = pd.cut(
            data[feature],
            bins=edges,
            labels=list(range(len(edges) - 1)),
            include_lowest=True,
        )

    # Coarser binnings of income, loan amount and interest rate.
    binned_data['avg_income'] = pd.cut(
        data['annualIncome'],
        bins=[-1, 40000, 80000, 120000, 160000, np.inf],
        labels=[1, 2, 3, 4, 5],
        include_lowest=True,
    )
    binned_data['avg_loanAmnt'] = pd.cut(
        data['loanAmnt'],
        bins=[-1, 10000, 20000, 30000, 40000, 50000, np.inf],
        labels=[1, 2, 3, 4, 5, 6],
        include_lowest=True,
    )
    # NOTE(review): the top edge is 30, so rates above 30 become NaN here
    # even though 'interestRate' above has an open-ended top bin - confirm
    # this is intended.
    binned_data['mean_interestRate'] = pd.cut(
        data['interestRate'],
        bins=[-1, 10, 15, 20, 25, 30],
        labels=[1, 2, 3, 4, 5],
        include_lowest=True,
    )

    # Money left after the loan. The difference can be negative, so the
    # lowest bin is open-ended; with a lower edge of -1 every such row
    # would silently turn into NaN.
    binned_data['rest_money'] = pd.cut(
        data['annualIncome'] - data['loanAmnt'],
        bins=[-np.inf, 0, 10000, 20000, 30000, 40000, 50000, np.inf],
        labels=[1, 2, 3, 4, 5, 6, 7],
        include_lowest=True,
    )

    # Midpoint of the FICO range. (High - Low would give the width of the
    # range, not its mean.) Left continuous: no cut points are defined.
    binned_data['ficoRange_mean'] = (
        data['ficoRangeHigh'] + data['ficoRangeLow']
    ) / 2

    # Revolving balance minus the loan amount; open-ended lowest bin for
    # the same reason as 'rest_money'.
    binned_data['rest_Revol'] = pd.cut(
        data['revolBal'] - data['loanAmnt'],
        bins=[-np.inf, 0, 5000, 10000, 15000, 20000, 25000, 30000, 35000,
              40000, 45000, 50000, np.inf],
        labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        include_lowest=True,
    )

    return binned_data
```
阅读全文