def median_target(var):
    """Return the median of column ``var`` for each ``Outcome`` group.

    Rows where ``var`` is missing are ignored. Reads the module-level
    ``data`` DataFrame.
    """
    temp = data[data[var].notnull()]
    temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index()
    return temp


# Fill missing values with a fixed constant per Outcome class.
# Presumably these constants are the per-class medians reported by
# median_target() -- verify against the actual dataset.
data.loc[(data['Outcome'] == 0) & (data['Insulin'].isnull()), 'Insulin'] = 102.5
data.loc[(data['Outcome'] == 1) & (data['Insulin'].isnull()), 'Insulin'] = 169.5

data.loc[(data['Outcome'] == 0) & (data['Glucose'].isnull()), 'Glucose'] = 107
# NOTE(review): a fill value of 1 is far below the value used for Outcome == 0
# (107) and looks like a truncated number -- confirm with
# median_target('Glucose') before relying on it.
data.loc[(data['Outcome'] == 1) & (data['Glucose'].isnull()), 'Glucose'] = 1

data.loc[(data['Outcome'] == 0) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 27
data.loc[(data['Outcome'] == 1) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 32

data.loc[(data['Outcome'] == 0) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 70
data.loc[(data['Outcome'] == 1) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 74.5

data.loc[(data['Outcome'] == 0) & (data['BMI'].isnull()), 'BMI'] = 30.1
data.loc[(data['Outcome'] == 1) & (data['BMI'].isnull()), 'BMI'] = 34.3

target_col = ["Outcome"]

# Columns with fewer than 12 distinct values are treated as categorical.
# The distinct-value counts are computed once and reused below.
n_unique = data.nunique()
cat_cols = n_unique[n_unique < 12].keys().tolist()
# Numerical columns
num_cols = [x for x in data.columns if x not in cat_cols + target_col]
# Binary columns with 2 values
bin_cols = n_unique[n_unique == 2].keys().tolist()
# Columns with more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

# Label encoding binary columns
le = LabelEncoder()
for i in bin_cols:
    data[i] = le.fit_transform(data[i])

# Duplicating columns for multi-value columns (one-hot encoding)
data = pd.get_dummies(data=data, columns=multi_cols)

# Scaling numerical columns
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
# Reuse data's index: the merge below joins on the index, so a fresh
# RangeIndex would misalign rows whenever data is not indexed 0..n-1.
scaled = pd.DataFrame(scaled, columns=num_cols, index=data.index)

# Dropping original values, merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns=num_cols)
data = data.merge(scaled, left_index=True, right_index=True, how="left")

# Define X and y
X = data.drop('Outcome', axis=1)
y = data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=1
)
# One-hot encode the labels
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
时间: 2024-01-14 19:02:33 浏览: 68
这段代码用于数据预处理。首先定义了函数`median_target`,用于计算某个特征在每个`Outcome`类别中的中位数;然后按`Outcome`的取值,用固定的数值填充各特征的缺失值。接着对类别特征进行编码:二元特征使用`LabelEncoder`做标签编码,多取值特征做独热编码。之后对数值特征进行标准化,并将数据按 8:2 拆分为训练集和测试集,最后用`to_categorical`把标签转换为独热形式。
相关问题
def median_target(var, df=None, target='Outcome'):
    """Return the median of column ``var`` within each ``target`` group.

    Rows where ``var`` is missing are excluded before grouping.

    Args:
        var: Name of the column to summarise.
        df: DataFrame to read from. When omitted, the module-level ``data``
            frame is used, which is what the original version always did.
        target: Name of the grouping column.

    Returns:
        A DataFrame with one row per distinct ``target`` value and two
        columns: ``target`` and ``var`` (holding the group median).
    """
    frame = data if df is None else df
    present = frame.loc[frame[var].notna(), [var, target]]
    return present.groupby([target])[[var]].median().reset_index()
这是一个 Python 函数,用于计算指定变量在不同分类情况下的中位数。
函数名为 median_target,接收一个参数 var,表示要计算中位数的变量名。
函数中,首先使用 data[data[var].notnull()] 过滤掉 var 变量为空的行,然后使用 [[var, 'Outcome']] 选择 var 变量和分类变量 Outcome 两列。接着使用 groupby(['Outcome'])[[var]].median().reset_index() 对数据进行分组计算,分组依据为 Outcome 变量,计算的统计量为 var 变量的中位数。最后返回计算结果。
该函数可能是用于数据分析和特征工程中,用于计算不同分类情况下某个变量的中位数,以帮助挖掘变量与分类变量之间的关系。
import numpy as np def rts_smooth(data, window_size, smooth_factor): assert window_size % 2 == 1, "Window size must be odd" assert 0 <= smooth_factor <= 1, "Smooth factor must be between 0 and 1" half_window = (window_size - 1) // 2 data_length = len(data) smoothed_data = np.zeros(data_length) for i in range(half_window, data_length - half_window): window = data[i - half_window:i + half_window + 1] median = np.median(window) deviation = np.abs(window - median) threshold = smooth_factor * np.median(deviation) if np.abs(data[i] - median) > threshold: smoothed_data[i] = median else: smoothed_data[i] = data[i] # 处理首尾值 smoothed_data[:half_window] = data[:half_window] smoothed_data[data_length - half_window:] = data[data_length - half_window:] return smoothed_data 使用示例 data = [10, 15, 20, 12, 18, 22, 25, 16, 14, 23] window_size = 3 smooth_factor = 0.6 smoothed_data = rts_smooth(data, window_size, smooth_factor) print(smoothed_data) 这段代码如果输入数据有很多维度,应该怎么改?
如果输入数据有多个维度,可以使用`np.apply_along_axis`函数来应用平滑函数到每个维度上。下面是修改后的代码示例:
```python
import numpy as np
def rts_smooth(data, window_size, smooth_factor, axis=-1):
    """Replace outlying samples with the local median along one axis.

    For every sample that has a full window around it, the median of the
    window and the median absolute deviation from it are computed. A sample
    further from the window median than ``smooth_factor`` times that
    deviation is replaced by the median; all other samples are kept. The
    first and last ``(window_size - 1) // 2`` samples of each 1-D slice have
    no full window and are copied through unchanged.

    Args:
        data: Array-like of any dimensionality (at least 1-D).
        window_size: Odd, positive number of samples per window.
        smooth_factor: Threshold multiplier in the range [0, 1].
        axis: Axis along which to smooth. The default (last axis) treats each
            row of a 2-D array as one series; pass ``axis=0`` when each
            column is a series.

    Returns:
        A new ``numpy.ndarray`` with the same shape as ``data``. The input is
        not modified.

    Raises:
        AssertionError: If ``window_size`` is not a positive odd number or
            ``smooth_factor`` is outside [0, 1].
    """
    # Kept as assertions so callers see the same exception type as before.
    # The positivity check is needed because -1 % 2 == 1 in Python, so a
    # negative size would otherwise pass the oddness check.
    assert window_size >= 1, "Window size must be positive"
    assert window_size % 2 == 1, "Window size must be odd"
    assert 0 <= smooth_factor <= 1, "Smooth factor must be between 0 and 1"

    half_window = (window_size - 1) // 2
    values = np.asarray(data)

    def smooth_func(x):
        # Work on a private copy of this 1-D slice. Sharing one buffer across
        # slices would mix the results of different series together.
        smoothed = x.copy()
        for i in range(half_window, len(x) - half_window):
            # Windows are always taken from the unsmoothed slice.
            window = x[i - half_window:i + half_window + 1]
            median = np.median(window)
            threshold = smooth_factor * np.median(np.abs(window - median))
            if np.abs(x[i] - median) > threshold:
                smoothed[i] = median
        return smoothed

    return np.apply_along_axis(smooth_func, axis, values)
# Usage example: smooth a 2 x 10 integer array and print the result.
data = np.array(
    [
        [10, 15, 20, 12, 18, 22, 25, 16, 14, 23],
        [5, 8, 10, 12, 9, 7, 15, 20, 18, 17],
    ]
)
window_size, smooth_factor = 3, 0.6
smoothed_data = rts_smooth(
    data,
    window_size=window_size,
    smooth_factor=smooth_factor,
)
print(smoothed_data)
```
这样,`rts_smooth`函数将会对输入数据的每个维度进行平滑处理。输出结果将保持与输入数据相同的维度。