data[cols] = ss.fit_transform(data[cols])
时间: 2023-06-17 20:04:01 浏览: 53
这行代码是用 sklearn 库中的 StandardScaler 类对 data 中的 cols 列进行标准化处理,并将处理后的结果赋值回 data 中的相应列。具体来说,StandardScaler 类会计算出 cols 列的均值和标准差,然后对每个元素进行如下转换:
(x - mean) / std
其中 x 是该元素的原始值,mean 和 std 分别是该列的均值和标准差。这个转换可以将该列的值转化为均值为 0,标准差为 1 的形式,从而方便进行后续的数据处理和分析。
相关问题
将这些代码转换为伪代码 # 确定目标变量和特征变量 target_col = ["Outcome"] cat_cols = data.nunique()[data.nunique() < 12].keys().tolist() cat_cols = [x for x in cat_cols] # numerical columns num_cols = [x for x in data.columns if x not in cat_cols + target_col] # Binary columns with 2 values bin_cols = data.nunique()[data.nunique() == 2].keys().tolist() # Columns more than 2 values multi_cols = [i for i in cat_cols if i not in bin_cols] # Label encoding Binary columns le = LabelEncoder() for i in bin_cols: data[i] = le.fit_transform(data[i]) # Duplicating columns for multi value columns data = pd.get_dummies(data=data, columns=multi_cols) # Scaling Numerical columns std = StandardScaler() scaled = std.fit_transform(data[num_cols]) scaled = pd.DataFrame(scaled, columns=num_cols) # dropping original values merging scaled values for numerical columns df_data_og = data.copy() data = data.drop(columns=num_cols, axis=1) data = data.merge(scaled, left_index=True, right_index=True, how="left") # 输出预处理后的数据集 print(data.head())
以下是转换为伪代码后的代码:
```
# 确定目标变量和特征变量
target_col = ["Outcome"]
cat_cols = 对数据集进行唯一值计数并筛选小于12的特征
cat_cols = [x for x in cat_cols]
# 数值型特征
num_cols = [x for x in 数据集的列名 if x not in cat_cols + target_col]
# 二分类特征
bin_cols = 对数据集进行唯一值计数并筛选等于2的特征
# 多分类特征
multi_cols = [i for i in cat_cols if i not in bin_cols]
# Label encoding 二分类特征
le = LabelEncoder()
for i in bin_cols:
对二分类特征进行标签编码
# 独热编码 多分类特征
data = pd.get_dummies(data=data, columns=multi_cols)
# 标准化 数值型特征
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled, columns=num_cols)
# 合并特征
df_data_og = data.copy()
data = data.drop(columns=num_cols, axis=1)
data = data.merge(scaled, left_index=True, right_index=True, how="left")
# 输出预处理后的数据集
print(data.head())
```
以上伪代码是对原 Python 代码的简化和抽象,将其转化为了一系列的操作和方法调用。
function median_target(var) { temp = data[data[var].notnull()]; temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index(); return temp; } data.loc[(data['Outcome'] == 0) & (data['Insulin'].isnull()), 'Insulin'] = 102.5; data.loc[(data['Outcome'] == 1) & (data['Insulin'].isnull()), 'Insulin'] = 169.5; data.loc[(data['Outcome'] == 0) & (data['Glucose'].isnull()), 'Glucose'] = 107; data.loc[(data['Outcome'] == 1) & (data['Glucose'].isnull()), 'Glucose'] = 1; data.loc[(data['Outcome'] == 0) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 27; data.loc[(data['Outcome'] == 1) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 32; data.loc[(data['Outcome'] == 0) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 70; data.loc[(data['Outcome'] == 1) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 74.5; data.loc[(data['Outcome'] == 0) & (data['BMI'].isnull()), 'BMI'] = 30.1; data.loc[(data['Outcome'] == 1) & (data['BMI'].isnull()), 'BMI'] = 34.3; target_col = ["Outcome"]; cat_cols = data.nunique()[data.nunique() < 12].keys().tolist(); cat_cols = [x for x in cat_cols]; num_cols = [x for x in data.columns if x not in cat_cols + target_col]; bin_cols = data.nunique()[data.nunique() == 2].keys().tolist(); multi_cols = [i for i in cat_cols if i in bin_cols]; le = LabelEncoder(); for i in bin_cols: data[i] = le.fit_transform(data[i]); data = pd.get_dummies(data=data, columns=multi_cols); std = StandardScaler(); scaled = std.fit_transform(data[num_cols]); scaled = pd.DataFrame(scaled, columns=num_cols); df_data_og = data.copy(); data = data.drop(columns=num_cols, axis=1); data = data.merge(scaled, left_index=True, right_index=True, how='left'); X = data.drop('Outcome', axis=1); y = data['Outcome']; X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1); y_train = to_categorical(y_train); y_test = to_categorical(y_test);将这段代码添加注释
# 导入必要的库
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# 定义函数,返回每个特征在不同结果下的中位数
def median_target(var):
temp = data[data[var].notnull()]
temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index()
return temp
# 将缺失值填充为中位数
data.loc[(data['Outcome'] == 0) & (data['Insulin'].isnull()), 'Insulin'] = 102.5
data.loc[(data['Outcome'] == 1) & (data['Insulin'].isnull()), 'Insulin'] = 169.5
data.loc[(data['Outcome'] == 0) & (data['Glucose'].isnull()), 'Glucose'] = 107
data.loc[(data['Outcome'] == 1) & (data['Glucose'].isnull()), 'Glucose'] = 1
data.loc[(data['Outcome'] == 0) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 27
data.loc[(data['Outcome'] == 1) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 32
data.loc[(data['Outcome'] == 0) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 70
data.loc[(data['Outcome'] == 1) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 74.5
data.loc[(data['Outcome'] == 0) & (data['BMI'].isnull()), 'BMI'] = 30.1
data.loc[(data['Outcome'] == 1) & (data['BMI'].isnull()), 'BMI'] = 34.3
# 将数据进行分类处理
target_col = ["Outcome"]
cat_cols = data.nunique()[data.nunique() < 12].keys().tolist()
cat_cols = [x for x in cat_cols]
num_cols = [x for x in data.columns if x not in cat_cols + target_col]
bin_cols = data.nunique()[data.nunique() == 2].keys().tolist()
multi_cols = [i for i in cat_cols if i in bin_cols]
# 对二分类特征进行编码
le = LabelEncoder()
for i in bin_cols:
data[i] = le.fit_transform(data[i])
# 将分类特征进行独热编码
data = pd.get_dummies(data=data, columns=multi_cols)
# 对数值特征进行标准化
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled, columns=num_cols)
# 将数据进行合并
df_data_og = data.copy()
data = data.drop(columns=num_cols, axis=1)
data = data.merge(scaled, left_index=True, right_index=True, how='left')
# 划分训练集和测试集,对标签进行独热编码
X = data.drop('Outcome', axis=1)
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)