# Names of the numeric feature columns selected for normalisation.
cols_to_norm = [
    # basic connection features
    "duration", "src_bytes", "dst_bytes", "wrong_fragment", "urgent",
    # content features
    "hot", "num_failed_logins", "num_compromised", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    # traffic features
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # destination-host features
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
]
时间: 2023-06-26 10:03:15 浏览: 199
这段代码定义了一个列表 `cols_to_norm`,其中包含了需要标准化的特征列名。这些特征列包括了网络安全数据集中的各种特征,如持续时间(duration)、源字节数(src_bytes)、目的字节数(dst_bytes)、错误分段(wrong_fragment)、紧急情况(urgent)等等。这些列需要进行标准化处理,以便进行后续的机器学习模型训练。
相关问题
risk_factor_df= pd.read_csv("kag_risk_factors_cervical_cancer(1).csv") diagnoses_num_partner_compare_cols = ['Dx:Cancer', 'Dx:HPV', "Number_of_sexual_partners",] corr_matrix = risk_factor_df[diagnoses_num_partner_compare_cols].corr() print(corr_matrix) diagnoses_num_partner_heatmap = px.imshow(corr_matrix, aspect="auto", color_continuous_scale="gnbu", text_auto=True) diagnoses_num_partner_heatmap.show()用pyecharts绘图
以下是使用 Pyecharts 绘制相关系数热力图的示例代码:
```
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import HeatMap

# Load the data set.
risk_factor_df = pd.read_csv("kag_risk_factors_cervical_cancer(1).csv")

# Columns whose pairwise correlations are drawn in the heatmap.
diagnoses_num_partner_compare_cols = ['Dx:Cancer', 'Dx:HPV', "Number_of_sexual_partners"]

# Pairwise correlation matrix of the selected columns.
corr_matrix = risk_factor_df[diagnoses_num_partner_compare_cols].corr()

# Flatten the matrix into [row_index, col_index, value] triples for HeatMap.
# ``to_numpy().tolist()`` yields native Python floats, which is the data
# format pyecharts asks for.
corr_list = [
    [i, j, value]
    for i, row in enumerate(corr_matrix.to_numpy().tolist())
    for j, value in enumerate(row)
]

# Build the heatmap: both axes carry the column names.
heatmap = (
    HeatMap()
    .add_xaxis(diagnoses_num_partner_compare_cols)
    .add_yaxis("", diagnoses_num_partner_compare_cols, corr_list)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Correlation Heatmap"),
        visualmap_opts=opts.VisualMapOpts(
            min_=-1, max_=1, is_piecewise=True, pos_top="10%", pos_right="5%"
        ),
        tooltip_opts=opts.TooltipOpts(formatter='{c}'),
    )
)

# Display the chart inline (intended for a Jupyter notebook).
heatmap.render_notebook()
```
其中,我们使用了 Pandas 计算相关系数矩阵,然后将矩阵转换为二维列表,传递给 Pyecharts 的 HeatMap 组件绘制热力图。注意,我们设置了热力图的颜色范围、提示框格式等参数,可以根据需要自行修改。
优化代码 def module_split(self, save_on=True): """ split module data :param save_on: :return: """ for ms in range(self.mod_num): m_sn = self.module_list[ms] module_path = os.path.join(self.result_path_down, m_sn) cols_obj = ChuNengPackMustCols(ms, self.mod_cell_num, self.mod_cell_num) # 传入当前的module序号(如0,1,2,3,4),电芯电压个数,温度NTC个数。 aim_cols = [i for i in cols_obj.total_cols if i in self.df.columns] print(m_sn, aim_cols) self.modules[m_sn] = rename_cols_normal(self.df.loc[:, aim_cols], ms, self.mod_cell_num) print("after change cols name:", ms, m_sn, self.modules[m_sn].columns.tolist()) self.modules[m_sn].dropna(axis=0, how='any', subset=['soc'], inplace=True) volt_col = [f'volt{i}' for i in range(self.mod_cell_num)] temp_col = [f'temp{i}' for i in range(self.mod_cell_num)] self.modules[m_sn].dropna(axis=0, how='any', subset=volt_col, inplace=True) self.modules[m_sn] = stat(self.modules[m_sn], volt_col, temp_col) self.modules[m_sn].reset_index(drop=True, inplace=True) print(self.modules[m_sn]['discharge_ah'].iloc[-1]) self.module_cap[m_sn] = [self.modules[m_sn]['discharge_ah'].iloc[-1], self.modules[m_sn]['charge_ah'].iloc[-1], self.modules[m_sn]['soh'].iloc[-1]] self.module_peaks[m_sn] = list(quick_report(self.modules[m_sn], module_path, f'quick_report_{m_sn[:8]}')) # check soc status mod_soc = self.modules[m_sn]['soc'] self.module_soc_sig[m_sn] = [np.nanmedian(mod_soc), np.max(mod_soc), np.min(mod_soc)] if save_on: single_variables_plot(mod_soc, module_path, f'{m_sn[:8]}_soc_distribution_box.png', 'box', 'SOC') single_variables_plot(mod_soc, module_path, f'{m_sn[:8]}_soc_distribution_violin.png', 'violin', 'SOC')
以下是代码优化后的版本:
```
def module_split(self, save_on=True):
    """
    Split the pack-level data in ``self.df`` into one data frame per module.

    For each index below ``self.mod_num`` the matching entry of
    ``self.module_list`` is used as the key. The module's columns are
    selected from ``self.df``, renamed, filtered for missing values and
    passed through ``stat``. Results are stored under that key in
    ``self.modules``, ``self.module_cap``, ``self.module_peaks`` and
    ``self.module_soc_sig``.

    :param save_on: when True, also call ``single_variables_plot`` to
        produce a box plot and a violin plot of the module's SOC values.
    :return: None
    """
    for ms in range(self.mod_num):
        m_sn = self.module_list[ms]
        module_path = os.path.join(self.result_path_down, m_sn)

        # Arguments per the original author's note: current module index,
        # number of cell voltages, number of temperature (NTC) channels.
        cols_obj = ChuNengPackMustCols(ms, self.mod_cell_num, self.mod_cell_num)
        aim_cols = [i for i in cols_obj.total_cols if i in self.df.columns]
        print(m_sn, aim_cols)

        module_df = rename_cols_normal(self.df.loc[:, aim_cols], ms, self.mod_cell_num)
        print("after change cols name:", ms, m_sn, module_df.columns.tolist())

        volt_col = [f'volt{i}' for i in range(self.mod_cell_num)]
        temp_col = [f'temp{i}' for i in range(self.mod_cell_num)]

        # Drop rows with a NaN in 'soc' or in any voltage column.
        # Temperature columns are deliberately not part of the filter.
        # Reassigning (rather than inplace=True) avoids mutating a frame
        # that may be derived from self.df.
        module_df = module_df.dropna(axis=0, how='any', subset=['soc'] + volt_col)

        # Compute statistics on the remaining rows and renumber them.
        module_df = stat(module_df, volt_col, temp_col).reset_index(drop=True)
        self.modules[m_sn] = module_df

        # NOTE: .iloc[-1] raises IndexError when no rows survive the NaN
        # filter above; that behaviour is unchanged here.
        last_discharge_ah = module_df['discharge_ah'].iloc[-1]
        print(last_discharge_ah)
        self.module_cap[m_sn] = [
            last_discharge_ah,
            module_df['charge_ah'].iloc[-1],
            module_df['soh'].iloc[-1],
        ]
        self.module_peaks[m_sn] = list(quick_report(module_df, module_path, f'quick_report_{m_sn[:8]}'))

        # SOC summary: median, max, min.
        mod_soc = module_df['soc']
        self.module_soc_sig[m_sn] = [
            np.nanmedian(mod_soc),
            np.max(mod_soc),
            np.min(mod_soc),
        ]

        if save_on:
            for plot_type in ('box', 'violin'):
                single_variables_plot(
                    mod_soc,
                    module_path,
                    f'{m_sn[:8]}_soc_distribution_{plot_type}.png',
                    plot_type,
                    'SOC',
                )
```
优化后的代码主要做了以下几个方面的改进:
1. 将针对 'soc' 列和电压列的两次 dropna 操作合并为一步(温度列并未参与 NaN 过滤,这一点与原代码的行为保持一致);
2. 统计数据前,只保留了 'soc' 和所有电压列均不含 NaN 的行;
3. 在保存 SOC 分布图像时,避免了重复的代码。
阅读全文