def cipin(data_qustop, num=10):
    """Return the words that occur more than ``num`` times in ``data_qustop``.

    Every element of ``data_qustop`` is joined with single spaces and the
    joined text is split on whitespace again, so a token that itself contains
    whitespace is counted as several words.

    :param data_qustop: iterable whose elements are iterables of strings
        (for example one list of tokens per document)
    :param num: count threshold; only words whose count is strictly greater
        than ``num`` are kept
    :return: pandas Series indexed by word with the occurrence count as
        value, in the order produced by ``value_counts`` (most frequent first)
    """
    # Joining all rows into one long string and splitting it is equivalent to
    # splitting each joined row, without building the intermediate string.
    words = [word for tokens in data_qustop for word in ' '.join(tokens).split()]
    # The explicit dtype keeps the result well-defined for empty input, where
    # pandas has nothing to infer the dtype from.
    counts = pd.Series(words, dtype=str).value_counts()
    return counts[counts > num]
时间: 2024-04-28 19:19:09 浏览: 72
这是一个函数定义,该函数的输入参数是一个可迭代对象 data_qustop(其每个元素是一组字符串,例如分词后得到的词语列表)和一个默认值为 10 的整数 num。该函数的作用是统计 data_qustop 中所有文本数据中出现次数大于 num 的单词,并将结果以 Pandas Series 对象的形式返回。
具体来说,该函数首先用空格把 data_qustop 中每个元素的词语拼接成一个字符串,得到一个字符串列表,然后再把该列表拼接成一个以空格分隔的长字符串。接着,该函数用字符串的 split() 方法将这个长字符串拆分为单词,用这些单词构造一个 Pandas Series,并调用 value_counts() 统计每个单词出现的次数。最后,该函数返回其中出现次数大于 num 的单词及其计数。
相关问题
import os
import random

import cv2
import keras
import numpy as np

from create_unet import create_model

img_path = 'data_enh/img'
mask_path = 'data_enh/mask'

# Split the image files into a training set (80%) and a test set (the rest).
img_files = np.array(os.listdir(img_path))
data_num = len(img_files)
train_num = int(data_num * 0.8)
train_ind = random.sample(range(data_num), train_num)
test_ind = list(set(range(data_num)) - set(train_ind))
# An explicit integer dtype keeps an empty index list usable: np.array([])
# defaults to float64, which NumPy rejects as an index array.
train_ind = np.array(train_ind, dtype=int)
test_ind = np.array(test_ind, dtype=int)
train_img = img_files[train_ind]  # file names used for training
test_img = img_files[test_ind]  # file names used for testing


def get_mask_name(img_name):
    """Return the mask file names that correspond to the given image names.

    :param img_name: iterable of image file names
    :return: NumPy array of names with every '.jpg' replaced by '.png'
    """
    return np.array([name.replace('.jpg', '.png') for name in img_name])


train_mask = get_mask_name(train_img)
test_mask = get_mask_name(test_img)
# Original (misspelled) name, kept so existing references keep working.
test_msak = test_mask


def generator(img, mask, batch_size):
    """Yield random (images, masks) batches forever.

    Each sample is drawn independently with ``np.random.choice``, so a batch
    may contain the same file more than once.

    :param img: array of image file names located in ``img_path``
    :param mask: array of mask file names located in ``mask_path``, aligned
        index by index with ``img``
    :param batch_size: number of samples per yielded batch
    :return: generator of ``(IMG, MASK)`` NumPy array pairs
    :raises FileNotFoundError: if an image or mask file cannot be read
    """
    num = len(img)
    while True:
        batch_img = []
        batch_mask = []
        for _ in range(batch_size):
            index = np.random.choice(num)
            img_file = os.path.join(img_path, img[index])
            mask_file = os.path.join(mask_path, mask[index])
            temp_img = cv2.imread(img_file)
            # Flag 0 loads the mask as a single-channel grayscale image.
            temp_mask = cv2.imread(mask_file, 0)
            # cv2.imread returns None instead of raising when a file is
            # missing or unreadable; fail here with a clear message.
            if temp_img is None:
                raise FileNotFoundError('cannot read image: {}'.format(img_file))
            if temp_mask is None:
                raise FileNotFoundError('cannot read mask: {}'.format(mask_file))
            # Scale the mask by 1/255 and add a trailing channel axis.
            temp_mask = np.reshape(temp_mask / 255, [256, 256, 1])
            batch_img.append(temp_img)
            batch_mask.append(temp_mask)
        yield np.array(batch_img), np.array(batch_mask)


def dice_coef(y_true, y_pred):
    """Compute the Dice coefficient between two tensors.

    Both tensors are flattened; 1 is added to numerator and denominator so
    the ratio stays defined when both inputs are all zeros.
    """
    y_true_f = keras.backend.flatten(y_true)
    y_pred_f = keras.backend.flatten(y_pred)
    intersection = keras.backend.sum(y_true_f * y_pred_f)
    area_true = keras.backend.sum(y_true_f * y_true_f)
    area_pred = keras.backend.sum(y_pred_f * y_pred_f)
    dice = (2 * intersection + 1) / (area_true + area_pred + 1)
    return dice


def dice_coef_loss(y_true, y_pred):
    """Custom loss: one minus the Dice coefficient."""
    return 1 - dice_coef(y_true, y_pred)


# Build the model.
model = create_model()

# Compile the model with the Dice loss and report the Dice coefficient.
model.compile(optimizer='Adam', loss=dice_coef_loss, metrics=[dice_coef])

# Train the model.
# NOTE(review): fit_generator is deprecated in recent Keras releases in favour
# of fit(); confirm the installed Keras version before switching.
history = model.fit_generator(
    generator(train_img, train_mask, 4),
    steps_per_epoch=100,
    epochs=10,
    validation_data=generator(test_img, test_mask, 4),
    validation_steps=4,
)

# Save the model.
model.save('unet_model.h5')

# Reload the model; the custom loss and metric must be passed explicitly.
model = keras.models.load_model(
    'unet_model.h5',
    custom_objects={'dice_coef_loss': dice_coef_loss, 'dice_coef': dice_coef},
)

# Draw one batch of test data.
test_generator = generator(test_img, test_mask, 32)
img, mask = next(test_generator)

# Evaluate the model on that batch.
# Output noted by the original author (presumably from an earlier run):
# [0.11458712816238403, 0.885412871837616] 94%
model.evaluate(img, mask)
上面这段代码的开头部分导入了一些库:os、random、numpy、cv2、keras,以及自定义模块 create_unet 中的 create_model 函数。它还定义了两个字符串变量:img_path 和 mask_path,分别存储了图像数据和掩码数据的路径。
优化代码 def module_split(self, save_on=True): """ split module data :param save_on: :return: """ for ms in range(self.mod_num): m_sn = self.module_list[ms] module_path = os.path.join(self.result_path_down, m_sn) cols_obj = ChuNengPackMustCols(ms, self.mod_cell_num, self.mod_cell_num) # 传入当前的module序号(如0,1,2,3,4),电芯电压个数,温度NTC个数。 aim_cols = [i for i in cols_obj.total_cols if i in self.df.columns] print(m_sn, aim_cols) self.modules[m_sn] = rename_cols_normal(self.df.loc[:, aim_cols], ms, self.mod_cell_num) print("after change cols name:", ms, m_sn, self.modules[m_sn].columns.tolist()) self.modules[m_sn].dropna(axis=0, how='any', subset=['soc'], inplace=True) volt_col = [f'volt{i}' for i in range(self.mod_cell_num)] temp_col = [f'temp{i}' for i in range(self.mod_cell_num)] self.modules[m_sn].dropna(axis=0, how='any', subset=volt_col, inplace=True) self.modules[m_sn] = stat(self.modules[m_sn], volt_col, temp_col) self.modules[m_sn].reset_index(drop=True, inplace=True) print(self.modules[m_sn]['discharge_ah'].iloc[-1]) self.module_cap[m_sn] = [self.modules[m_sn]['discharge_ah'].iloc[-1], self.modules[m_sn]['charge_ah'].iloc[-1], self.modules[m_sn]['soh'].iloc[-1]] self.module_peaks[m_sn] = list(quick_report(self.modules[m_sn], module_path, f'quick_report_{m_sn[:8]}')) # check soc status mod_soc = self.modules[m_sn]['soc'] self.module_soc_sig[m_sn] = [np.nanmedian(mod_soc), np.max(mod_soc), np.min(mod_soc)] if save_on: single_variables_plot(mod_soc, module_path, f'{m_sn[:8]}_soc_distribution_box.png', 'box', 'SOC') single_variables_plot(mod_soc, module_path, f'{m_sn[:8]}_soc_distribution_violin.png', 'violin', 'SOC')
以下是代码优化后的版本:
```
def module_split(self, save_on=True):
    """
    Split the data in ``self.df`` into one table per module.

    For each of the first ``self.mod_num`` entries of ``self.module_list`` the
    module's columns are selected and renamed, rows with NaN in 'soc' or in
    any voltage column are dropped, and the outcome of ``stat`` is stored in
    ``self.modules``. The last 'discharge_ah', 'charge_ah' and 'soh' values
    go to ``self.module_cap``, the ``quick_report`` output to
    ``self.module_peaks`` and the median/max/min SOC to
    ``self.module_soc_sig``.

    :param save_on: when true, also draw box and violin plots of the SOC
    :return: None
    """
    for ms in range(self.mod_num):
        m_sn = self.module_list[ms]
        short_sn = m_sn[:8]
        module_path = os.path.join(self.result_path_down, m_sn)

        # Arguments: module index, then self.mod_cell_num twice (used both as
        # the cell-voltage count and as the temperature-sensor count).
        cols_obj = ChuNengPackMustCols(ms, self.mod_cell_num, self.mod_cell_num)
        aim_cols = [column for column in cols_obj.total_cols if column in self.df.columns]
        print(m_sn, aim_cols)

        module_df = rename_cols_normal(self.df.loc[:, aim_cols], ms, self.mod_cell_num)
        self.modules[m_sn] = module_df
        print("after change cols name:", ms, m_sn, module_df.columns.tolist())

        # Rows lacking 'soc' or any cell voltage are removed in one pass.
        volt_col = [f'volt{i}' for i in range(self.mod_cell_num)]
        temp_col = [f'temp{i}' for i in range(self.mod_cell_num)]
        module_df.dropna(axis=0, how='any', subset=['soc'] + volt_col, inplace=True)

        # Run the statistics step on the remaining rows.
        module_df = stat(module_df, volt_col, temp_col)
        self.modules[m_sn] = module_df
        module_df.reset_index(drop=True, inplace=True)

        print(module_df['discharge_ah'].iloc[-1])
        self.module_cap[m_sn] = [
            module_df[column].iloc[-1]
            for column in ('discharge_ah', 'charge_ah', 'soh')
        ]
        self.module_peaks[m_sn] = list(
            quick_report(module_df, module_path, f'quick_report_{short_sn}')
        )

        # Summarise the SOC signal as median, maximum and minimum.
        mod_soc = module_df['soc']
        self.module_soc_sig[m_sn] = [np.nanmedian(mod_soc), np.max(mod_soc), np.min(mod_soc)]

        if not save_on:
            continue
        for plot_type in ('box', 'violin'):
            file_name = f'{short_sn}_soc_distribution_{plot_type}.png'
            single_variables_plot(mod_soc, module_path, file_name, plot_type, 'SOC')
```
优化后的代码主要做了以下几个方面的改进:
1. 将两次 dropna 行删除操作合并为一步,一次性删除 'soc' 列和电压列中含有 NaN 值的行(温度列不参与筛选);
2. 统计数据前,只保留了 'soc' 和电压列均为非 NaN 的行;
3. 在保存 SOC 分布图像时,避免了重复的代码。
阅读全文