# Select 'Team' and 'Shooting Accuracy' for England, Italy and Russia only.
# FIX(review): the original had the trailing `data` (a notebook-style echo)
# on the same physical line as the assignment, which is a SyntaxError when
# run as a script — split onto its own line.
data = euro12.loc[euro12['Team'].isin(['England', 'Italy', 'Russia']),
                  ['Team', 'Shooting Accuracy']]
data
时间: 2023-10-06 07:06:50 浏览: 141
你好!根据你的代码,你想从名为'euro12'的数据框中选择'Team'和'Shooting Accuracy'列,并筛选出'Team'列中包含'England'、'Italy'和'Russia'的行。是这样吗?
如果是这样,你的代码是正确的。它使用了`loc`函数来根据条件选择指定的列和行。在这里,我们使用`isin`函数来检查'Team'列中是否包含'England'、'Italy'和'Russia',并将结果存储在名为'data'的新数据框中。
请注意,为了使代码能够正常运行,确保'euro12'是一个有效的数据框,并且包含'Team'和'Shooting Accuracy'列。
如果你有任何其他问题,请随时提问!
相关问题
data2 = xr.open_dataset(file_path)[target_var].loc['1962-01-01':'2014-12-12', :, :] data = data2.loc[data2.time.dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])].resample(time='Y').mean(dim='time').values#.mean(dim=['time']).values data1 = data.flatten() # print(data1.shape) model[i] = data1 # print(model[i].shape) data_spring = data2.loc[data2.time.dt.month.isin([3, 4, 5])].resample(time='Y').mean(dim='time').values#.mean(dim=['time']).values data1_spring = data_spring.flatten() print(model_spring[i].shape) print(data1_spring.shape) model_spring[i] = data1_spring data_summer = data2.loc[data2.time.dt.month.isin([6, 7, 8])].resample(time='Y').mean(dim='time').values#.mean(dim=['time']).values data1_summer = data_summer.flatten() model_summer[i] = data1_summer data_autumn = data2.loc[data2.time.dt.month.isin([9, 10, 11])].resample(time='Y').mean(dim='time').values#.mean(dim=['time']).values data1_autumn = data_autumn.flatten() model_autumn[i] = data1_autumn data_winter = data2.loc[data2.time.dt.month.isin([12, 1, 2])].resample(time='Y').mean(dim='time').values#.mean(dim=['time']).values data1_winter = data_winter.flatten() model_winter[i] = data1_winter i = i + 1这样读取速度太慢了,可不可以帮我优化一下
可以考虑以下优化:
1. 读取数据时可以使用延迟加载的方式,即设置 `chunks` 参数,将数据分块读取,可以减少一次性加载数据占用的内存和读取时间。
2. 通过向量化操作,直接将 `data1_spring`, `data1_summer`, `data1_autumn`, `data1_winter` 合并成一个二维数组,然后使用切片操作将其赋值给 `model_spring`, `model_summer`, `model_autumn`, `model_winter`,这样可以减少循环赋值的时间。
下面是优化后的代码:
```
# Annual and seasonal (MAM/JJA/SON/DJF) means of `target_var` over 1962-2014,
# stored as one flattened float32 row per model file.
# NOTE(review): `files`, `file_path`, `target_var` and the per-file row index
# `i` come from the surrounding script (not visible in this snippet).

# Chunk at read time so the dataset is loaded lazily (dask), instead of
# chunking *after* resample/mean as the original did — by then the data had
# already been read eagerly, defeating the purpose.
data2 = xr.open_dataset(file_path, chunks={'time': 120})[target_var].loc['1962-01-01':'2014-12-12', :, :]

def _season_mean(da, months):
    # Yearly mean over only the time steps whose calendar month is in `months`.
    return da.loc[da.time.dt.month.isin(months)].resample(time='Y').mean(dim='time')

# All twelve months is just the plain annual mean — no month filter needed.
data = data2.resample(time='Y').mean(dim='time')
data_spring = _season_mean(data2, [3, 4, 5])
data_summer = _season_mean(data2, [6, 7, 8])
data_autumn = _season_mean(data2, [9, 10, 11])
data_winter = _season_mean(data2, [12, 1, 2])

model = np.zeros((len(files), data.size), dtype='float32')
model_spring = np.zeros((len(files), data_spring.size), dtype='float32')
model_summer = np.zeros((len(files), data_summer.size), dtype='float32')
model_autumn = np.zeros((len(files), data_autumn.size), dtype='float32')
model_winter = np.zeros((len(files), data_winter.size), dtype='float32')
i = 0

# FIX(review): the original looped `for d, s, su, a, w in zip(data, ...)`,
# which iterates over the *time* dimension of the DataArrays (one year per
# step), not over files, and then used slice assignments like
# `model_spring[i:i+len(x)] = x.reshape(-1, data_spring.shape[1])` whose
# shapes cannot match the (len(files), size) row layout.  Each model file
# contributes exactly one flattened row, so fill row `i` directly.
model[i] = data.values.flatten()
model_spring[i] = data_spring.values.flatten()
model_summer[i] = data_summer.values.flatten()
model_autumn[i] = data_autumn.values.flatten()
model_winter[i] = data_winter.values.flatten()
i += 1
```
将下列代码变为伪代码def median_target(var): temp = data[data[var].notnull()] temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index() return temp data.loc[(data['Outcome'] == 0 ) & (data['Insulin'].isnull()), 'Insulin'] = 102.5 data.loc[(data['Result'] == 1 ) & (data['Insulin'].isnull()), 'Insulin'] = 169.5 data.loc[(data['Result'] == 0 ) & (data['Glucose'].isnull()), 'Glucose'] = 107 data.loc[(data['Result'] == 1 ) & (data['Glucose'].isnull()), 'Glucose'] = 1 data.loc[(data['Result'] == 0 ) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 27 data.loc[(data['Result'] == 1 ) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 32 data.loc[(data['Result'] == 0 ) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 70 data.loc[(data['Result'] == 1 ) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 74.5 data.loc[(data['Result'] == 0 ) & (data['BMI'].isnull()), 'BMI'] = 30.1 data.loc[(data['Result'] == 1 ) & (data['BMI'].isnull()), 'BMI'] = 34.3 target_col = [“Outcome”] cat_cols = data.nunique()[data.nunique() < 12].keys().tolist() cat_cols = [x for x in cat_cols ] #numerical列 num_cols = [x for x in data.columns if x 不在 cat_cols + target_col] #Binary列有 2 个值 bin_cols = data.nunique()[data.nunique() == 2].keys().tolist() #Columns 2 个以上的值 multi_cols = [i 表示 i in cat_cols if i in bin_cols] #Label编码二进制列 le = LabelEncoder() for i in bin_cols : data[i] = le.fit_transform(data[i]) #Duplicating列用于多值列 data = pd.get_dummies(data = data,columns = multi_cols ) #Scaling 数字列 std = StandardScaler() 缩放 = std.fit_transform(数据[num_cols]) 缩放 = pd。数据帧(缩放,列=num_cols) #dropping原始值合并数字列的缩放值 df_data_og = 数据.copy() 数据 = 数据.drop(列 = num_cols,轴 = 1) 数据 = 数据.合并(缩放,left_index=真,right_index=真,如何 = “左”) # 定义 X 和 Y X = 数据.drop('结果', 轴=1) y = 数据['结果'] X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1) y_train = to_categorical(y_train) y_test = to_categorical(y_test)
// Pseudocode: compute the median of column `var` grouped by Outcome on the
// non-null rows — these per-class medians supply the class-conditional
// imputation constants used below.
function median_target(var) {
temp = data[data[var].notnull()];
temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index();
return temp;
}
// Class-conditional imputation: fill each missing value with the median of
// that column for the same Outcome class (constants from median_target).
data.loc[(data['Outcome'] == 0) & (data['Insulin'].isnull()), 'Insulin'] = 102.5;
data.loc[(data['Outcome'] == 1) & (data['Insulin'].isnull()), 'Insulin'] = 169.5;
data.loc[(data['Outcome'] == 0) & (data['Glucose'].isnull()), 'Glucose'] = 107;
// NOTE(review): 1 looks like a transcription typo in the question (the
// Outcome==1 Glucose median is ~140) — preserved verbatim; confirm upstream.
data.loc[(data['Outcome'] == 1) & (data['Glucose'].isnull()), 'Glucose'] = 1;
data.loc[(data['Outcome'] == 0) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 27;
data.loc[(data['Outcome'] == 1) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 32;
data.loc[(data['Outcome'] == 0) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 70;
data.loc[(data['Outcome'] == 1) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 74.5;
data.loc[(data['Outcome'] == 0) & (data['BMI'].isnull()), 'BMI'] = 30.1;
data.loc[(data['Outcome'] == 1) & (data['BMI'].isnull()), 'BMI'] = 34.3;
target_col = ["Outcome"];
// Categorical columns: fewer than 12 distinct values.
cat_cols = data.nunique()[data.nunique() < 12].keys().tolist();
cat_cols = [x for x in cat_cols];
// Numerical columns: everything that is neither categorical nor the target.
num_cols = [x for x in data.columns if x not in cat_cols + target_col];
// Binary columns: exactly 2 distinct values.
bin_cols = data.nunique()[data.nunique() == 2].keys().tolist();
// Multi-valued columns: categorical columns with MORE than 2 values.
// FIX(review): the transcription had `if i in bin_cols`, which contradicts
// the source's own comment ("Columns 2 个以上的值") and would one-hot encode
// the already label-encoded binary columns while skipping the multi-valued
// ones — the canonical form excludes the binary columns.
multi_cols = [i for i in cat_cols if i not in bin_cols];
// Label-encode the binary columns in place.
le = LabelEncoder();
for i in bin_cols:
    data[i] = le.fit_transform(data[i]);
// One-hot encode the multi-valued categorical columns.
data = pd.get_dummies(data=data, columns=multi_cols);
// Standard-scale the numeric columns, then swap them in for the originals
// (keep an unscaled copy in df_data_og).
std = StandardScaler();
scaled = std.fit_transform(data[num_cols]);
scaled = pd.DataFrame(scaled, columns=num_cols);
df_data_og = data.copy();
data = data.drop(columns=num_cols, axis=1);
data = data.merge(scaled, left_index=True, right_index=True, how='left');
// Split features/target, then 80/20 train/test with one-hot targets.
X = data.drop('Outcome', axis=1);
y = data['Outcome'];
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=1);
y_train = to_categorical(y_train);
y_test = to_categorical(y_test);