# Fill missing values (NaN) with the per-column mean using SimpleImputer.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Keep the column labels so the imputed results can be wrapped back into
# DataFrames with the same columns afterwards.
df_train_col = df_train.columns
df_test_col = df_test.columns

# Each call to fit_transform re-fits the imputer on the data it is given,
# so the train and test sets are each filled with their own column means.
# NOTE(review): fitting on the test set is usually considered data leakage;
# imp.transform(df_test) would reuse the training means, but that requires
# both frames to have the same columns -- confirm before changing.
df_train = pd.DataFrame(imp.fit_transform(df_train), columns=df_train_col)
df_test = pd.DataFrame(imp.fit_transform(df_test), columns=df_test_col)
时间: 2023-10-20 22:53:39 浏览: 169
这段代码使用了Scikit-learn库中的SimpleImputer类来处理缺失值。首先,创建了一个SimpleImputer对象,指定了缺失值的标识为np.nan,替换策略为均值(strategy='mean')。接着,记录了数据集中的列名,以便后续恢复DataFrame格式。然后,对训练集和测试集分别进行均值填充处理;由于填充结果不再带有列名,需要用之前记录的列名把结果重新包装成DataFrame。这样,既填充了缺失值,又保留了原有的列名。需要注意:为避免数据泄漏,测试集通常应使用在训练集上拟合得到的均值来填充(即对测试集调用 imp.transform,而不是重新拟合)。
相关问题
优化这个sql SELECT count( 1 ) FROM ( SELECT B.ID, B.PURCHASE_REQUEST_ID, B.MATERIAL_ID, B.MATERIAL_CODE, B.MATERIAL_NAME, B.STANDARD, B.MODEL_ID, B.BILL_ROW_ID, B.BILL_NO, BILL_NAME, B.MODEL_CODE, B.MODEL_NAME, B.PARENT_MODEL_ID, B.PARENT_MODEL_CODE, B.PARENT_MODEL_NAME, B.UNIT_CODE, B.UNIT_NAME, B.PURCHASE_TYPE_CODE, CAST( NVL( B.APPLY_NUM, 0 ) AS NUMBER ( 24, 10 ) ) AS APPLY_NUM, CAST( NVL( B.DEAL_NUM, 0 ) AS NUMBER ( 24, 10 ) ) AS DEAL_NUM, CAST( NVL( B.RETURN_NUM, 0 ) AS NUMBER ( 24, 10 ) ) AS RETURN_NUM, B.DEAL_USER_ID, B.DEAL_USER_NAME, CAST( NVL( B.PRICE, 0 ) AS NUMBER ( 24, 10 ) ) AS PRICE, CAST( NVL( B.AMOUNT, 0 ) AS NUMBER ( 24, 10 ) ) AMOUNT, B.IMPLEMENT_CODE, B.IMPLEMENT_NAME, B.IMPLEMENT_INVEST_AMOUNT, B.PURCHASE_MANAGER_ID, B.PURCHASE_MANAGER_NAME, B.PROVIDER_ID, B.PROVIDER_NAME, B.REMARK, B.DELIVER_AREA, B.DELIVER_ADDRESS, B.RECEIVE_PEOPLE, B.RECEIVE_PEOPLE_PHONE, B.ITEM_STATUS, B.COST_CENTER, B.COST_BUDGET_CODE, B.COST_IMPLEMENT_NAME, B.FRAME_CONT_ID, B.FRAME_CONT_CODE, B.FRAME_CONT_NAME, B.DETAIL_CONFIG, B.PURCHASE_CATEGORY_CODE, B.INVOICE_TITLE_CODE, B.INVOICE_SEND_ADDRRSS, B.MATERIAL_REQUEST_ITEM_ID, B.YEAR, B.DELETE_FLAG, B.PROVINCE_CODE, B.REASON, B.PARENT_ITEM_ID, B.FRAME_CONT_ITEM_ID, B.SUB_MATERIAL_REQUEST_ID, B.SUB_MATERIAL_REQUEST_CODE, B.MATERIAL_URL, B.RECOMMEND_PROVIDER_NAMES, C.PURCHASE_REQUEST_CODE, C.PURCHASE_REQUEST_NAME, C.APPLY_TYPE_CODE, C.CREATOR_NAME, C.APPLY_TELEPHONE, C.COMPANY_NAME, C.DEPT_NAME, B.CREATE_TIME, TO_CHAR( B.CREATE_TIME, 'YYYY-MM-DD' ) CREATE_TIME_STR, C.ARRIVE_TIME, C.IS_TO_END, C.MONEY_WAY_CODE, C.OWN, C.APPLY_CATEGORY_CODE, C.manu_Type, C.BILL_ID, MMD.MATERIAL_TYPE_CODE, B.BRANCH_COMPANY_DEAL_USER_ID, B.BRANCH_COMPANY_DEAL_USER_NAME, ( SELECT ORG_NAME FROM ORGANIZATIONS WHERE DELETE_FLAG = '0' AND ORG_CODE = ( SELECT PARENT_COMPANY_NO FROM ORGANIZATIONS WHERE ID = B.MATERIAL_DEPT_ID )) AS MATERIAL_COMPANY_NAME, B.ORIGINAL, B.PROVIDER_PRODUCT_MODEL, B.PROVIDER_PRODUCT_NAME, B.PRODUCT_DESC, B.Back_Flag, CASE WHEN 
MMD.material_type_code = 'WZ' THEN '1' WHEN MMD.material_type_code = 'FW' THEN '2' ELSE '3' END apply_category_code_item, NVL( C.IS_CARDSYSTEM_REQUEST, '0' ) IS_CARDSYSTEM_REQUEST, B.APPLY_GROUP_AUTHORITES, B.SCIENTIFIC_RESEARCH_ID, B.SCIENTIFIC_RESEARCH_CODE, B.SCIENTIFIC_RESEARCH_NAME, B.PREQUALFY_CODE, nvl( C.IS_QUICK, '0' ) AS IS_QUICK, C.PURCHASE_WAY_CODE, C.PURCHASE_TYPE_CODE PURCHASE_TYPE_CODE_P, C.ORIGINAL_TYPE, C.PURCHASE_REQUEST_BILLS_TYPE, B.IS_FRAME_CONT_MONAD FROM PURCHASE_REQUEST_ITEM B LEFT JOIN PURCHASE_REQUEST C ON B.PURCHASE_REQUEST_ID = C.ID LEFT JOIN MATERIAL_DATA MMD ON MMD.ID = B.MATERIAL_ID AND MMD.DELETE_FLAG = '0' WHERE B.delete_flag = '0' AND B.Item_Status IN ( 1 ) AND NOT EXISTS ( SELECT * FROM purchase_request_item_log pril WHERE B.id = pril.purchase_request_item_id AND pril.lock_status = '1' AND pril.delete_flag = '0' ) AND ( ( c.apply_type_code NOT IN ( '20', '41', '3' ) AND nvl( B.Apply_Num, 0 ) > nvl( B.Deal_Num, 0 )) OR c.apply_type_code IN ( '20', '41', '3' ) ) AND B.Deal_User_Id =: 1 AND C.MONEY_WAY_CODE =: 2 AND C.APPLY_TYPE_CODE =: 3 AND C.PAY_OUT_TYPE_CODE =: 4 AND C.APPLY_CATEGORY_CODE =: 5 AND NVL( C.IS_CARDSYSTEM_REQUEST, '0' ) = : 6 AND NOT EXISTS ( SELECT * FROM purchase_request_item p left join material_province mp ON p.material_id = mp.material_id WHERE p.delete_flag = 0 AND mp.delete_flag = 0 AND mp.material_status = 03 AND mp.org_code = p.province_code AND p.id = B.id ) ORDER BY C.ID, B.ID ASC)
首先,可以统一表别名和关键字的大小写并整理缩进,以提高代码可读性。需要注意的是,EXISTS 与 NOT EXISTS 的语义正好相反,不能为了性能而互相替换,否则会改变查询结果;原查询中的两个 NOT EXISTS 条件都必须保留。其次,外层只做 COUNT,子查询里的 ORDER BY 不影响计数结果,可以去掉。最后,可以考虑对 WHERE 子句中的多个条件进行逻辑上的分组,以避免出现过于复杂的条件表达式。下面是优化后的 SQL:
-- Counts the PURCHASE_REQUEST_ITEM rows (joined to PURCHASE_REQUEST and
-- MATERIAL_DATA) that pass the bind-variable filters :1 .. :6, have no locked
-- row in purchase_request_item_log, and have no material_province row with
-- material_status 03 for the item's province.
SELECT COUNT(1)
FROM (
    SELECT
        b.ID,
        b.PURCHASE_REQUEST_ID,
        b.MATERIAL_ID,
        b.MATERIAL_CODE,
        b.MATERIAL_NAME,
        b.STANDARD,
        b.MODEL_ID,
        b.BILL_ROW_ID,
        b.BILL_NO,
        b.BILL_NAME,
        b.MODEL_CODE,
        b.MODEL_NAME,
        b.PARENT_MODEL_ID,
        b.PARENT_MODEL_CODE,
        b.PARENT_MODEL_NAME,
        b.UNIT_CODE,
        b.UNIT_NAME,
        b.PURCHASE_TYPE_CODE,
        CAST(NVL(b.APPLY_NUM, 0) AS NUMBER(24, 10)) AS APPLY_NUM,
        CAST(NVL(b.DEAL_NUM, 0) AS NUMBER(24, 10)) AS DEAL_NUM,
        CAST(NVL(b.RETURN_NUM, 0) AS NUMBER(24, 10)) AS RETURN_NUM,
        b.DEAL_USER_ID,
        b.DEAL_USER_NAME,
        CAST(NVL(b.PRICE, 0) AS NUMBER(24, 10)) AS PRICE,
        CAST(NVL(b.AMOUNT, 0) AS NUMBER(24, 10)) AS AMOUNT,
        b.IMPLEMENT_CODE,
        b.IMPLEMENT_NAME,
        b.IMPLEMENT_INVEST_AMOUNT,
        b.PURCHASE_MANAGER_ID,
        b.PURCHASE_MANAGER_NAME,
        b.PROVIDER_ID,
        b.PROVIDER_NAME,
        b.REMARK,
        b.DELIVER_AREA,
        b.DELIVER_ADDRESS,
        b.RECEIVE_PEOPLE,
        b.RECEIVE_PEOPLE_PHONE,
        b.ITEM_STATUS,
        b.COST_CENTER,
        b.COST_BUDGET_CODE,
        b.COST_IMPLEMENT_NAME,
        b.FRAME_CONT_ID,
        b.FRAME_CONT_CODE,
        b.FRAME_CONT_NAME,
        b.DETAIL_CONFIG,
        b.PURCHASE_CATEGORY_CODE,
        b.INVOICE_TITLE_CODE,
        b.INVOICE_SEND_ADDRRSS,
        b.MATERIAL_REQUEST_ITEM_ID,
        b.YEAR,
        b.DELETE_FLAG,
        b.PROVINCE_CODE,
        b.REASON,
        b.PARENT_ITEM_ID,
        b.FRAME_CONT_ITEM_ID,
        b.SUB_MATERIAL_REQUEST_ID,
        b.SUB_MATERIAL_REQUEST_CODE,
        b.MATERIAL_URL,
        b.RECOMMEND_PROVIDER_NAMES,
        c.PURCHASE_REQUEST_CODE,
        c.PURCHASE_REQUEST_NAME,
        c.APPLY_TYPE_CODE,
        c.CREATOR_NAME,
        c.APPLY_TELEPHONE,
        c.COMPANY_NAME,
        c.DEPT_NAME,
        b.CREATE_TIME,
        TO_CHAR(b.CREATE_TIME, 'YYYY-MM-DD') AS CREATE_TIME_STR,
        c.ARRIVE_TIME,
        c.IS_TO_END,
        c.MONEY_WAY_CODE,
        c.OWN,
        c.APPLY_CATEGORY_CODE,
        c.manu_Type,
        c.BILL_ID,
        MMD.MATERIAL_TYPE_CODE,
        b.BRANCH_COMPANY_DEAL_USER_ID,
        b.BRANCH_COMPANY_DEAL_USER_NAME,
        -- Name of the organisation whose ORG_CODE equals the
        -- PARENT_COMPANY_NO of the item's MATERIAL_DEPT_ID organisation.
        (
            SELECT ORG_NAME
            FROM ORGANIZATIONS
            WHERE DELETE_FLAG = '0'
              AND ORG_CODE = (
                    SELECT PARENT_COMPANY_NO
                    FROM ORGANIZATIONS
                    WHERE ID = b.MATERIAL_DEPT_ID
                  )
        ) AS MATERIAL_COMPANY_NAME,
        b.ORIGINAL,
        b.PROVIDER_PRODUCT_MODEL,
        b.PROVIDER_PRODUCT_NAME,
        b.PRODUCT_DESC,
        b.Back_Flag,
        CASE
            WHEN MMD.material_type_code = 'WZ' THEN '1'
            WHEN MMD.material_type_code = 'FW' THEN '2'
            ELSE '3'
        END AS apply_category_code_item,
        NVL(c.IS_CARDSYSTEM_REQUEST, '0') AS IS_CARDSYSTEM_REQUEST,
        b.APPLY_GROUP_AUTHORITES,
        b.SCIENTIFIC_RESEARCH_ID,
        b.SCIENTIFIC_RESEARCH_CODE,
        b.SCIENTIFIC_RESEARCH_NAME,
        b.PREQUALFY_CODE,
        NVL(c.IS_QUICK, '0') AS IS_QUICK,
        c.PURCHASE_WAY_CODE,
        c.PURCHASE_TYPE_CODE AS PURCHASE_TYPE_CODE_P,
        c.ORIGINAL_TYPE,
        c.PURCHASE_REQUEST_BILLS_TYPE,
        b.IS_FRAME_CONT_MONAD
    FROM PURCHASE_REQUEST_ITEM b
    -- The WHERE filters on c.* below reject NULL-extended rows, so this
    -- LEFT JOIN only keeps items that have a matching PURCHASE_REQUEST.
    LEFT JOIN PURCHASE_REQUEST c
        ON b.PURCHASE_REQUEST_ID = c.ID
    LEFT JOIN MATERIAL_DATA MMD
        ON MMD.ID = b.MATERIAL_ID
        AND MMD.DELETE_FLAG = '0'
    WHERE b.delete_flag = '0'
      AND b.Item_Status IN (1)
      AND b.Deal_User_Id = :1
      AND c.MONEY_WAY_CODE = :2
      AND c.APPLY_TYPE_CODE = :3
      AND c.PAY_OUT_TYPE_CODE = :4
      AND c.APPLY_CATEGORY_CODE = :5
      AND NVL(c.IS_CARDSYSTEM_REQUEST, '0') = :6
      -- Must be NOT EXISTS, as in the query being optimised: items that have
      -- a log row with lock_status = '1' are excluded. A plain EXISTS here
      -- would invert the filter and return only those items.
      AND NOT EXISTS (
            SELECT 1
            FROM purchase_request_item_log pril
            WHERE pril.purchase_request_item_id = b.id
              AND pril.lock_status = '1'
              AND pril.delete_flag = '0'
          )
      AND (
            c.apply_type_code IN ('20', '41', '3')
            OR (
                c.apply_type_code NOT IN ('20', '41', '3')
                AND NVL(b.Apply_Num, 0) > NVL(b.Deal_Num, 0)
            )
          )
      -- Excludes items for which a material_province row with status 03
      -- exists for the item's province.
      -- NOTE(review): the numeric literals 0 and 03 are kept as written,
      -- while other predicates compare delete_flag with the string '0' --
      -- confirm the column types.
      AND NOT EXISTS (
            SELECT 1
            FROM purchase_request_item p
            LEFT JOIN material_province mp
                ON p.material_id = mp.material_id
            WHERE p.delete_flag = 0
              AND mp.delete_flag = 0
              AND mp.material_status = 03
              AND mp.org_code = p.province_code
              AND p.id = b.id
          )
    -- No ORDER BY: the outer query only counts rows, so sorting the inline
    -- view cannot change the result.
)
优化以下这个视图,把 2023-06-09 作为变量传入:SELECT A.office_id AS distributorId, A.customer_code AS customerCode, A.erp_name AS customerName, A.total_amount AS totalAmount, A.trade_date AS tradeDate, A.customer_type AS customerType, "初次" AS is_first_deal FROM t_first_feedback A WHERE A.`status` = '已成交' AND A.trade_date IS NOT NULL AND A.trade_date >= DATE_FORMAT( DATE_SUB( '2023-06-09', INTERVAL 1 MONTH ), '%Y-%m-01' ) AND A.trade_date <= LAST_DAY( DATE_SUB( '2023-06-09', INTERVAL 1 MONTH )) UNION ALL SELECT B.office_id AS distributorId, B.customer_number AS customerCode, B.customer_name AS customerName, B.total_amount AS totalAmount, B.transaction_date AS tradeDate, "" AS customerType, "重复" AS is_first_deal FROM t_repeat_feedback B WHERE B.transaction_date IS NOT NULL AND B.transaction_date >= DATE_FORMAT( DATE_SUB( '2023-06-09', INTERVAL 1 MONTH ), '%Y-%m-01' ) AND B.transaction_date <= LAST_DAY( DATE_SUB( '2023-06-09', INTERVAL 1 MONTH ))
可以将'2023-06-09'作为参数传入SQL语句,避免硬编码,提高代码的可维护性和复用性。具体实现方式如下:
```
-- Lists first-time and repeat deals dated in the calendar month before @date.
--
-- MySQL assigns user variables with SET. `DECLARE @x TYPE = value` is
-- SQL Server syntax; in MySQL, DECLARE is only valid inside stored programs
-- and only for local (non-@) variables.
-- NOTE: a MySQL view definition cannot reference user variables, so run this
-- as a plain statement (or wrap it in a stored procedure with a date
-- parameter) rather than putting it in CREATE VIEW.
SET @date = '2023-06-09';

-- First day of the previous month (inclusive lower bound).
SET @month_start = DATE_FORMAT(DATE_SUB(@date, INTERVAL 1 MONTH), '%Y-%m-01');
-- First day of @date's own month (exclusive upper bound). For DATE columns
-- this is the same as `<= LAST_DAY(previous month)`; if the column also
-- stores a time part (type not visible here), rows on the last day after
-- midnight are kept as well.
SET @next_month_start = DATE_FORMAT(@date, '%Y-%m-01');

SELECT
    A.office_id     AS distributorId,
    A.customer_code AS customerCode,
    A.erp_name      AS customerName,
    A.total_amount  AS totalAmount,
    A.trade_date    AS tradeDate,
    A.customer_type AS customerType,
    '初次'          AS is_first_deal
FROM t_first_feedback A
WHERE A.`status` = '已成交'
  AND A.trade_date IS NOT NULL
  AND A.trade_date >= @month_start
  AND A.trade_date < @next_month_start
UNION ALL
SELECT
    B.office_id        AS distributorId,
    B.customer_number  AS customerCode,
    B.customer_name    AS customerName,
    B.total_amount     AS totalAmount,
    B.transaction_date AS tradeDate,
    ''                 AS customerType,
    '重复'             AS is_first_deal
FROM t_repeat_feedback B
WHERE B.transaction_date IS NOT NULL
  AND B.transaction_date >= @month_start
  AND B.transaction_date < @next_month_start;
```
阅读全文