StandardScaler Source Code
StandardScaler is a class in scikit-learn that standardizes a dataset: each feature is rescaled using its mean and standard deviation so that it ends up with zero mean and unit variance. Below is a simplified version of the StandardScaler source code:
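Before reading the class itself, it may help to see the computation it wraps. The following is a minimal sketch in plain NumPy (no scikit-learn) of the z-score standardization z = (x - u) / s that fit() and transform() implement:

```python
import numpy as np

X = np.array([[-1., -1., 2.],
              [-2., -1., 4.],
              [-3., -2., 1.]])

u = X.mean(axis=0)     # per-feature mean, like scaler.mean_
s = X.std(axis=0)      # per-feature std (ddof=0, population), like scaler.scale_
Z = (X - u) / s        # z = (x - u) / s, like scaler.transform(X)

print(Z.mean(axis=0))  # ~0 for every feature
print(Z.std(axis=0))   # 1 for every feature
```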
```python
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.sparsefuncs import inplace_column_scale
from sklearn.utils.validation import (FLOAT_DTYPES, _num_samples,
                                      check_array, check_is_fitted)


class StandardScaler(BaseEstimator, TransformerMixin):
"""Standardize features by removing the mean and scaling to unit variance
The standard score of a sample `x` is calculated as:
z = (x - u) / s
where `u` is the mean of the training samples and `s` is the standard
deviation of the training samples.
Parameters
----------
copy : bool, default=True
If False, try to avoid a copy and do inplace scaling instead.
This is not guaranteed to always work inplace; e.g. if the data is
not a NumPy array, it may still remain unchanged.
with_mean : bool, default=True
If True, center the data before scaling.
This does not work (and will raise an exception) when attempted on
sparse matrices.
with_std : bool, default=True
If True, scale the data to unit variance (or equivalently,
unit standard deviation).
Attributes
----------
scale_ : ndarray of shape (n_features,)
Per feature relative scaling of the data.
.. versionadded:: 0.17
*scale_*
mean_ : ndarray of shape (n_features,)
The mean value for each feature in the training set.
var_ : ndarray of shape (n_features,)
The variance for each feature in the training set. Used to compute
`scale_`
n_samples_seen_ : int or ndarray of shape (n_features,)
The number of samples processed by the estimator for each feature.
        If there are no missing samples, the n_samples_seen will be an
integer, otherwise it will be an array with dtype int. By default,
this will be initialized to zero, and incremented each time a new
sample is processed. If `sample_weight` is not None, then the
`n_samples_seen` will be weighted similarly.
.. versionadded:: 0.24
.. versionchanged:: 1.0
`n_samples_seen_` is initialized to zero by default.
Examples
--------
>>> from sklearn.preprocessing import StandardScaler
>>> import numpy as np
>>> X = np.array([[-1., -1., 2.],
... [-2., -1., 4.],
... [-3., -2., 1.]])
>>> scaler = StandardScaler()
>>> scaler.fit(X)
StandardScaler()
>>> print(scaler.mean_)
[-2. -1.33333333 2.33333333]
>>> print(scaler.scale_)
[0.81649658 0.47140452 1.24721913]
    >>> print(scaler.transform(X))
    [[ 1.22474487  0.70710678 -0.26726124]
     [ 0.          0.70710678  1.33630621]
     [-1.22474487 -1.41421356 -1.06904497]]
"""
def __init__(self, *, copy=True, with_mean=True, with_std=True):
self.with_mean = with_mean
self.with_std = with_std
self.copy = copy
def fit(self, X, y=None):
"""Compute the mean and std to be used for later scaling.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data used to compute the mean and standard deviation
used for later scaling along the features axis.
y : None
Ignored
Returns
-------
self : object
Returns self.
"""
        if self.with_mean:
            # Per-feature mean of the training data.
            self.mean_ = np.mean(X, axis=0)
        else:
            self.mean_ = None
        if self.with_std:
            # np.var uses ddof=0, i.e. the population variance, which is
            # what scikit-learn uses to compute scale_.
            self.var_ = np.var(X, axis=0)
            self.scale_ = np.sqrt(self.var_)
        else:
            self.var_ = None
            self.scale_ = None
        self.n_samples_seen_ = _num_samples(X)
        return self
def transform(self, X, y='deprecated', copy=None):
"""Perform standardization by centering and scaling
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data to transform, row by row.
If a sparse matrix is provided, it will be converted into
a sparse ``csr_matrix``. **However, CSC or COO matrices are
not supported yet**.
y : (ignored)
.. deprecated:: 0.24
`y` parameter is deprecated and will be removed in v1.1
Use sklearn.compose.TransformedTargetRegressor instead.
copy : bool, default=None
Copy the input X or not.
Returns
-------
X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
Transformed array.
"""
        check_is_fitted(self)
        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: use `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            if self.scale_ is not None:
                # Scale each column in place; centering is skipped for
                # sparse input because it would destroy sparsity.
                inplace_column_scale(X, 1 / self.scale_)
        else:
            X = check_array(X, accept_sparse='csc', copy=copy,
                            estimator=self, dtype=FLOAT_DTYPES,
                            force_all_finite='allow-nan')
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.scale_
        return X
def inverse_transform(self, X, copy=None):
"""Scale back the data to the original representation
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
The data to transform back, row by row.
copy : bool, default=None
Copy the input X or not.
Returns
-------
X_original : {ndarray, sparse matrix} of shape (n_samples, n_features)
Transformed data.
"""
        check_is_fitted(self)
        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: use `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            if self.scale_ is not None:
                # Undo the scaling column by column, in place.
                inplace_column_scale(X, self.scale_)
        else:
            X = check_array(X, accept_sparse='csc', copy=copy,
                            estimator=self, dtype=FLOAT_DTYPES,
                            force_all_finite='allow-nan')
            if self.with_std:
                X *= self.scale_
            if self.with_mean:
                X += self.mean_
        return X
def _more_tags(self):
return {'allow_nan': True}
```
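As a quick sanity check of the round trip, the snippet below fits, transforms, and then inverts. It uses the real sklearn.preprocessing.StandardScaler, but the simplified class above behaves the same way for dense input:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[-1., -1., 2.],
              [-2., -1., 4.],
              [-3., -2., 1.]])

scaler = StandardScaler()
Z = scaler.fit_transform(X)           # fit() then transform() in one call
X_back = scaler.inverse_transform(Z)  # undo the scaling: Z * scale_ + mean_

print(np.allclose(X, X_back))         # True: the round trip is lossless
```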
In the StandardScaler code above, fit() computes the per-feature mean and standard deviation, transform() standardizes the data, inverse_transform() maps standardized data back to the original scale, and _more_tags() is an internal scikit-learn hook that exposes extra estimator metadata (here, that NaN values are allowed). Note that StandardScaler standardizes every feature independently, so it should only be applied to continuous features, not categorical ones.
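When a dataset mixes continuous and categorical columns, a common pattern is to route only the continuous columns through StandardScaler and encode the categorical ones separately. Here is a minimal sketch using sklearn.compose.ColumnTransformer; the data and column indices are hypothetical:

```python
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Hypothetical data: columns 0-1 are continuous, column 2 is categorical.
X = np.array([[1.0, 200.0, 0],
              [2.0, 300.0, 1],
              [3.0, 100.0, 0]])

pre = ColumnTransformer(
    [("num", StandardScaler(), [0, 1]),  # standardize continuous columns
     ("cat", OneHotEncoder(), [2])],     # one-hot encode the categorical column
    sparse_threshold=0,                  # always return a dense array
)
print(pre.fit_transform(X))
```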