用c++实现pca算法
时间: 2023-08-04 22:11:01 浏览: 48
PCA算法(Principal Component Analysis)是一种常用的数据降维方法。下面是用C++实现PCA算法的示例代码:
```c++
#include <iostream>
#include <vector>
#include <cmath>
using namespace std;
// 计算平均值
vector<double> mean(vector<vector<double>>& data) {
int n = data.size();
int m = data[0].size();
vector<double> res(m, 0);
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
res[j] += data[i][j];
}
}
for (int i = 0; i < m; ++i) {
res[i] /= n;
}
return res;
}
// 计算协方差矩阵
vector<vector<double>> covariance(vector<vector<double>>& data, vector<double>& mu) {
int n = data.size();
int m = data[0].size();
vector<vector<double>> res(m, vector<double>(m, 0));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
for (int k = 0; k < m; ++k) {
res[j][k] += (data[i][j] - mu[j]) * (data[i][k] - mu[k]);
}
}
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < m; ++j) {
res[i][j] /= n - 1;
}
}
return res;
}
// 计算特征值和特征向量
void eig(vector<vector<double>>& A, vector<double>& eigval, vector<vector<double>>& eigvec) {
int n = A.size();
int m = A[0].size();
if (n != m) {
cerr << "The matrix is not square!" << endl;
exit(1);
}
vector<double> d(n);
vector<vector<double>> v(n, vector<double>(n, 0));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
v[i][j] = A[i][j];
}
}
tred2(v, d);
tql2(v, d);
eigval.resize(n);
eigvec.resize(n, vector<double>(n, 0));
for (int i = 0; i < n; ++i) {
eigval[i] = d[i];
for (int j = 0; j < n; ++j) {
eigvec[i][j] = v[j][i];
}
}
}
// 对称矩阵的特征值和特征向量
void tred2(vector<vector<double>>& v, vector<double>& d) {
int n = v.size();
for (int j = 0; j < n; ++j) {
d[j] = v[n - 1][j];
}
for (int i = n - 1; i > 0; --i) {
double scale = 0;
double h = 0;
for (int k = 0; k < i; ++k) {
scale += abs(d[k]);
}
if (scale == 0) {
continue;
}
double f = 0;
double inv_scale = 1 / scale;
for (int k = 0; k < i; ++k) {
d[k] *= inv_scale;
f += d[k] * d[k];
}
double g = sqrt(f);
if (d[i - 1] > 0) {
g = -g;
}
h = f - d[i - 1] * g;
for (int j = 0; j < i; ++j) {
v[j][i - 1] = 0;
}
if (h == 0) {
continue;
}
h = 1 / h;
for (int j = 0; j < i; ++j) {
double f = 0;
for (int k = 0; k < i; ++k) {
f += d[k] * v[k][j];
}
double tmp = f * h;
for (int k = 0; k < i; ++k) {
v[k][j] -= tmp * d[k];
}
}
for (int k = 0; k < i; ++k) {
d[k] *= scale;
}
}
for (int i = 0; i < n - 1; ++i) {
v[n - 1][i] = v[i][i];
v[i][i] = 1;
double h = d[i + 1];
if (h != 0) {
for (int k = 0; k <= i; ++k) {
d[k] = v[k][i + 1] / h;
}
for (int j = 0; j <= i; ++j) {
double f = 0;
for (int k = 0; k <= i; ++k) {
f += v[k][i + 1] * v[k][j];
}
for (int k = 0; k <= i; ++k) {
v[k][j] -= f * d[k];
}
}
}
for (int k = 0; k <= i; ++k) {
v[k][i + 1] = 0;
}
}
for (int j = 0; j < n; ++j) {
d[j] = v[n - 1][j];
v[n - 1][j] = 0;
}
v[n - 1][n - 1] = 1;
}
// 对称三对角矩阵的特征值和特征向量
void tql2(vector<vector<double>>& v, vector<double>& d) {
int n = v.size();
for (int i = 0; i < n - 1; ++i) {
v[i + 1][i] = v[i][i + 1];
}
double f = 0;
double tst1 = 0;
double eps = pow(2, -52);
for (int l = 0; l < n; ++l) {
tst1 = max(tst1, abs(d[l]) + abs(v[l][l + 1]));
int m = l;
while (m < n) {
if (abs(v[m][l + 1]) <= eps * tst1) {
break;
}
++m;
}
if (m > l) {
int iter = 0;
do {
++iter;
double g = d[l];
double p = (d[l + 1] - g) / (2 * v[l][l + 1]);
double r = sqrt(p * p + 1);
if (p < 0) {
r = -r;
}
d[l] = v[l][l + 1] / (p + r);
d[l + 1] = v[l][l + 1] * (p + r);
double dl1 = d[l + 1];
double h = g - d[l];
for (int i = l + 2; i < n; ++i) {
d[i] -= h;
}
f += h;
p = d[m];
double c = 1;
double c2 = c;
double c3 = c;
double el1 = v[l + 1][l];
double s = 0;
double s2 = 0;
for (int i = m - 1; i >= l; --i) {
c3 = c2;
c2 = c;
s2 = s;
g = c * v[i][l + 1] + s * dl1;
h = c * dl1 - s * v[i][l + 1];
dl1 = c * d[i] - s * g;
d[i] = s * d[i] + c * g;
g = c * v[i][l] + s * p;
h = c * p - s * v[i][l];
p = c * d[i] - s * g;
d[i] = s * d[i] + c * g;
s = v[i][l];
c = v[i][l + 1];
if (abs(dl1) > abs(s)) {
c = s / dl1;
r = sqrt(c * c + 1);
s = 1 / r;
c = c * s;
} else {
s = dl1 / s;
r = sqrt(s * s + 1);
c = 1 / r;
s = s * c;
}
double tmp = c2 * p + s2 * dl1;
v[i + 1][l] = s2 * p - c2 * dl1;
v[i + 1][l + 1] = c2 * c3 * v[i][l + 1] - s2 * c3 * v[i][l] + s * el1;
el1 = s * v[i][l + 1] + c * el1;
}
p = -s * s2 * c3 * c * d[l] - s * c2 * el1;
d[l] = s * s2 * c3 * el1 - c * c2 * d[l];
d[l + 1] = p;
} while (abs(d[l + 1]) > eps * tst1 && iter < 1000);
}
d[l] += f;
v[l][l] = 1;
v[l][l + 1] = 0;
}
for (int i = 0; i < n - 1; ++i) {
v[n - 1][i] = v[i][i];
v[i][i] = 1;
double h = d[i + 1];
if (h != 0) {
for (int k = 0; k <= i; ++k) {
d[k] = v[k][i + 1] / h;
}
for (int j = 0; j <= i; ++j) {
double f = 0;
for (int k = 0; k <= i; ++k) {
f += v[k][i + 1] * v[k][j];
}
for (int k = 0; k <= i; ++k) {
v[k][j] -= f * d[k];
}
}
}
for (int k = 0; k <= i; ++k) {
v[k][i + 1] = 0;
}
}
for (int j = 0; j < n; ++j) {
d[j] = v[n - 1][j];
v[n - 1][j] = 0;
}
v[n - 1][n - 1] = 1;
}
// 计算PCA
void pca(vector<vector<double>>& data, int k, vector<double>& variance, vector<vector<double>>& principal_components) {
vector<double> mu = mean(data);
vector<vector<double>> cov = covariance(data, mu);
vector<double> eigval;
vector<vector<double>> eigvec;
eig(cov, eigval, eigvec);
int m = cov.size();
if (k > m) {
cerr << "The number of principal components is too large!" << endl;
exit(1);
}
variance.resize(k, 0);
principal_components.resize(k, vector<double>(m, 0));
double sum_eigval = 0;
for (int i = 0; i < m; ++i) {
sum_eigval += eigval[i];
}
for (int i = 0; i < k; ++i) {
variance[i] = eigval[m - 1 - i] / sum_eigval;
for (int j = 0; j < m; ++j) {
principal_components[i][j] = eigvec[j][m - 1 - i];
}
}
}
int main() {
// 读入数据
int n, m;
cin >> n >> m;
vector<vector<double>> data(n, vector<double>(m));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
cin >> data[i][j];
}
}
// 计算PCA并输出结果
int k;
cin >> k;
vector<double> variance;
vector<vector<double>> principal_components;
pca(data, k, variance, principal_components);
cout << "variance:" << endl;
for (int i = 0; i < k; ++i) {
cout << variance[i] << endl;
}
cout << "principal components:" << endl;
for (int i = 0; i < k; ++i) {
for (int j = 0; j < m; ++j) {
cout << principal_components[i][j] << " ";
}
cout << endl;
}
return 0;
}
```
该代码实现了PCA算法的核心部分,包括计算平均值、协方差矩阵、特征值和特征向量,以及计算PCA。使用时,需要输入数据矩阵和需要保留的主成分个数,输出每个主成分的方差占比和对应的主成分。注意,该代码没有包含数据预处理(如中心化)等步骤,请根据具体情况酌情添加。