用 Java 代码实现 K-means 聚类算法,训练集的文件类型为 xls,所有 Java 代码都写在 k4.java 中
时间: 2024-05-04 09:16:39 浏览: 108
好的,我了解了您的要求。以下是Java代码实现K-means聚类算法,可以读取xls格式的训练集文件,并且所有代码都写在k4.java中:
```java
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.WorkbookFactory;
/**
 * K-means clustering (Lloyd's algorithm) over a numeric data set read from the
 * first sheet of an Excel workbook.
 *
 * <p>The program reads {@code data.xls} from the working directory, treats every
 * row as one sample and every column as one feature, clusters the samples into
 * a fixed number of groups and prints the resulting centroids and assignments.
 */
public class k4 {

    /**
     * Entry point: loads the data set, runs K-means and prints the result.
     *
     * @param args ignored
     * @throws IOException if the data file cannot be read or contains
     *         missing / non-numeric cells
     */
    public static void main(String[] args) throws IOException {
        int k = 3; // number of clusters
        int maxIter = 100; // maximum number of iterations
        double[][] dataSet = readExcel("data.xls"); // load the training set
        double[][] centroids = randCent(dataSet, k); // random initial centroids
        int[] clusterAssment = cluster(dataSet, centroids, maxIter);
        printResults(centroids, clusterAssment);
    }

    /**
     * Runs the assignment / update iterations, updating {@code centroids} in place.
     *
     * @param dataSet   samples, one row per sample
     * @param centroids initial centroids; overwritten with the final centroids
     * @param maxIter   upper bound on the number of iterations
     * @return for each sample, the index of the cluster it ended up in
     */
    private static int[] cluster(double[][] dataSet, double[][] centroids, int maxIter) {
        int m = dataSet.length;
        int k = centroids.length;
        int[] clusterAssment = new int[m];
        // Start from "unassigned" (-1). A zero-initialised array is indistinguishable
        // from "already in cluster 0" and can make the first pass look converged.
        for (int j = 0; j < m; j++) {
            clusterAssment[j] = -1;
        }

        for (int iter = 0; iter < maxIter; iter++) {
            boolean clusterChanged = false;

            // Assignment step: attach every sample to its nearest centroid.
            for (int j = 0; j < m; j++) {
                int nearest = nearestCentroid(dataSet[j], centroids);
                if (clusterAssment[j] != nearest) {
                    clusterChanged = true;
                    clusterAssment[j] = nearest;
                }
            }

            // Update step: move every centroid to the mean of its members.
            for (int l = 0; l < k; l++) {
                List<double[]> pointsInCluster = new ArrayList<double[]>();
                for (int j = 0; j < m; j++) {
                    if (clusterAssment[j] == l) {
                        pointsInCluster.add(dataSet[j]);
                    }
                }
                // An empty cluster keeps its previous centroid.
                if (!pointsInCluster.isEmpty()) {
                    centroids[l] = mean(pointsInCluster.toArray(new double[0][]));
                }
            }

            // Converged: no sample switched cluster during this pass.
            if (!clusterChanged) {
                break;
            }
        }
        return clusterAssment;
    }

    /**
     * Returns the index of the centroid closest to {@code point}; the lowest
     * index wins on ties. Returns -1 if no distance compares below
     * {@code Double.MAX_VALUE} (for example when distances are NaN).
     */
    private static int nearestCentroid(double[] point, double[][] centroids) {
        double minDist = Double.MAX_VALUE;
        int minIndex = -1;
        for (int l = 0; l < centroids.length; l++) {
            double dist = euclDistance(point, centroids[l]);
            if (dist < minDist) {
                minDist = dist;
                minIndex = l;
            }
        }
        return minIndex;
    }

    /** Prints the centroids and the 1-based cluster number of every sample. */
    private static void printResults(double[][] centroids, int[] clusterAssment) {
        System.out.println("Cluster centroids:\n");
        for (int i = 0; i < centroids.length; i++) {
            System.out.print("(");
            int n = centroids[i].length;
            for (int j = 0; j < n; j++) {
                System.out.print(centroids[i][j]);
                if (j < n - 1) {
                    System.out.print(", ");
                }
            }
            System.out.println(")");
        }
        System.out.println("\nCluster assignments:");
        for (int i = 0; i < clusterAssment.length; i++) {
            System.out.printf("Sample %d -> Cluster %d%n", i + 1, clusterAssment[i] + 1);
        }
    }

    /**
     * Computes the Euclidean distance between two samples.
     *
     * @param vec1 first sample
     * @param vec2 second sample; must have at least {@code vec1.length} entries
     * @return the square root of the sum of squared coordinate differences
     */
    private static double euclDistance(double[] vec1, double[] vec2) {
        double sum = 0.0;
        for (int i = 0; i < vec1.length; i++) {
            double diff = vec1[i] - vec2[i];
            sum += diff * diff;
        }
        return Math.sqrt(sum);
    }

    /**
     * Picks {@code k} initial centroids by sampling {@code k} distinct rows of the
     * data set (partial Fisher-Yates shuffle of the row indices).
     *
     * <p>Sampling without replacement prevents two centroids from starting on the
     * same row, which would leave one cluster permanently empty. Rows holding
     * identical values can still produce coinciding centroids.
     *
     * @param dataSet samples, one row per sample
     * @param k       number of centroids to pick
     * @return a {@code k x n} array holding copies of the selected rows
     * @throws IllegalArgumentException if {@code k} is not in {@code 1..dataSet.length}
     */
    private static double[][] randCent(double[][] dataSet, int k) {
        int m = dataSet.length;
        if (k <= 0 || k > m) {
            throw new IllegalArgumentException(
                    "k must be between 1 and the number of samples (" + m + "), got " + k);
        }
        int[] indices = new int[m];
        for (int i = 0; i < m; i++) {
            indices[i] = i;
        }
        double[][] centroids = new double[k][];
        Random rand = new Random();
        for (int i = 0; i < k; i++) {
            // Swap a random not-yet-chosen index into position i.
            int pick = i + rand.nextInt(m - i);
            int chosen = indices[pick];
            indices[pick] = indices[i];
            indices[i] = chosen;
            // Copy the row so later centroid updates never alias the data set.
            centroids[i] = dataSet[chosen].clone();
        }
        return centroids;
    }

    /**
     * Computes the per-feature mean of a group of samples.
     *
     * @param points non-empty group of samples of equal length
     * @return an array whose i-th entry is the average of the i-th feature
     * @throws IllegalArgumentException if {@code points} is empty
     */
    private static double[] mean(double[][] points) {
        if (points.length == 0) {
            throw new IllegalArgumentException("Cannot compute the mean of zero points");
        }
        int n = points[0].length;
        double[] mean = new double[n];
        for (int i = 0; i < n; i++) {
            double sum = 0.0;
            for (double[] point : points) {
                sum += point[i];
            }
            mean[i] = sum / points.length;
        }
        return mean;
    }

    /**
     * Reads the first sheet of an Excel workbook into a dense numeric matrix.
     *
     * <p>The column range is taken from the first row; every row between the
     * sheet's first and last row must provide a numeric cell in each of those
     * columns.
     *
     * @param filePath path of the workbook to read
     * @return one row per sheet row, one column per cell
     * @throws IOException if the file cannot be read, the sheet has no data, or a
     *         row / cell is missing or not numeric
     */
    private static double[][] readExcel(String filePath) throws IOException {
        // try-with-resources: the original never closed the stream or the workbook.
        try (FileInputStream file = new FileInputStream(filePath);
                Workbook workbook = WorkbookFactory.create(file)) {
            Sheet sheet = workbook.getSheetAt(0);
            int firstRowNum = sheet.getFirstRowNum();
            int lastRowNum = sheet.getLastRowNum();
            Row firstRow = (firstRowNum < 0 || lastRowNum < firstRowNum)
                    ? null
                    : sheet.getRow(firstRowNum);
            if (firstRow == null) {
                throw new IOException("No data rows found in " + filePath);
            }
            int firstCellNum = firstRow.getFirstCellNum();
            int lastCellNum = firstRow.getLastCellNum();
            if (firstCellNum < 0 || lastCellNum <= firstCellNum) {
                throw new IOException("First row of " + filePath + " contains no cells");
            }

            double[][] dataSet =
                    new double[lastRowNum - firstRowNum + 1][lastCellNum - firstCellNum];
            for (int i = firstRowNum; i <= lastRowNum; i++) {
                Row row = sheet.getRow(i);
                if (row == null) {
                    // getRow yields null for rows that were never written.
                    throw new IOException("Missing row " + (i + 1) + " in data file");
                }
                for (int j = firstCellNum; j < lastCellNum; j++) {
                    Cell cell = row.getCell(j);
                    // NOTE(review): Cell.CELL_TYPE_NUMERIC is the int-constant API of
                    // older POI releases; newer releases expose a CellType enum instead.
                    // Confirm against the POI version on the classpath.
                    if (cell != null && cell.getCellType() == Cell.CELL_TYPE_NUMERIC) {
                        dataSet[i - firstRowNum][j - firstCellNum] = cell.getNumericCellValue();
                    } else {
                        throw new IOException("Invalid cell type in data file at row "
                                + (i + 1) + ", column " + (j + 1));
                    }
                }
            }
            return dataSet;
        }
    }
}
```
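注意:以上代码使用 Apache POI 库读取 xls 文件,需要先下载 POI 并将其加入项目的类路径。程序假定 data.xls 第一个工作表中的所有单元格均为数值(不含表头行),遇到非数值单元格会抛出 IOException。另外,此代码实现的是经典的 Lloyd 算法:初始质心随机选取,因此每次运行的结果可能不同;它在大数据集和高维数据上的效果可能不佳,可考虑使用优化后的 K-means 算法(例如 K-means++ 初始化)或其他聚类算法。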
阅读全文