Reduce the memory footprint of the following program and make it display the correct result:

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define N 6
int main(int argc, char** argv) {
    int rank, size;
    int A[N][N], B[N][N], C[N][N], sub_A[N / N][N], sub_B[N / N][N], sub_C[N / N][N];
    int i, j, k, l, m, n;
    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (N % size != 0) {
        if (rank == 0) {
            printf("Matrix size should be multiple of number of processes\n");
        }
        MPI_Finalize();
        return 0;
    }
    if (rank == 0) {
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                A[i][j] = i * j;
                B[i][j] = i + j;
            }
        }
    }
    MPI_Scatter(A, N * N / size, MPI_INT, sub_A, N * N / size, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(B, N * N / size, MPI_INT, sub_B, N * N / size, MPI_INT, 0, MPI_COMM_WORLD);
    for (i = 0; i < N / N; i++) {
        for (j = 0; j < N; j++) {
            sub_C[i][j] = 0;
        }
    }
    for (i = 0; i < N / N; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N / N; k++) {
                sub_C[i][j] += sub_A[i][k] * sub_B[k][j];
            }
        }
    }
    MPI_Gather(sub_C, N * N / size, MPI_INT, C, N * N / size, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                for (k = 0; k < size; k++) {
                    for (l = 0; l < N / N; l++) {
                        m = i % (N / N) + l * (N / N);
                        n = j + k * N / N;
                        sub_C[l][j] = C[m][n];
                    }
                }
                for (k = 0; k < N / N; k++) {
                    for (l = 0; l < size; l++) {
                        C[i][j] += sub_C[k][j + l * N / N];
                    }
                }
            }
        }
    }
    if (rank == 0) {
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                printf("%d ", C[i][j]);
            }
            printf("\n");
        }
    }
    MPI_Finalize();
    return 0;
}
The memory footprint can be reduced by letting each process hold only the data it actually uses: rank 0 allocates the full A and C, each process receives a block of N / size rows of A with MPI_Scatter, B is broadcast once with MPI_Bcast (every process needs all of B to compute its rows of C), and the resulting row blocks are collected in rank order with MPI_Gather, so no extra reassembly step is needed and the printed result is correct. The modified program:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define N 6

int main(int argc, char** argv) {
    int rank, size;
    int i, j, k;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (N % size != 0) {
        if (rank == 0) {
            printf("Matrix size should be a multiple of the number of processes\n");
        }
        MPI_Finalize();
        return 0;
    }

    int rows = N / size;                    /* rows of A and C owned by each process */

    /* Only rank 0 holds the full A and C; every process needs the full B
       plus its own row blocks of A and C. */
    int (*A)[N] = NULL, (*C)[N] = NULL;
    int B[N][N];
    int (*sub_A)[N] = malloc(rows * sizeof *sub_A);
    int (*sub_C)[N] = malloc(rows * sizeof *sub_C);

    if (rank == 0) {
        A = malloc(N * sizeof *A);
        C = malloc(N * sizeof *C);
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                A[i][j] = i * j;
                B[i][j] = i + j;
            }
        }
    }

    /* Distribute consecutive row blocks of A; broadcast B to every process. */
    MPI_Scatter(A, rows * N, MPI_INT, sub_A, rows * N, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(B, N * N, MPI_INT, 0, MPI_COMM_WORLD);

    /* Each process multiplies its row block of A by the full B. */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < N; j++) {
            sub_C[i][j] = 0;
            for (k = 0; k < N; k++) {
                sub_C[i][j] += sub_A[i][k] * B[k][j];
            }
        }
    }

    /* The gathered row blocks arrive in rank order, so C is already assembled. */
    MPI_Gather(sub_C, rows * N, MPI_INT, C, rows * N, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                printf("%d ", C[i][j]);
            }
            printf("\n");
        }
        free(A);
        free(C);
    }

    free(sub_A);
    free(sub_C);
    MPI_Finalize();
    return 0;
}
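To double-check the parallel result, rank 0 can recompute the product serially and compare it with the gathered C. The helper below is only a sketch under the assumption that it is called on rank 0 right after MPI_Gather, with the A, B and C from the program above; the name check_result is not part of the original code.

static int check_result(int A[N][N], int B[N][N], int C[N][N]) {
    /* Recompute A*B element by element and compare with the gathered C. */
    int i, j, k;
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            int ref = 0;
            for (k = 0; k < N; k++) {
                ref += A[i][k] * B[k][j];
            }
            if (ref != C[i][j]) {
                return 0;   /* mismatch */
            }
        }
    }
    return 1;               /* all entries match */
}

With a typical MPI installation (Open MPI or MPICH), the program can be built with mpicc and started with mpirun using -np 2, -np 3 or -np 6, since N = 6 must be divisible by the number of processes.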