How can a GPU be used to accelerate computation that would otherwise run on the CPU? Please provide an example.
Here is a matrix multiplication example written with CUDA that offloads the heavy computation from the CPU to the GPU:
```c++
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 1024
#define BLOCK_SIZE 16

// Each thread computes exactly one element of the result matrix c = a * b.
__global__ void matrixMul(float *a, float *b, float *c, int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n)
    {
        float sum = 0.0f;
        for (int i = 0; i < n; i++)
        {
            sum += a[row * n + i] * b[i * n + col];
        }
        c[row * n + col] = sum;
    }
}

int main()
{
    float *h_a, *h_b, *h_c;
    float *d_a, *d_b, *d_c;
    size_t size = N * N * sizeof(float);

    // Allocate memory on host and device
    h_a = (float *)malloc(size);
    h_b = (float *)malloc(size);
    h_c = (float *)malloc(size);
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    // Initialize matrices on host
    for (int i = 0; i < N * N; i++)
    {
        h_a[i] = 1.0f;
        h_b[i] = 1.0f;
    }

    // Copy matrices from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Launch kernel: round the grid size up so sizes that are not a
    // multiple of BLOCK_SIZE are still fully covered
    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 numBlocks((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
                   (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    matrixMul<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, N);

    // Copy result from device to host (this waits for the kernel to finish)
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Verify result: every element of the product of two all-ones matrices equals N
    int errors = 0;
    for (int i = 0; i < N * N; i++)
    {
        if (h_c[i] != N)
        {
            printf("Error: element %d (%.1f) != %d\n", i, h_c[i], N);
            errors++;
            break;
        }
    }
    if (errors == 0)
    {
        printf("Result verified: all %d elements equal %d\n", N * N, N);
    }

    // Free memory
    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
```
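To compile and run the example (assuming the source is saved as `matmul.cu`), use the CUDA compiler: `nvcc -O2 -o matmul matmul.cu && ./matmul`. An NVIDIA GPU and a working CUDA toolkit installation are required.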
In the example above, the CUDA kernel `matrixMul` performs the matrix multiplication and the host code in `main` drives it. Before launching the kernel, we describe the execution configuration with two `dim3` values (a built-in CUDA vector type): the matrix is partitioned into a grid of thread blocks, and the kernel is started with the `<<<numBlocks, threadsPerBlock>>>` launch syntax. Inside the kernel, each thread computes exactly one element of the result matrix, which is what lets the GPU's massive parallelism carry the workload. Finally, `cudaMemcpy` copies the result from device memory back to host memory, where it is verified on the CPU.
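To actually see the acceleration, it helps to time the GPU kernel against a plain CPU implementation of the same multiplication. The following is a minimal sketch of such a comparison (not part of the original answer): it assumes a hypothetical file `gpu_vs_cpu.cu`, reuses the same `matrixMul` kernel, times a naive CPU baseline with `std::chrono`, and times the kernel with CUDA events. Host-to-device transfer time is deliberately excluded, so the printed numbers compare raw compute time only.
```c++
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <cuda_runtime.h>

#define N 1024
#define BLOCK_SIZE 16

// Same kernel as in the example above; correctness is verified there.
__global__ void matrixMul(float *a, float *b, float *c, int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n)
    {
        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += a[row * n + i] * b[i * n + col];
        c[row * n + col] = sum;
    }
}

// Naive triple-loop reference on the CPU, used as the timing baseline.
static void matrixMulCPU(const float *a, const float *b, float *c, int n)
{
    for (int row = 0; row < n; row++)
        for (int col = 0; col < n; col++)
        {
            float sum = 0.0f;
            for (int i = 0; i < n; i++)
                sum += a[row * n + i] * b[i * n + col];
            c[row * n + col] = sum;
        }
}

int main()
{
    size_t size = N * N * sizeof(float);
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);
    for (int i = 0; i < N * N; i++) { h_a[i] = 1.0f; h_b[i] = 1.0f; }

    // Time the CPU baseline with std::chrono.
    auto t0 = std::chrono::high_resolution_clock::now();
    matrixMulCPU(h_a, h_b, h_c, N);
    auto t1 = std::chrono::high_resolution_clock::now();
    double cpu_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();

    // Time the GPU kernel with CUDA events (transfers excluded).
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 numBlocks((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
                   (N + BLOCK_SIZE - 1) / BLOCK_SIZE);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrixMul<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float gpu_ms = 0.0f;
    cudaEventElapsedTime(&gpu_ms, start, stop);

    printf("CPU: %.1f ms, GPU kernel: %.3f ms\n", cpu_ms, gpu_ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    free(h_a); free(h_b); free(h_c);
    return 0;
}
```
On most systems the kernel time will be far smaller than the CPU time, but a fair end-to-end comparison should also account for the `cudaMemcpy` transfers between host and device.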