_mm_load1_pd
_mm_load1_pd is a function from the Intel Intrinsics library that loads one double-precision floating-point value (double) from memory and broadcasts it into both elements of a __m128d vector. Its declaration is:
```c
__m128d _mm_load1_pd (double const * mem_addr);
```
The parameter mem_addr is a pointer to the double to load. The function returns a __m128d vector whose first and second elements are both set to the value pointed to by mem_addr. This makes it useful for broadcasting a scalar into every lane of a vector, so that SIMD instructions can process multiple data elements at once.
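As a minimal sketch of this broadcast pattern (the function name scale_array and the choice of unaligned loads are illustrative assumptions, not from the intrinsic's documentation), the following scales an array by a scalar two doubles at a time:
```c
#include <emmintrin.h> // SSE2: __m128d, _mm_load1_pd, _mm_mul_pd

// Multiply n doubles in x by the scalar s (assumes n is a multiple of 2).
void scale_array(double *x, double s, int n) {
    __m128d vs = _mm_load1_pd(&s); // broadcast s into both lanes
    for (int i = 0; i < n; i += 2) {
        __m128d vx = _mm_loadu_pd(&x[i]);         // load two doubles
        _mm_storeu_pd(&x[i], _mm_mul_pd(vx, vs)); // scale and store back
    }
}
```
Note that _mm_set1_pd(s) achieves the same broadcast from a value already in a variable, without going through a memory address.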
Related questions
```cpp
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <smmintrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#define MAXN 100000000
float a[MAXN];
float b[MAXN];
float c[MAXN];
float d[MAXN];
int main()
{
    for (int i = 0; i < MAXN; ++i)
    {
        a[i] = 1.0 / (rand() + 1);
        b[i] = 1.0 / (rand() + 1);
    }
    for (int n = 0; n < 20; ++n)
    {
        for (int i = 0; i < MAXN; ++i)
        {
            d[i] += a[i] * b[i];
        }
    }
    clock_t start, end;
    start = clock();
    for (int i = 0; i < MAXN; i += 8) // assumes MAXN is a multiple of 8
    {
        __m256d va = _mm256_load_pd(&a[i]); // load 8 double-precision floats into a register
        __m256d vb = _mm256_load_pd(&b[i]);
        __m256d vc = _mm256_load_pd(&c[i]);
        __m256d result = _mm256_mul_pd(va, vb); // vector multiply
        result = _mm256_add_pd(result, vc);     // vector add
        _mm256_store_pd(&c[i], result);         // store the result back to memory
    }
    end = clock();
    printf("time=%f\n", (double)(end - start) / CLOCKS_PER_SEC);
    for (int i = 0; i < MAXN; ++i)
    {
        if (fabs(c[i] - d[i]) / d[i] > 0.0001)
        {
            printf("Check Failed at %d\n", i);
            return 0;
        }
    }
    printf("Check Passed");
}
```
The compiler reports (line 34, column 38 of C:\Users\Administrator\Desktop\add.cpp):
```
[Error] cannot convert 'float*' to 'const double*' for argument '1' to '__m256d _mm256_load_pd(const double*)'
```
This code fails to compile because the wrong data type is used when loading data for the vectorized (AVX) computation. The arrays a, b, c, and d are float, but _mm256_load_pd expects a const double* argument. The loads should instead use _mm256_load_ps, which handles eight single-precision floats, together with the matching __m256 vector type and the _ps variants of the multiply, add, and store intrinsics. Note also that aligned AVX loads and stores require 32-byte-aligned addresses, so the arrays must be aligned accordingly. Finally, the scalar reference accumulates into d over 20 passes, so the vectorized loop must also run 20 times for the check to pass.
Here is the corrected code:
```cpp
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <immintrin.h> // AVX intrinsics (also pulls in the SSE headers)
#define MAXN 100000000
alignas(32) float a[MAXN]; // 32-byte alignment for aligned AVX loads/stores
alignas(32) float b[MAXN];
alignas(32) float c[MAXN];
alignas(32) float d[MAXN];
int main()
{
    for (int i = 0; i < MAXN; ++i)
    {
        a[i] = 1.0 / (rand() + 1);
        b[i] = 1.0 / (rand() + 1);
    }
    for (int n = 0; n < 20; ++n)
    {
        for (int i = 0; i < MAXN; ++i)
        {
            d[i] += a[i] * b[i];
        }
    }
    clock_t start, end;
    start = clock();
    for (int n = 0; n < 20; ++n) // run 20 passes to match the scalar reference in d
    {
        for (int i = 0; i < MAXN; i += 8) // assumes MAXN is a multiple of 8
        {
            __m256 va = _mm256_load_ps(&a[i]); // load 8 single-precision floats into a register
            __m256 vb = _mm256_load_ps(&b[i]);
            __m256 vc = _mm256_load_ps(&c[i]);
            __m256 result = _mm256_mul_ps(va, vb); // vector multiply
            result = _mm256_add_ps(result, vc);    // vector add
            _mm256_store_ps(&c[i], result);        // store the result back to memory
        }
    }
    end = clock();
    printf("time=%f\n", (double)(end - start) / CLOCKS_PER_SEC);
    for (int i = 0; i < MAXN; ++i)
    {
        if (fabs(c[i] - d[i]) / d[i] > 0.0001)
        {
            printf("Check Failed at %d\n", i);
            return 0;
        }
    }
    printf("Check Passed\n");
    return 0;
}
```
Note that the loads, stores, and arithmetic were changed to the _ps variants and the vector type from __m256d to __m256 to match the single-precision data. The global arrays are declared with alignas(32) so that the aligned _mm256_load_ps/_mm256_store_ps calls are valid; for heap-allocated buffers, posix_memalign or _aligned_malloc can provide the same guarantee. The vectorized loop is also wrapped in the same 20-pass loop as the scalar reference so that the verification against d succeeds.
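If the data lives on the heap rather than in global arrays, here is a minimal sketch of a 32-byte-aligned allocation suitable for aligned AVX access (the function name demo_aligned_avx and the error handling are illustrative assumptions):
```cpp
#include <immintrin.h>

int demo_aligned_avx(int n) // assumes n is a multiple of 8
{
    // _mm_malloc returns memory aligned to the requested boundary.
    float *buf = (float *) _mm_malloc(n * sizeof(float), 32);
    if (!buf) return -1;
    for (int i = 0; i < n; ++i) buf[i] = 1.0f;
    for (int i = 0; i < n; i += 8) {
        __m256 v = _mm256_load_ps(&buf[i]);            // aligned load is now valid
        _mm256_store_ps(&buf[i], _mm256_add_ps(v, v)); // double every element
    }
    _mm_free(buf); // memory from _mm_malloc must be freed with _mm_free
    return 0;
}
```
When alignment cannot be guaranteed, _mm256_loadu_ps and _mm256_storeu_ps accept any address, usually at a small performance cost on modern CPUs.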
Write experimental code that uses SSE and hyper-threading to improve HPL performance
Below is an experimental HPL-style performance test that uses SSE instructions and hyper-threading (via OpenMP threads):
```c++
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h> // SSE2: __m128d, _mm_load_pd, _mm_load1_pd, ...
#include <omp.h>

// Reference triple-loop matrix multiply: C += A * B (row-major, n x n).
void dgemm(double *A, double *B, double *C, int n) {
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            double cij = C[i * n + j];
            for (int k = 0; k < n; ++k) {
                cij += A[i * n + k] * B[k * n + j];
            }
            C[i * n + j] = cij;
        }
    }
}

// SSE version with 2x4 register blocking: a 2x4 tile of C is kept in four
// __m128d registers across the whole k loop, and _mm_load1_pd broadcasts one
// element of A against two-wide slices of a row of B.
// Assumes n is a multiple of 4 and the matrices are 16-byte aligned.
void dgemm_sse(double *A, double *B, double *C, int n) {
    for (int i = 0; i < n; i += 2) {
        for (int j = 0; j < n; j += 4) {
            __m128d c00 = _mm_load_pd(C + i * n + j);
            __m128d c01 = _mm_load_pd(C + i * n + j + 2);
            __m128d c10 = _mm_load_pd(C + (i + 1) * n + j);
            __m128d c11 = _mm_load_pd(C + (i + 1) * n + j + 2);
            for (int k = 0; k < n; ++k) {
                __m128d a0 = _mm_load1_pd(A + i * n + k);       // broadcast A[i][k]
                __m128d a1 = _mm_load1_pd(A + (i + 1) * n + k); // broadcast A[i+1][k]
                __m128d b0 = _mm_load_pd(B + k * n + j);        // B[k][j .. j+1]
                __m128d b1 = _mm_load_pd(B + k * n + j + 2);    // B[k][j+2 .. j+3]
                c00 = _mm_add_pd(c00, _mm_mul_pd(a0, b0));
                c01 = _mm_add_pd(c01, _mm_mul_pd(a0, b1));
                c10 = _mm_add_pd(c10, _mm_mul_pd(a1, b0));
                c11 = _mm_add_pd(c11, _mm_mul_pd(a1, b1));
            }
            _mm_store_pd(C + i * n + j, c00);
            _mm_store_pd(C + i * n + j + 2, c01);
            _mm_store_pd(C + (i + 1) * n + j, c10);
            _mm_store_pd(C + (i + 1) * n + j + 2, c11);
        }
    }
}

// Same kernel with the outer loop distributed across OpenMP threads; with
// hyper-threading enabled, threads can also be scheduled on logical cores.
void dgemm_sse_omp(double *A, double *B, double *C, int n) {
    #pragma omp parallel for
    for (int i = 0; i < n; i += 2) {
        for (int j = 0; j < n; j += 4) {
            __m128d c00 = _mm_load_pd(C + i * n + j);
            __m128d c01 = _mm_load_pd(C + i * n + j + 2);
            __m128d c10 = _mm_load_pd(C + (i + 1) * n + j);
            __m128d c11 = _mm_load_pd(C + (i + 1) * n + j + 2);
            for (int k = 0; k < n; ++k) {
                __m128d a0 = _mm_load1_pd(A + i * n + k);
                __m128d a1 = _mm_load1_pd(A + (i + 1) * n + k);
                __m128d b0 = _mm_load_pd(B + k * n + j);
                __m128d b1 = _mm_load_pd(B + k * n + j + 2);
                c00 = _mm_add_pd(c00, _mm_mul_pd(a0, b0));
                c01 = _mm_add_pd(c01, _mm_mul_pd(a0, b1));
                c10 = _mm_add_pd(c10, _mm_mul_pd(a1, b0));
                c11 = _mm_add_pd(c11, _mm_mul_pd(a1, b1));
            }
            _mm_store_pd(C + i * n + j, c00);
            _mm_store_pd(C + i * n + j + 2, c01);
            _mm_store_pd(C + (i + 1) * n + j, c10);
            _mm_store_pd(C + (i + 1) * n + j + 2, c11);
        }
    }
}

int main(int argc, char **argv) {
    int n = (argc > 1) ? atoi(argv[1]) : 1024;
    n = (n / 4) * 4; // the SSE kernels assume n is a multiple of 4
    size_t bytes = (size_t) n * n * sizeof(double);
    // _mm_malloc aligns the allocations to 16 bytes, as aligned SSE loads require.
    double *A = (double *) _mm_malloc(bytes, 16);
    double *B = (double *) _mm_malloc(bytes, 16);
    double *C = (double *) _mm_malloc(bytes, 16); // reference result
    double *E = (double *) _mm_malloc(bytes, 16); // SSE result
    double *F = (double *) _mm_malloc(bytes, 16); // SSE+OMP result
    for (size_t i = 0; i < (size_t) n * n; ++i) {
        A[i] = ((double) rand() / RAND_MAX) * 2 - 1;
        B[i] = ((double) rand() / RAND_MAX) * 2 - 1;
        C[i] = E[i] = F[i] = 0.0;
    }
    double t1, t2, mflops;
    // time the regular version
    t1 = omp_get_wtime();
    dgemm(A, B, C, n);
    t2 = omp_get_wtime();
    printf("Regular version: %.2f seconds\n", t2 - t1);
    // time the SSE version
    t1 = omp_get_wtime();
    dgemm_sse(A, B, E, n);
    t2 = omp_get_wtime();
    printf("SSE version: %.2f seconds\n", t2 - t1);
    // time the SSE+OMP version
    t1 = omp_get_wtime();
    dgemm_sse_omp(A, B, F, n);
    t2 = omp_get_wtime();
    printf("SSE+OMP version: %.2f seconds\n", t2 - t1);
    mflops = 1e-6 * 2.0 * n * n * n / (t2 - t1); // 2*n^3 flops per multiply
    printf("SSE+OMP MFLOPS: %lf\n", mflops);
    // verify both SIMD results against the reference
    for (size_t i = 0; i < (size_t) n * n; ++i) {
        if (C[i] - E[i] > 1e-6 || C[i] - E[i] < -1e-6 ||
            C[i] - F[i] > 1e-6 || C[i] - F[i] < -1e-6) {
            printf("Verification failed: %lf %lf %lf\n", C[i], E[i], F[i]);
            return -1;
        }
    }
    printf("Verification passed\n");
    // clean up
    _mm_free(A);
    _mm_free(B);
    _mm_free(C);
    _mm_free(E);
    _mm_free(F);
    return 0;
}
```
In this experimental code, SSE instructions accelerate the matrix multiplication, and OpenMP parallelizes the outer loop across hardware threads (with hyper-threading, across logical cores as well). The kernel is further optimized with 2x4 register blocking, which keeps a small tile of C in registers throughout the inner loop and reduces memory-access overhead.
In main, we generate random matrices A and B, zero the output matrices, and then call the three multiplication routines: the plain triple-loop version, the SSE version, and the combined SSE+OpenMP version. We time each one, verify the SIMD results against the reference, and report the achieved MFLOPS.
Note that because aligned SSE loads and stores require 16-byte-aligned data, the matrices are allocated with _mm_malloc, which returns memory aligned to the requested boundary, and released with the matching _mm_free.
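A plausible way to build and run the experiment, assuming GCC on an x86-64 machine (the file name hpl_sse.c, the matrix size, and the thread count are illustrative):
```
gcc -O2 -msse2 -fopenmp hpl_sse.c -o hpl_sse
OMP_NUM_THREADS=8 ./hpl_sse 1024
```
Comparing runs with OMP_NUM_THREADS set to the physical core count versus the logical (hyper-threaded) core count shows how much hyper-threading contributes to the SSE+OMP version.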