对于#include <stdio.h> #include <arm_neon.h> static void matrix_mul_asm(uint16_t **aa,uint16_t **bb,uint16_t **cc) { uint16_t *a=(uint16_t *)aa; uint16_t *b=(uint16_t )bb; uint16_t c=(uint16_t )cc; asm volatile ( "ld4 {v0.4h-v3.4h},[%0]\n" "ld4 {v4.4h,v5.4h,v6.4h,v7.4h},[%1]\n" "mul v3.4h,v3.4h,v7.4h\n" "mul v2.4h,v2.4h,v6.4h\n" "mul v1.4h,v1.4h,v5.4h\n" "mul v0.4h,v0.4h,v4.4h\n" "st4 {v0.4h,v1.4h,v2.4h,v3.4h},[%2]\n" :"+r"(a),"+r"(b),"+r"(c) : :"cc","memory","v0","v1","v2","v3","v4","v5","v6","v7" ); } int main() { uint16_t aa[4][4]={ {1,8,2,4}, //自己学号的高四位,比如学号为19374331,此处应为{1,9,3,7}, {1,0,5,5}, //自己学号的低四位,比如学号为19374331,此处应为{4,3,3,1}, {3,6,8,1}, {2,6,7,1} }; uint16_t bb[4][4]={ {1,3,5,7}, {2,4,6,8}, {2,5,7,9}, {5,2,7,1} }; uint16_t cc[4][4]={0}; int i,j; matrix_mul_asm((uint16_t **)aa,(uint16_t **)bb,(uint16_t **)cc); for(i=0;i<4;i++) for(j=0;j<4;j++) printf("NO 18241055,cc[%1u][ %1u] out is %11u \n",i,j,cc[i][j]); //19374331位置处填写自己学号 return 0; }用NEON SIMD的intrinsics方式重新一遍
时间: 2024-04-26 20:23:49 浏览: 103
#include<stdio.h>
下面是使用NEON SIMD的intrinsics方式实现的矩阵乘法代码:
```
#include <stdio.h>
#include <arm_neon.h>
static void matrix_mul_simd(uint16_t **aa, uint16_t **bb, uint16_t **cc) {
uint16x8_t a[4], b[4];
uint16x8_t c[4][4] = {0};
for (int i = 0; i < 4; i++) {
a[i] = vld1q_u16(aa[i]);
}
for (int i = 0; i < 4; i++) {
b[i] = vld1q_u16(bb[i]);
c[0][i] = vmulq_u16(a[0], vdupq_n_u16(bb[i][0]));
c[1][i] = vmulq_u16(a[1], vdupq_n_u16(bb[i][1]));
c[2][i] = vmulq_u16(a[2], vdupq_n_u16(bb[i][2]));
c[3][i] = vmulq_u16(a[3], vdupq_n_u16(bb[i][3]));
}
for (int i = 0; i < 4; i++) {
c[i][0] = vaddq_u16(c[i][0], vmulq_u16(a[i], b[0]));
c[i][1] = vaddq_u16(c[i][1], vmulq_u16(a[i], b[1]));
c[i][2] = vaddq_u16(c[i][2], vmulq_u16(a[i], b[2]));
c[i][3] = vaddq_u16(c[i][3], vmulq_u16(a[i], b[3]));
}
for (int i = 0; i < 4; i++) {
vst1q_u16(cc[i], c[i][0]);
vst1q_u16(cc[i] + 8, c[i][1]);
vst1q_u16(cc[i] + 16, c[i][2]);
vst1q_u16(cc[i] + 24, c[i][3]);
}
}
int main() {
uint16_t aa[4][4] = {
{1, 8, 2, 4}, //自己学号的高四位,比如学号为19374331,此处应为{1,9,3,7},
{1, 0, 5, 5}, //自己学号的低四位,比如学号为19374331,此处应为{4,3,3,1},
{3, 6, 8, 1},
{2, 6, 7, 1}
};
uint16_t bb[4][4] = {
{1, 3, 5, 7},
{2, 4, 6, 8},
{2, 5, 7, 9},
{5, 2, 7, 1}
};
uint16_t cc[4][4] = {0};
int i, j;
matrix_mul_simd((uint16_t **) aa, (uint16_t **) bb, (uint16_t **) cc);
for (i = 0; i < 4; i++) {
for (j = 0; j < 4; j++) {
printf("NO 18241055,cc[%1u][ %1u] out is %11u \n", i, j, cc[i][j]); //19374331位置处填写自己学号
}
}
return 0;
}
```
在这个实现中,我们使用了NEON SIMD的intrinsics,首先将输入矩阵a和b分别加载到了8个uint16x8_t向量变量a和b中,然后使用vmulq_u16函数分别计算a和b的每一行的乘积,将结果存储到c矩阵中。最后,使用vaddq_u16函数将结果相加,并使用vst1q_u16函数将结果存储到输出矩阵cc中。这种方法比使用汇编代码更加容易理解和维护,并且可以通过编译器自动向量化来进一步优化性能。
阅读全文