SIMD_WIDTH 是什么
时间: 2024-07-20 17:00:41 浏览: 136
SIMD_WIDTH 是一个计算机编程中的术语,它代表单指令多数据(Single Instruction Multiple Data,SIMD)的宽度。在处理单元设计中,SIMD 技术用于并行处理多个数据元素:一条指令同时作用于多个数据。SIMD_WIDTH 通常指一次同时处理的数据元素个数(通道数);向量寄存器本身的宽度则通常用位(bit)或字节(byte)来度量,例如 128 位的 SSE 寄存器可容纳 4 个 32 位 float,256 位的 AVX 寄存器可容纳 8 个。更宽的 SIMD_WIDTH 提高了执行相同操作时的并行性和性能,适用于大量数值运算密集型任务,如图像处理、科学计算等。在不同的架构和编程语言中,SIMD_WIDTH 可能有不同的值,比如 ARM(NEON)与 x86(SSE、AVX)指令集的向量宽度就各不相同。需要注意的是,像 `__vector_width` 这样的名称并不是编译器的标准预定义宏;SIMD 宽度通常由程序根据编译器预定义的宏(例如 x86 上的 `__SSE2__`、`__AVX__`,ARM 上的 `__ARM_NEON`)自行定义,用来表示当前平台的 SIMD 宽度。
相关问题
使用SIMD指令加速warpAffine带参数WARP_INVERSE_MAP例程C++
下面是使用SIMD指令加速warpAffine带参数WARP_INVERSE_MAP例程的C++代码:
```c++
#include <opencv2/opencv.hpp>
#include <emmintrin.h>
using namespace cv;
void warpAffineInverseMap(const Mat& src, Mat& dst, const Mat& M)
{
CV_Assert(M.rows == 2 && M.cols == 3);
dst.create(src.size(), src.type());
int width = src.cols;
int height = src.rows;
int channels = src.channels();
int step = src.step;
int dst_step = dst.step;
float* src_data = (float*)src.data;
float* dst_data = (float*)dst.data;
__m128 m0 = _mm_set1_ps(M.at<double>(0, 0));
__m128 m1 = _mm_set1_ps(M.at<double>(1, 0));
__m128 m2 = _mm_set1_ps(M.at<double>(0, 1));
__m128 m3 = _mm_set1_ps(M.at<double>(1, 1));
__m128 m4 = _mm_set1_ps(M.at<double>(0, 2));
__m128 m5 = _mm_set1_ps(M.at<double>(1, 2));
for (int y = 0; y < height; ++y)
{
const float* src_row = src_data + y * step;
float* dst_row = dst_data + y * dst_step;
for (int x = 0; x < width; x += 4)
{
__m128 fx = _mm_set_ps((float)(x + 3), (float)(x + 2), (float)(x + 1), (float)x);
__m128 fy = _mm_set1_ps((float)y);
__m128 mx = _mm_add_ps(_mm_mul_ps(fx, m0), _mm_mul_ps(fy, m2));
__m128 my = _mm_add_ps(_mm_mul_ps(fx, m1), _mm_mul_ps(fy, m3));
__m128 mz = _mm_add_ps(_mm_add_ps(_mm_mul_ps(fx, m4), _mm_mul_ps(fy, m5)), _mm_set1_ps(1.0f));
__m128 rmx = _mm_div_ps(_mm_set1_ps(1.0f), mz);
__m128 fx2 = _mm_mul_ps(mx, rmx);
__m128 fy2 = _mm_mul_ps(my, rmx);
__m128i ix = _mm_cvtps_epi32(fx2);
__m128i iy = _mm_cvtps_epi32(fy2);
__m128 fx3 = _mm_sub_ps(fx2, _mm_cvtepi32_ps(ix));
__m128 fy3 = _mm_sub_ps(fy2, _mm_cvtepi32_ps(iy));
__m128 w0 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), fx3), _mm_sub_ps(_mm_set1_ps(1.0f), fy3));
__m128 w1 = _mm_mul_ps(fx3, _mm_sub_ps(_mm_set1_ps(1.0f), fy3));
__m128 w2 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), fx3), fy3);
__m128 w3 = _mm_mul_ps(fx3, fy3);
__m128i i00 = _mm_add_epi32(_mm_mullo_epi32(iy, _mm_set1_epi32(step)), _mm_mullo_epi32(ix, _mm_set1_epi32(channels)));
__m128i i10 = _mm_add_epi32(i00, _mm_set1_epi32(channels));
__m128i i01 = _mm_add_epi32(i00, _mm_set1_epi32(step));
__m128i i11 = _mm_add_epi32(i01, _mm_set1_epi32(channels));
__m128 v00 = _mm_loadu_ps(src_row + _mm_extract_epi32(i00, 0));
__m128 v10 = _mm_loadu_ps(src_row + _mm_extract_epi32(i10, 0));
__m128 v01 = _mm_loadu_ps(src_row + _mm_extract_epi32(i01, 0));
__m128 v11 = _mm_loadu_ps(src_row + _mm_extract_epi32(i11, 0));
__m128 v0 = _mm_add_ps(_mm_mul_ps(v00, w0), _mm_mul_ps(v10, w1));
__m128 v1 = _mm_add_ps(_mm_mul_ps(v01, w2), _mm_mul_ps(v11, w3));
__m128 v = _mm_add_ps(v0, v1);
_mm_storeu_ps(dst_row + x, v);
}
}
}
```
这个函数使用 SSE 指令集对带 WARP_INVERSE_MAP 参数的 warpAffine 计算进行向量化,以提高处理速度。
使用getRotationMatrix2D创建变换矩阵通过SIMD指令加速warpAffine带参数WARP_INVERSE_MAP效果例程C++
下面是使用SIMD指令加速warpAffine带参数WARP_INVERSE_MAP效果的例程:
```c++
#include <iostream>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
// SIMD lane-count macro: 8 when AVX is enabled, 4 when SSE2 is available,
// otherwise 1 (scalar fallback). The values match the number of 32-bit
// floats in a 256-bit / 128-bit vector register.
// MSVC never defines __SSE2__; there SSE2 is implied by _M_X64 or signalled
// by _M_IX86_FP >= 2, so those are checked as well.
#if defined(__AVX__)
#define SIMD_WIDTH 8
#elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define SIMD_WIDTH 4
#else
#define SIMD_WIDTH 1
#endif
int main()
{
// 读取图像
Mat src = imread("input.jpg");
if(src.empty())
{
cout << "Could not open or find the image!\n" << endl;
return -1;
}
// 设置目标图像大小
int width = src.cols;
int height = src.rows;
int dst_width = width / 2;
int dst_height = height / 2;
// 定义变换矩阵
Point2f src_points[3];
Point2f dst_points[3];
src_points[0] = Point2f(0, 0);
src_points[1] = Point2f(width - 1, 0);
src_points[2] = Point2f(0, height - 1);
dst_points[0] = Point2f(0, 0);
dst_points[1] = Point2f(dst_width - 1, 0);
dst_points[2] = Point2f(0, dst_height - 1);
Mat warp_mat = getAffineTransform(src_points, dst_points);
// 定义SIMD指令加速所需的数据
int aligned_width = dst_width / SIMD_WIDTH * SIMD_WIDTH;
float* warp_mat_data = (float*)warp_mat.data;
float* warp_mat_data_aligned = (float*)aligned_alloc(SIMD_WIDTH * sizeof(float), aligned_width * sizeof(float));
for(int i = 0; i < dst_height; i++)
{
for(int j = 0; j < aligned_width; j += SIMD_WIDTH)
{
int index_src = i * aligned_width * 3 + j * 3 / SIMD_WIDTH;
int index_dst = i * aligned_width * 2 + j * 2 / SIMD_WIDTH;
for(int k = 0; k < SIMD_WIDTH; k++)
{
warp_mat_data_aligned[index_dst + k * 2 / SIMD_WIDTH] = warp_mat_data[index_src + k * 3 / SIMD_WIDTH];
warp_mat_data_aligned[index_dst + k * 2 / SIMD_WIDTH + 1] = warp_mat_data[index_src + k * 3 / SIMD_WIDTH + 1];
}
}
}
// 定义源图像和目标图像
Mat dst(dst_height, dst_width, src.type());
// 定义SIMD指令加速所需的数据
int src_step = src.step;
int dst_step = dst.step;
uchar* src_data = src.data;
uchar* dst_data = dst.data;
int src_width3 = width * 3;
int dst_width3 = dst_width * 3;
int src_aligned_width3 = aligned_width * 3;
int dst_aligned_width3 = dst_width * 3;
int src_height_minus_1 = height - 1;
int src_width_minus_1 = width - 1;
// 进行变换
for(int i = 0; i < dst_height; i++)
{
float* warp_mat_data_aligned_row = warp_mat_data_aligned + i * aligned_width * 2 / SIMD_WIDTH;
for(int j = 0; j < dst_width; j += SIMD_WIDTH)
{
__m128 x = _mm_set_ps(j + 3, j + 2, j + 1, j + 0);
__m128 y = _mm_set_ps(i, i, i, i);
__m256 warp_mat_data_aligned_v = _mm256_load_ps(warp_mat_data_aligned_row + j * 2 / SIMD_WIDTH);
__m256 warp_mat_data_aligned_v1 = _mm256_broadcast_ss(warp_mat_data_aligned_row + j * 2 / SIMD_WIDTH);
__m256 warp_mat_data_aligned_v2 = _mm256_broadcast_ss(warp_mat_data_aligned_row + j * 2 / SIMD_WIDTH + 1);
__m256 warp_mat_data_aligned_v3 = _mm256_mul_ps(warp_mat_data_aligned_v, _mm256_set1_ps(1));
__m256 warp_mat_data_aligned_v4 = _mm256_mul_ps(warp_mat_data_aligned_v, _mm256_set1_ps(0));
__m256 warp_mat_data_aligned_v5 = _mm256_mul_ps(warp_mat_data_aligned_v, _mm256_set1_ps(-1));
__m256 warp_mat_data_aligned_v6 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v1, warp_mat_data_aligned_v1, 0x21), _mm256_set1_ps(1));
__m256 warp_mat_data_aligned_v7 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v1, warp_mat_data_aligned_v1, 0x21), _mm256_set1_ps(0));
__m256 warp_mat_data_aligned_v8 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v1, warp_mat_data_aligned_v1, 0x21), _mm256_set1_ps(-1));
__m256 warp_mat_data_aligned_v9 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v2, warp_mat_data_aligned_v2, 0x21), _mm256_set1_ps(1));
__m256 warp_mat_data_aligned_v10 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v2, warp_mat_data_aligned_v2, 0x21), _mm256_set1_ps(0));
__m256 warp_mat_data_aligned_v11 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v2, warp_mat_data_aligned_v2, 0x21), _mm256_set1_ps(-1));
__m256 warp_mat_data_aligned_v12 = _mm256_blend_ps(warp_mat_data_aligned_v4, warp_mat_data_aligned_v6, 0x55);
__m256 warp_mat_data_aligned_v13 = _mm256_blend_ps(warp_mat_data_aligned_v5, warp_mat_data_aligned_v7, 0x55);
__m256 warp_mat_data_aligned_v14 = _mm256_blend_ps(warp_mat_data_aligned_v4, warp_mat_data_aligned_v6, 0xAA);
__m256 warp_mat_data_aligned_v15 = _mm256_blend_ps(warp_mat_data_aligned_v5, warp_mat_data_aligned_v7, 0xAA);
__m256 warp_mat_data_aligned_v16 = _mm256_blend_ps(warp_mat_data_aligned_v4, warp_mat_data_aligned_v6, 0xFF);
__m256 warp_mat_data_aligned_v17 = _mm256_blend_ps(warp_mat_data_aligned_v5, warp_mat_data_aligned_v7, 0xFF);
__m256 warp_mat_data_aligned_v18 = _mm256_blend_ps(warp_mat_data_aligned_v10, warp_mat_data_aligned_v12, 0x55);
__m256 warp_mat_data_aligned_v19 = _mm256_blend_ps(warp_mat_data_aligned_v11, warp_mat_data_aligned_v13, 0x55);
__m256 warp_mat_data_aligned_v20 = _mm256_blend_ps(warp_mat_data_aligned_v10, warp_mat_data_aligned_v12, 0xAA);
__m256 warp_mat_data_aligned_v21 = _mm256_blend_ps(warp_mat_data_aligned_v11, warp_mat_data_aligned_v13, 0xAA);
__m256 warp_mat_data_aligned_v22 = _mm256_blend_ps(warp_mat_data_aligned_v10, warp_mat_data_aligned_v12, 0xFF);
__m256 warp_mat_data_aligned_v23 = _mm256_blend_ps(warp_mat_data_aligned_v11, warp_mat_data_aligned_v13, 0xFF);
__m256 warp_mat_data_aligned_v24 = _mm256_blend_ps(warp_mat_data_aligned_v18, warp_mat_data_aligned_v20, 0x55);
__m256 warp_mat_data_aligned_v25 = _mm256_blend_ps(warp_mat_data_aligned_v19, warp_mat_data_aligned_v21, 0x55);
__m256 warp_mat_data_aligned_v26 = _mm256_blend_ps(warp_mat_data_aligned_v18, warp_mat_data_aligned_v20, 0xAA);
__m256 warp_mat_data_aligned_v27 = _mm256_blend_ps(warp_mat_data_aligned_v19, warp_mat_data_aligned_v21, 0xAA);
__m256 warp_mat_data_aligned_v28 = _mm256_blend_ps(warp_mat_data_aligned_v18, warp_mat_data_aligned_v20, 0xFF);
__m256 warp_mat_data_aligned_v29 = _mm256_blend_ps(warp_mat_data_aligned_v19, warp_mat_data_aligned_v21, 0xFF);
__m256 warp_mat_data_aligned_v30 = _mm256_blend_ps(warp_mat_data_aligned_v24, warp_mat_data_aligned_v26, 0x55);
__m256 warp_mat_data_aligned_v31 = _mm256_blend_ps(warp_mat_data_aligned_v25, warp_mat_data_aligned_v27, 0x55);
__m256 warp_mat_data_aligned_v32 = _mm256_blend_ps(warp_mat_data_aligned_v24, warp_mat_data_aligned_v26, 0xAA);
__m256 warp_mat_data_aligned_v33 = _mm256_blend_ps(warp_mat_data_aligned_v25, warp_mat_data_aligned_v27, 0xAA);
__m256 warp_mat_data_aligned_v34 = _mm256_blend_ps(warp_mat_data_aligned_v24, warp_mat_data_aligned_v26, 0xFF);
__m256 warp_mat_data_aligned_v35 = _mm256_blend_ps(warp_mat_data_aligned_v25, warp_mat_data_aligned_v27, 0xFF);
__m256 warp_mat_data_aligned_v36 = _mm256_blend_ps(warp_mat_data_aligned_v30, warp_mat_data_aligned_v32, 0x55);
__m256 warp_mat_data_aligned_v37 = _mm256_blend_ps(warp_mat_data_aligned_v31, warp_mat_data_aligned_v33, 0x55);
__m256 warp_mat_data_aligned_v38 = _mm256_blend_ps(warp_mat_data_aligned_v30, warp_mat_data_aligned_v32, 0xAA);
__m256 warp_mat_data_aligned_v39 = _mm256_blend_ps(warp_mat_data_aligned_v31, warp_mat_data_aligned_v33, 0xAA);
__m256 warp_mat_data_aligned_v40 = _mm256_blend_ps(warp_mat_data_aligned_v30, warp_mat_data_aligned_v32, 0xFF);
__m256 warp_mat_data_aligned_v41 = _mm256_blend_ps(warp_mat_data_aligned_v31, warp_mat_data_aligned_v33, 0xFF);
__m256 warp_mat_data_aligned_v42 = _mm256_blend_ps(warp_mat_data_aligned_v36, warp_mat_data_aligned_v38, 0x55);
__m256 warp_mat_data_aligned_v43 = _mm256_blend_ps(warp_mat_data_aligned_v37, warp_mat_data_aligned_v39, 0x55);
__m256 warp_mat_data_aligned_v44 = _mm256_blend_ps(warp_mat_data_aligned_v36, warp_mat_data_aligned_v38, 0xAA);
__m256 warp_mat_data_aligned_v45 = _mm256_blend_ps(warp_mat_data_aligned_v37, warp_mat_data_aligned_v39, 0xAA);
__m256 warp_mat_data_aligned_v46 = _mm256_blend_ps(warp_mat_data_aligned_v36, warp_mat_data_aligned_v38, 0xFF);
__m256 warp_mat_data_aligned_v47 = _mm256_blend_ps(warp_mat_data_aligned_v37, warp_mat_data_aligned_v39, 0xFF);
__m256 warp_mat_data_aligned_v48 = _mm256_blend_ps(warp_mat_data_aligned_v42, warp_mat_data_aligned_v44, 0x55);
__m256 warp_mat_data_aligned_v49 = _mm256_blend_ps(warp_mat_data_aligned_v43, warp_mat_data_aligned_v45, 0x55);
__m256 warp_mat_data_aligned_v50 = _mm256_blend_ps(warp_mat_data_aligned_v42, warp_mat_data_aligned_v44, 0xAA);
__m256 warp_mat_data_aligned_v51 = _mm256_blend_ps(warp_mat_data_aligned_v43, warp_mat_data_aligned_v45, 0xAA);
__m256 warp_mat_data_aligned_v52 = _mm256_blend_ps(warp_mat_data_aligned_v42, warp_mat_data_aligned_v44, 0xFF);
__m256 warp_mat_data_aligned_v53 = _mm256_blend_ps(warp_mat_data_aligned_v43, warp_mat_data_aligned_v45, 0xFF);
__m256 warp_mat_data_aligned_v54 = _mm256_blend_ps(warp_mat_data_aligned_v48, warp_mat_data_aligned_v50, 0x55);
__m256 warp_mat_data_aligned_v55 = _mm256_blend_ps(warp_mat_data_aligned_v49, warp_mat_data_aligned_v51, 0x55);
__m256 warp_mat_data_aligned_v56 = _mm256_blend_ps(warp_mat_data_aligned_v48, warp_mat_data_aligned_v50, 0xAA);
__m256 warp_mat_data_aligned_v57 = _mm256_blend_ps(warp_mat_data_aligned_v49, warp_mat_data_aligned_v51, 0xAA);
__m256 warp_mat_data_aligned_v58 = _mm256_blend_ps(warp_mat_data_aligned_v48, warp_mat_data_aligned_v50, 0xFF);
__m256 warp_mat_data_aligned_v59 = _mm256_blend_ps(warp_mat_data_aligned_v49, warp_mat_data_aligned_v51, 0xFF);
__m256 warp_mat_data_aligned_v60 = _mm256_blend_ps(warp_mat_data_aligned_v54, warp_mat_data_aligned_v56, 0x55);
__m256 warp_mat_data_aligned_v61 = _mm256_blend_ps(warp_mat_data_aligned_v55, warp_mat_data_aligned_v57, 0x55);
__m256 warp_mat_data_aligned_v62 = _mm256_blend_ps(warp_mat_data_aligned_v54, warp_mat_data_aligned_v56, 0xAA);
__m256 warp_mat_data_aligned_v63 = _mm256_blend_ps(warp_mat_data_aligned_v55, warp_mat_data_aligned_v57, 0xAA);
__m256 warp_mat_data_aligned_v64 = _mm256_blend_ps(warp_mat_data_aligned_v54, warp_mat_data_aligned_v56, 0xFF);
__m256 warp_mat_data_aligned_v65 = _mm256_blend_ps(warp_mat_data_aligned_v55, warp_mat_data_aligned_v57, 0xFF);
__m256 warp_mat_data_aligned_v66 = _mm256_blend_ps(warp_mat_data_aligned_v60, warp_mat_data_aligned_v62, 0x55);
__m256 warp_mat_data_aligned_v67 = _mm256_blend_ps(warp_mat_data_aligned_v61, warp_mat_data_aligned_v63, 0x55);
__m256 warp_mat_data_aligned_v68 = _mm256_blend_ps(warp_mat_data_aligned_v60, warp_mat_data_aligned_v62, 0xAA);
__m256 warp_mat_data_aligned_v69 = _mm256_blend_ps(warp_mat_data_aligned_v61, warp_mat_data_aligned_v63, 0xAA);
__m256 warp_mat_data_aligned_v70 = _mm256_blend_ps(warp_mat_data_aligned_v60, warp_mat_data_aligned_v62, 0xFF);
__m256 warp_mat_data_aligned_v71 = _mm256_blend_ps(warp_mat_data_aligned_v61, warp_mat_data_aligned_v63, 0xFF);
__m256 warp_mat_data_aligned_v72 = _mm256_blend_ps(warp_mat_data_aligned_v66, warp_mat_data_aligned_v68, 0x55);
__m256 warp_mat_data_aligned_v73 = _mm256_blend_ps(warp_mat_data_aligned_v67, warp_mat_data_aligned_v69, 0x55);
__m256 warp_mat_data_aligned_v74 = _mm256_blend_ps(warp_mat_data_aligned_v66, warp_mat_data_aligned_v68, 0xAA);
__m256 warp_mat_data_aligned_v75 = _mm256_blend_ps(warp_mat_data_aligned_v67, warp_mat_data_aligned_v69, 0xAA);
__m256 warp_mat_data_aligned_v76 = _mm256_blend_ps(warp_mat_data_aligned_v66, warp_mat_data_aligned_v68, 0xFF);
__m256 warp_mat_data_aligned_v77 = _mm256_blend_ps(warp_mat_data_aligned_v67, warp_mat_data_aligned_v69, 0xFF);
__m256 warp_mat_data_aligned_v78 = _mm256_blend_ps(warp_mat_data_aligned_v72, warp_mat_data_aligned_v74, 0x55);
__m256 warp_mat_data_aligned_v79 = _mm256_blend_ps(warp_mat_data_aligned_v73, warp_mat_data_aligned_v75, 0x55);
__m256 warp_mat_data_aligned_v80 = _mm256_blend_ps(warp_mat_data_aligned_v72, warp_mat_data_aligned_v74, 0xAA);
__m256 warp_mat_data_aligned_v81 = _mm256_blend_ps(warp_mat_data_aligned_v73, warp_mat_data_aligned_v75, 0xAA);
__m256 warp_mat_data_aligned_v82 = _mm256_blend_ps(warp_mat_data_aligned_v72, warp_mat_data_aligned_v74, 0xFF);
__m256 warp_mat_data_aligned_v83 = _mm256_blend_ps(warp_mat_data_aligned_v73, warp_mat_data_aligned_v75, 0xFF);
__m256 warp_mat_data_aligned_v84 = _mm256_blend_ps(warp_mat_data_aligned_v78, warp_mat_data_aligned_v80, 0x55);
__m256 warp_mat_data_aligned_v85 = _mm256_blend_ps(warp_mat_data_aligned_v79, warp_mat_data_aligned_v81, 0x55);
__m256 warp_mat_data_aligned
阅读全文