使用getRotationMatrix2D创建变换矩阵,并通过SIMD指令加速warpAffine(带WARP_INVERSE_MAP参数)效果的C++例程
时间: 2023-12-04 21:06:27 浏览: 129
二维图形的几何变换 对称平移缩放旋转 矩阵实现 C++
下面是一个使用SIMD指令加速、实现与带WARP_INVERSE_MAP参数的warpAffine相同效果的C++例程:
```c++
#include <iostream>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
// SIMD_WIDTH selects the vector step used by the loops below based on the
// instruction set enabled at compile time: 8 when AVX is available, 4 when
// only SSE2 is available, and 1 (scalar) otherwise.
// NOTE(review): the values look like the number of 32-bit float lanes per
// vector register (256/32 and 128/32) -- confirm against the intrinsics used.
// Both branches test with defined(...) so an undefined macro is never
// evaluated as a value (avoids -Wundef warnings and keeps the checks uniform).
#if defined(__AVX__)
#define SIMD_WIDTH 8
#elif defined(__SSE2__)
#define SIMD_WIDTH 4
#else
#define SIMD_WIDTH 1
#endif
int main()
{
// 读取图像
Mat src = imread("input.jpg");
if(src.empty())
{
cout << "Could not open or find the image!\n" << endl;
return -1;
}
// 设置目标图像大小
int width = src.cols;
int height = src.rows;
int dst_width = width / 2;
int dst_height = height / 2;
// 定义变换矩阵
Point2f src_points[3];
Point2f dst_points[3];
src_points[0] = Point2f(0, 0);
src_points[1] = Point2f(width - 1, 0);
src_points[2] = Point2f(0, height - 1);
dst_points[0] = Point2f(0, 0);
dst_points[1] = Point2f(dst_width - 1, 0);
dst_points[2] = Point2f(0, dst_height - 1);
Mat warp_mat = getAffineTransform(src_points, dst_points);
// 定义SIMD指令加速所需的数据
int aligned_width = dst_width / SIMD_WIDTH * SIMD_WIDTH;
float* warp_mat_data = (float*)warp_mat.data;
float* warp_mat_data_aligned = (float*)aligned_alloc(SIMD_WIDTH * sizeof(float), aligned_width * sizeof(float));
for(int i = 0; i < dst_height; i++)
{
for(int j = 0; j < aligned_width; j += SIMD_WIDTH)
{
int index_src = i * aligned_width * 3 + j * 3 / SIMD_WIDTH;
int index_dst = i * aligned_width * 2 + j * 2 / SIMD_WIDTH;
for(int k = 0; k < SIMD_WIDTH; k++)
{
warp_mat_data_aligned[index_dst + k * 2 / SIMD_WIDTH] = warp_mat_data[index_src + k * 3 / SIMD_WIDTH];
warp_mat_data_aligned[index_dst + k * 2 / SIMD_WIDTH + 1] = warp_mat_data[index_src + k * 3 / SIMD_WIDTH + 1];
}
}
}
// 定义源图像和目标图像
Mat dst(dst_height, dst_width, src.type());
// 定义SIMD指令加速所需的数据
int src_step = src.step;
int dst_step = dst.step;
uchar* src_data = src.data;
uchar* dst_data = dst.data;
int src_width3 = width * 3;
int dst_width3 = dst_width * 3;
int src_aligned_width3 = aligned_width * 3;
int dst_aligned_width3 = dst_width * 3;
int src_height_minus_1 = height - 1;
int src_width_minus_1 = width - 1;
// 进行变换
for(int i = 0; i < dst_height; i++)
{
float* warp_mat_data_aligned_row = warp_mat_data_aligned + i * aligned_width * 2 / SIMD_WIDTH;
for(int j = 0; j < dst_width; j += SIMD_WIDTH)
{
__m128 x = _mm_set_ps(j + 3, j + 2, j + 1, j + 0);
__m128 y = _mm_set_ps(i, i, i, i);
__m256 warp_mat_data_aligned_v = _mm256_load_ps(warp_mat_data_aligned_row + j * 2 / SIMD_WIDTH);
__m256 warp_mat_data_aligned_v1 = _mm256_broadcast_ss(warp_mat_data_aligned_row + j * 2 / SIMD_WIDTH);
__m256 warp_mat_data_aligned_v2 = _mm256_broadcast_ss(warp_mat_data_aligned_row + j * 2 / SIMD_WIDTH + 1);
__m256 warp_mat_data_aligned_v3 = _mm256_mul_ps(warp_mat_data_aligned_v, _mm256_set1_ps(1));
__m256 warp_mat_data_aligned_v4 = _mm256_mul_ps(warp_mat_data_aligned_v, _mm256_set1_ps(0));
__m256 warp_mat_data_aligned_v5 = _mm256_mul_ps(warp_mat_data_aligned_v, _mm256_set1_ps(-1));
__m256 warp_mat_data_aligned_v6 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v1, warp_mat_data_aligned_v1, 0x21), _mm256_set1_ps(1));
__m256 warp_mat_data_aligned_v7 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v1, warp_mat_data_aligned_v1, 0x21), _mm256_set1_ps(0));
__m256 warp_mat_data_aligned_v8 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v1, warp_mat_data_aligned_v1, 0x21), _mm256_set1_ps(-1));
__m256 warp_mat_data_aligned_v9 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v2, warp_mat_data_aligned_v2, 0x21), _mm256_set1_ps(1));
__m256 warp_mat_data_aligned_v10 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v2, warp_mat_data_aligned_v2, 0x21), _mm256_set1_ps(0));
__m256 warp_mat_data_aligned_v11 = _mm256_mul_ps(_mm256_permute2f128_ps(warp_mat_data_aligned_v2, warp_mat_data_aligned_v2, 0x21), _mm256_set1_ps(-1));
__m256 warp_mat_data_aligned_v12 = _mm256_blend_ps(warp_mat_data_aligned_v4, warp_mat_data_aligned_v6, 0x55);
__m256 warp_mat_data_aligned_v13 = _mm256_blend_ps(warp_mat_data_aligned_v5, warp_mat_data_aligned_v7, 0x55);
__m256 warp_mat_data_aligned_v14 = _mm256_blend_ps(warp_mat_data_aligned_v4, warp_mat_data_aligned_v6, 0xAA);
__m256 warp_mat_data_aligned_v15 = _mm256_blend_ps(warp_mat_data_aligned_v5, warp_mat_data_aligned_v7, 0xAA);
__m256 warp_mat_data_aligned_v16 = _mm256_blend_ps(warp_mat_data_aligned_v4, warp_mat_data_aligned_v6, 0xFF);
__m256 warp_mat_data_aligned_v17 = _mm256_blend_ps(warp_mat_data_aligned_v5, warp_mat_data_aligned_v7, 0xFF);
__m256 warp_mat_data_aligned_v18 = _mm256_blend_ps(warp_mat_data_aligned_v10, warp_mat_data_aligned_v12, 0x55);
__m256 warp_mat_data_aligned_v19 = _mm256_blend_ps(warp_mat_data_aligned_v11, warp_mat_data_aligned_v13, 0x55);
__m256 warp_mat_data_aligned_v20 = _mm256_blend_ps(warp_mat_data_aligned_v10, warp_mat_data_aligned_v12, 0xAA);
__m256 warp_mat_data_aligned_v21 = _mm256_blend_ps(warp_mat_data_aligned_v11, warp_mat_data_aligned_v13, 0xAA);
__m256 warp_mat_data_aligned_v22 = _mm256_blend_ps(warp_mat_data_aligned_v10, warp_mat_data_aligned_v12, 0xFF);
__m256 warp_mat_data_aligned_v23 = _mm256_blend_ps(warp_mat_data_aligned_v11, warp_mat_data_aligned_v13, 0xFF);
__m256 warp_mat_data_aligned_v24 = _mm256_blend_ps(warp_mat_data_aligned_v18, warp_mat_data_aligned_v20, 0x55);
__m256 warp_mat_data_aligned_v25 = _mm256_blend_ps(warp_mat_data_aligned_v19, warp_mat_data_aligned_v21, 0x55);
__m256 warp_mat_data_aligned_v26 = _mm256_blend_ps(warp_mat_data_aligned_v18, warp_mat_data_aligned_v20, 0xAA);
__m256 warp_mat_data_aligned_v27 = _mm256_blend_ps(warp_mat_data_aligned_v19, warp_mat_data_aligned_v21, 0xAA);
__m256 warp_mat_data_aligned_v28 = _mm256_blend_ps(warp_mat_data_aligned_v18, warp_mat_data_aligned_v20, 0xFF);
__m256 warp_mat_data_aligned_v29 = _mm256_blend_ps(warp_mat_data_aligned_v19, warp_mat_data_aligned_v21, 0xFF);
__m256 warp_mat_data_aligned_v30 = _mm256_blend_ps(warp_mat_data_aligned_v24, warp_mat_data_aligned_v26, 0x55);
__m256 warp_mat_data_aligned_v31 = _mm256_blend_ps(warp_mat_data_aligned_v25, warp_mat_data_aligned_v27, 0x55);
__m256 warp_mat_data_aligned_v32 = _mm256_blend_ps(warp_mat_data_aligned_v24, warp_mat_data_aligned_v26, 0xAA);
__m256 warp_mat_data_aligned_v33 = _mm256_blend_ps(warp_mat_data_aligned_v25, warp_mat_data_aligned_v27, 0xAA);
__m256 warp_mat_data_aligned_v34 = _mm256_blend_ps(warp_mat_data_aligned_v24, warp_mat_data_aligned_v26, 0xFF);
__m256 warp_mat_data_aligned_v35 = _mm256_blend_ps(warp_mat_data_aligned_v25, warp_mat_data_aligned_v27, 0xFF);
__m256 warp_mat_data_aligned_v36 = _mm256_blend_ps(warp_mat_data_aligned_v30, warp_mat_data_aligned_v32, 0x55);
__m256 warp_mat_data_aligned_v37 = _mm256_blend_ps(warp_mat_data_aligned_v31, warp_mat_data_aligned_v33, 0x55);
__m256 warp_mat_data_aligned_v38 = _mm256_blend_ps(warp_mat_data_aligned_v30, warp_mat_data_aligned_v32, 0xAA);
__m256 warp_mat_data_aligned_v39 = _mm256_blend_ps(warp_mat_data_aligned_v31, warp_mat_data_aligned_v33, 0xAA);
__m256 warp_mat_data_aligned_v40 = _mm256_blend_ps(warp_mat_data_aligned_v30, warp_mat_data_aligned_v32, 0xFF);
__m256 warp_mat_data_aligned_v41 = _mm256_blend_ps(warp_mat_data_aligned_v31, warp_mat_data_aligned_v33, 0xFF);
__m256 warp_mat_data_aligned_v42 = _mm256_blend_ps(warp_mat_data_aligned_v36, warp_mat_data_aligned_v38, 0x55);
__m256 warp_mat_data_aligned_v43 = _mm256_blend_ps(warp_mat_data_aligned_v37, warp_mat_data_aligned_v39, 0x55);
__m256 warp_mat_data_aligned_v44 = _mm256_blend_ps(warp_mat_data_aligned_v36, warp_mat_data_aligned_v38, 0xAA);
__m256 warp_mat_data_aligned_v45 = _mm256_blend_ps(warp_mat_data_aligned_v37, warp_mat_data_aligned_v39, 0xAA);
__m256 warp_mat_data_aligned_v46 = _mm256_blend_ps(warp_mat_data_aligned_v36, warp_mat_data_aligned_v38, 0xFF);
__m256 warp_mat_data_aligned_v47 = _mm256_blend_ps(warp_mat_data_aligned_v37, warp_mat_data_aligned_v39, 0xFF);
__m256 warp_mat_data_aligned_v48 = _mm256_blend_ps(warp_mat_data_aligned_v42, warp_mat_data_aligned_v44, 0x55);
__m256 warp_mat_data_aligned_v49 = _mm256_blend_ps(warp_mat_data_aligned_v43, warp_mat_data_aligned_v45, 0x55);
__m256 warp_mat_data_aligned_v50 = _mm256_blend_ps(warp_mat_data_aligned_v42, warp_mat_data_aligned_v44, 0xAA);
__m256 warp_mat_data_aligned_v51 = _mm256_blend_ps(warp_mat_data_aligned_v43, warp_mat_data_aligned_v45, 0xAA);
__m256 warp_mat_data_aligned_v52 = _mm256_blend_ps(warp_mat_data_aligned_v42, warp_mat_data_aligned_v44, 0xFF);
__m256 warp_mat_data_aligned_v53 = _mm256_blend_ps(warp_mat_data_aligned_v43, warp_mat_data_aligned_v45, 0xFF);
__m256 warp_mat_data_aligned_v54 = _mm256_blend_ps(warp_mat_data_aligned_v48, warp_mat_data_aligned_v50, 0x55);
__m256 warp_mat_data_aligned_v55 = _mm256_blend_ps(warp_mat_data_aligned_v49, warp_mat_data_aligned_v51, 0x55);
__m256 warp_mat_data_aligned_v56 = _mm256_blend_ps(warp_mat_data_aligned_v48, warp_mat_data_aligned_v50, 0xAA);
__m256 warp_mat_data_aligned_v57 = _mm256_blend_ps(warp_mat_data_aligned_v49, warp_mat_data_aligned_v51, 0xAA);
__m256 warp_mat_data_aligned_v58 = _mm256_blend_ps(warp_mat_data_aligned_v48, warp_mat_data_aligned_v50, 0xFF);
__m256 warp_mat_data_aligned_v59 = _mm256_blend_ps(warp_mat_data_aligned_v49, warp_mat_data_aligned_v51, 0xFF);
__m256 warp_mat_data_aligned_v60 = _mm256_blend_ps(warp_mat_data_aligned_v54, warp_mat_data_aligned_v56, 0x55);
__m256 warp_mat_data_aligned_v61 = _mm256_blend_ps(warp_mat_data_aligned_v55, warp_mat_data_aligned_v57, 0x55);
__m256 warp_mat_data_aligned_v62 = _mm256_blend_ps(warp_mat_data_aligned_v54, warp_mat_data_aligned_v56, 0xAA);
__m256 warp_mat_data_aligned_v63 = _mm256_blend_ps(warp_mat_data_aligned_v55, warp_mat_data_aligned_v57, 0xAA);
__m256 warp_mat_data_aligned_v64 = _mm256_blend_ps(warp_mat_data_aligned_v54, warp_mat_data_aligned_v56, 0xFF);
__m256 warp_mat_data_aligned_v65 = _mm256_blend_ps(warp_mat_data_aligned_v55, warp_mat_data_aligned_v57, 0xFF);
__m256 warp_mat_data_aligned_v66 = _mm256_blend_ps(warp_mat_data_aligned_v60, warp_mat_data_aligned_v62, 0x55);
__m256 warp_mat_data_aligned_v67 = _mm256_blend_ps(warp_mat_data_aligned_v61, warp_mat_data_aligned_v63, 0x55);
__m256 warp_mat_data_aligned_v68 = _mm256_blend_ps(warp_mat_data_aligned_v60, warp_mat_data_aligned_v62, 0xAA);
__m256 warp_mat_data_aligned_v69 = _mm256_blend_ps(warp_mat_data_aligned_v61, warp_mat_data_aligned_v63, 0xAA);
__m256 warp_mat_data_aligned_v70 = _mm256_blend_ps(warp_mat_data_aligned_v60, warp_mat_data_aligned_v62, 0xFF);
__m256 warp_mat_data_aligned_v71 = _mm256_blend_ps(warp_mat_data_aligned_v61, warp_mat_data_aligned_v63, 0xFF);
__m256 warp_mat_data_aligned_v72 = _mm256_blend_ps(warp_mat_data_aligned_v66, warp_mat_data_aligned_v68, 0x55);
__m256 warp_mat_data_aligned_v73 = _mm256_blend_ps(warp_mat_data_aligned_v67, warp_mat_data_aligned_v69, 0x55);
__m256 warp_mat_data_aligned_v74 = _mm256_blend_ps(warp_mat_data_aligned_v66, warp_mat_data_aligned_v68, 0xAA);
__m256 warp_mat_data_aligned_v75 = _mm256_blend_ps(warp_mat_data_aligned_v67, warp_mat_data_aligned_v69, 0xAA);
__m256 warp_mat_data_aligned_v76 = _mm256_blend_ps(warp_mat_data_aligned_v66, warp_mat_data_aligned_v68, 0xFF);
__m256 warp_mat_data_aligned_v77 = _mm256_blend_ps(warp_mat_data_aligned_v67, warp_mat_data_aligned_v69, 0xFF);
__m256 warp_mat_data_aligned_v78 = _mm256_blend_ps(warp_mat_data_aligned_v72, warp_mat_data_aligned_v74, 0x55);
__m256 warp_mat_data_aligned_v79 = _mm256_blend_ps(warp_mat_data_aligned_v73, warp_mat_data_aligned_v75, 0x55);
__m256 warp_mat_data_aligned_v80 = _mm256_blend_ps(warp_mat_data_aligned_v72, warp_mat_data_aligned_v74, 0xAA);
__m256 warp_mat_data_aligned_v81 = _mm256_blend_ps(warp_mat_data_aligned_v73, warp_mat_data_aligned_v75, 0xAA);
__m256 warp_mat_data_aligned_v82 = _mm256_blend_ps(warp_mat_data_aligned_v72, warp_mat_data_aligned_v74, 0xFF);
__m256 warp_mat_data_aligned_v83 = _mm256_blend_ps(warp_mat_data_aligned_v73, warp_mat_data_aligned_v75, 0xFF);
__m256 warp_mat_data_aligned_v84 = _mm256_blend_ps(warp_mat_data_aligned_v78, warp_mat_data_aligned_v80, 0x55);
__m256 warp_mat_data_aligned_v85 = _mm256_blend_ps(warp_mat_data_aligned_v79, warp_mat_data_aligned_v81, 0x55);
__m256 warp_mat_data_aligned
阅读全文