使用SIMD指令加速带WARP_INVERSE_MAP参数的warpAffine的C++例程
时间: 2023-09-11 11:09:22 浏览: 171
使用SIMD技术提高C++程序性能
下面是一段使用SIMD(SSE)指令加速带WARP_INVERSE_MAP参数的warpAffine的C++示例代码:
```c++
#include <opencv2/opencv.hpp>
#include <emmintrin.h>
using namespace cv;
void warpAffineInverseMap(const Mat& src, Mat& dst, const Mat& M)
{
CV_Assert(M.rows == 2 && M.cols == 3);
dst.create(src.size(), src.type());
int width = src.cols;
int height = src.rows;
int channels = src.channels();
int step = src.step;
int dst_step = dst.step;
float* src_data = (float*)src.data;
float* dst_data = (float*)dst.data;
__m128 m0 = _mm_set1_ps(M.at<double>(0, 0));
__m128 m1 = _mm_set1_ps(M.at<double>(1, 0));
__m128 m2 = _mm_set1_ps(M.at<double>(0, 1));
__m128 m3 = _mm_set1_ps(M.at<double>(1, 1));
__m128 m4 = _mm_set1_ps(M.at<double>(0, 2));
__m128 m5 = _mm_set1_ps(M.at<double>(1, 2));
for (int y = 0; y < height; ++y)
{
const float* src_row = src_data + y * step;
float* dst_row = dst_data + y * dst_step;
for (int x = 0; x < width; x += 4)
{
__m128 fx = _mm_set_ps((float)(x + 3), (float)(x + 2), (float)(x + 1), (float)x);
__m128 fy = _mm_set1_ps((float)y);
__m128 mx = _mm_add_ps(_mm_mul_ps(fx, m0), _mm_mul_ps(fy, m2));
__m128 my = _mm_add_ps(_mm_mul_ps(fx, m1), _mm_mul_ps(fy, m3));
__m128 mz = _mm_add_ps(_mm_add_ps(_mm_mul_ps(fx, m4), _mm_mul_ps(fy, m5)), _mm_set1_ps(1.0f));
__m128 rmx = _mm_div_ps(_mm_set1_ps(1.0f), mz);
__m128 fx2 = _mm_mul_ps(mx, rmx);
__m128 fy2 = _mm_mul_ps(my, rmx);
__m128i ix = _mm_cvtps_epi32(fx2);
__m128i iy = _mm_cvtps_epi32(fy2);
__m128 fx3 = _mm_sub_ps(fx2, _mm_cvtepi32_ps(ix));
__m128 fy3 = _mm_sub_ps(fy2, _mm_cvtepi32_ps(iy));
__m128 w0 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), fx3), _mm_sub_ps(_mm_set1_ps(1.0f), fy3));
__m128 w1 = _mm_mul_ps(fx3, _mm_sub_ps(_mm_set1_ps(1.0f), fy3));
__m128 w2 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), fx3), fy3);
__m128 w3 = _mm_mul_ps(fx3, fy3);
__m128i i00 = _mm_add_epi32(_mm_mullo_epi32(iy, _mm_set1_epi32(step)), _mm_mullo_epi32(ix, _mm_set1_epi32(channels)));
__m128i i10 = _mm_add_epi32(i00, _mm_set1_epi32(channels));
__m128i i01 = _mm_add_epi32(i00, _mm_set1_epi32(step));
__m128i i11 = _mm_add_epi32(i01, _mm_set1_epi32(channels));
__m128 v00 = _mm_loadu_ps(src_row + _mm_extract_epi32(i00, 0));
__m128 v10 = _mm_loadu_ps(src_row + _mm_extract_epi32(i10, 0));
__m128 v01 = _mm_loadu_ps(src_row + _mm_extract_epi32(i01, 0));
__m128 v11 = _mm_loadu_ps(src_row + _mm_extract_epi32(i11, 0));
__m128 v0 = _mm_add_ps(_mm_mul_ps(v00, w0), _mm_mul_ps(v10, w1));
__m128 v1 = _mm_add_ps(_mm_mul_ps(v01, w2), _mm_mul_ps(v11, w3));
__m128 v = _mm_add_ps(v0, v1);
_mm_storeu_ps(dst_row + x, v);
}
}
}
```
这个函数使用SSE指令集,每次循环同时计算4个像素的坐标,以加快warpAffineInverseMap的处理速度;实际的加速幅度取决于图像尺寸和硬件,建议通过基准测试加以验证。
阅读全文