重写warpAffine通过SSE2指令集加速
时间: 2023-07-28 17:25:11 浏览: 189
使用SSE / SSE2进行优化
如果您想自己手动重写warpAffine函数以获得更好的SSE2加速效果,可以按照以下步骤进行:
1. 确认您的CPU支持SSE2指令集。
2. 在代码中包含以下头文件:
```c++
#include <xmmintrin.h>
#include <emmintrin.h>
```
3. 使用SSE2指令集来重写warpAffine函数。下面是一个简单的示例代码,仅供参考:
```c++
/**
 * @brief Scale-and-translate warp with bilinear interpolation for CV_64F images,
 *        using SSE2 to blend two channels at a time.
 *
 * For every destination pixel centre (x, y) the source position is computed as
 *   fx = (x + 0.5) * M(0,0) - 0.5 + M(0,2)
 *   fy = (y + 0.5) * M(1,1) - 0.5 + M(1,2)
 * i.e. M is applied directly as the destination -> source mapping, and only its
 * diagonal (scale) and last-column (translation) entries are read; M(0,1) and
 * M(1,0) are ignored.
 *
 * Destination pixels whose 2x2 source neighbourhood is not fully inside `src`
 * are set to zero.
 *
 * @param src Source image, depth CV_64F, any channel count. Must not share
 *            storage with `dst` (rows of `dst` are overwritten while `src`
 *            is still being sampled).
 * @param dst Destination image, already allocated by the caller, depth CV_64F
 *            and the same channel count as `src`. Its size defines the output size.
 * @param M   CV_64FC1 matrix with at least 2 rows and 3 columns.
 */
void warpAffineSSE2(cv::Mat& src, cv::Mat& dst, cv::Mat& M)
{
    // The pixel data is accessed as raw doubles below, so anything else would be
    // reinterpreted garbage or an out-of-bounds access.
    CV_Assert(src.depth() == CV_64F && dst.depth() == CV_64F);
    CV_Assert(src.channels() == dst.channels());
    CV_Assert(M.type() == CV_64FC1 && M.rows >= 2 && M.cols >= 3);

    const int srcWidth = src.cols;
    const int srcHeight = src.rows;
    const int dstWidth = dst.cols;
    const int dstHeight = dst.rows;
    const int cn = src.channels();

    // Plain scalars: subscripting a __m128d (e.g. `sx[0]`) is a GCC/Clang
    // extension and does not compile with MSVC.
    const double sx = M.at<double>(0, 0);
    const double sy = M.at<double>(1, 1);
    const double tx = M.at<double>(0, 2);
    const double ty = M.at<double>(1, 2);

    for (int y = 0; y < dstHeight; ++y) {
        double* dstRow = dst.ptr<double>(y);

        const double fy = (y + 0.5) * sy - 0.5 + ty;
        const int srcY = cvFloor(fy);
        const double wy1 = fy - srcY;   // weight of source row srcY + 1
        const double wy0 = 1.0 - wy1;   // weight of source row srcY

        // Rows srcY and srcY + 1 must both exist.
        const bool rowInside = (srcY >= 0) && (srcY < srcHeight - 1);
        const double* srcRow0 = rowInside ? src.ptr<double>(srcY) : nullptr;
        const double* srcRow1 = rowInside ? src.ptr<double>(srcY + 1) : nullptr;

        for (int x = 0; x < dstWidth; ++x) {
            double* dstPixel = dstRow + x * cn;

            const double fx = (x + 0.5) * sx - 0.5 + tx;
            const int srcX = cvFloor(fx);

            // Columns srcX and srcX + 1 must both exist; otherwise emit zeros.
            if (!rowInside || srcX < 0 || srcX >= srcWidth - 1) {
                for (int c = 0; c < cn; ++c) {
                    dstPixel[c] = 0.0;
                }
                continue;
            }

            const double wx1 = fx - srcX;   // weight of source column srcX + 1
            const double wx0 = 1.0 - wx1;   // weight of source column srcX

            const double w00 = wx0 * wy0;
            const double w01 = wx1 * wy0;
            const double w10 = wx0 * wy1;
            const double w11 = wx1 * wy1;

            const double* p00 = srcRow0 + srcX * cn;
            const double* p01 = p00 + cn;
            const double* p10 = srcRow1 + srcX * cn;
            const double* p11 = p10 + cn;

            // Each weight is broadcast to both lanes so that the two channels
            // held in one register are blended with identical coefficients.
            const __m128d vw00 = _mm_set1_pd(w00);
            const __m128d vw01 = _mm_set1_pd(w01);
            const __m128d vw10 = _mm_set1_pd(w10);
            const __m128d vw11 = _mm_set1_pd(w11);

            int c = 0;
            for (; c + 2 <= cn; c += 2) {
                const __m128d top = _mm_add_pd(
                    _mm_mul_pd(vw00, _mm_loadu_pd(p00 + c)),
                    _mm_mul_pd(vw01, _mm_loadu_pd(p01 + c)));
                const __m128d bottom = _mm_add_pd(
                    _mm_mul_pd(vw10, _mm_loadu_pd(p10 + c)),
                    _mm_mul_pd(vw11, _mm_loadu_pd(p11 + c)));
                _mm_storeu_pd(dstPixel + c, _mm_add_pd(top, bottom));
            }
            // Scalar tail for an odd channel count (including single-channel images).
            for (; c < cn; ++c) {
                dstPixel[c] = (w00 * p00[c] + w01 * p01[c])
                            + (w10 * p10[c] + w11 * p11[c]);
            }
        }
    }
}
```
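这个示例代码仅仅是一个简单的实现:它只使用了矩阵 M 中的缩放分量 M(0,0)、M(1,1) 和平移分量 M(0,2)、M(1,2),不处理旋转或剪切,并且把 M 直接当作"目标坐标 → 源坐标"的映射,像素数据按 double(CV_64F)读取。您可以根据自己的需求进行修改和优化。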
在使用这个函数之前,您需要对输入的src和dst进行一些必要的检查,比如检查两者是否具有相同的通道数和深度(本示例要求深度为 CV_64F)、dst 是否已经按目标尺寸分配好内存(dst 的尺寸不必与 src 相同)、M 是否为 2×3 的 CV_64F 矩阵,以及 src 和 dst 不能共用同一块内存等。
阅读全文