使用getRotationMatrix2D创建变换矩阵,通过SSE指令集加速warpAffine算子参数WARP_INVERSE_MAP效果的C++例程
时间: 2023-12-10 16:38:18 浏览: 42
以下是使用SSE指令集加速warpAffine算子参数WARP_INVERSE_MAP效果的C++例程:
```c++
#include <emmintrin.h>

#include <cstring>
#include <iostream>

#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
void warpAffineInverseSSE(const Mat& src, Mat& dst, const Mat& M) {
CV_Assert(src.type() == CV_8UC1);
CV_Assert(M.type() == CV_64FC1 && M.rows == 2 && M.cols == 3);
const int width = src.cols;
const int height = src.rows;
__m128i zero = _mm_setzero_si128();
__m128i x0 = _mm_set_epi32(3, 2, 1, 0);
__m128i x1 = _mm_set_epi32(7, 6, 5, 4);
__m128i x2 = _mm_set_epi32(11, 10, 9, 8);
__m128i x3 = _mm_set_epi32(15, 14, 13, 12);
__m128d m0 = _mm_loadu_pd(M.ptr<double>(0));
__m128d m1 = _mm_loadu_pd(M.ptr<double>(1));
__m128d m2 = _mm_set_pd(0.0, 1.0);
__m128d xinc = _mm_set1_pd(0.25);
__m128d yinc = _mm_set1_pd(1.0);
__m128d xbase = _mm_set_pd(0.75, 0.25);
__m128d ybase = _mm_set1_pd(0.5);
for (int y = 0; y < height; y++) {
uchar* pdst = dst.ptr<uchar>(y);
__m128d xsrc = _mm_add_pd(_mm_mul_pd(m1, yinc), _mm_mul_pd(m2, xbase));
__m128d ysrc = _mm_add_pd(_mm_mul_pd(m0, yinc), _mm_mul_pd(m2, xbase));
for (int x = 0; x < width; x += 16) {
__m128i mask = _mm_cmpgt_epi8(_mm_set1_epi8(x + 15), zero);
__m128i xidx0 = _mm_shuffle_epi8(x0, mask);
__m128i xidx1 = _mm_shuffle_epi8(x1, mask);
__m128i xidx2 = _mm_shuffle_epi8(x2, mask);
__m128i xidx3 = _mm_shuffle_epi8(x3, mask);
__m128d xval0 = _mm_cvtepi32_pd(xidx0);
__m128d xval1 = _mm_cvtepi32_pd(xidx1);
__m128d xval2 = _mm_cvtepi32_pd(xidx2);
__m128d xval3 = _mm_cvtepi32_pd(xidx3);
__m128d xs0 = _mm_add_pd(_mm_mul_pd(_mm_sub_pd(xval0, xbase), xinc), xsrc);
__m128d xs1 = _mm_add_pd(_mm_mul_pd(_mm_sub_pd(xval1, xbase), xinc), xsrc);
__m128d xs2 = _mm_add_pd(_mm_mul_pd(_mm_sub_pd(xval2, xbase), xinc), xsrc);
__m128d xs3 = _mm_add_pd(_mm_mul_pd(_mm_sub_pd(xval3, xbase), xinc), xsrc);
__m128d ys = _mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_cvtepi32_pd(_mm_set1_epi32(x)), xbase), yinc), ysrc);
__m128i x0y0 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs0, yinc), ys));
__m128i x0y1 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs0, _mm_sub_pd(yinc, ybase)), _mm_add_pd(ys, ybase)));
__m128i x1y0 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs1, yinc), ys));
__m128i x1y1 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs1, _mm_sub_pd(yinc, ybase)), _mm_add_pd(ys, ybase)));
__m128i x2y0 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs2, yinc), ys));
__m128i x2y1 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs2, _mm_sub_pd(yinc, ybase)), _mm_add_pd(ys, ybase)));
__m128i x3y0 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs3, yinc), ys));
__m128i x3y1 = _mm_cvttpd_epi32(_mm_add_pd(_mm_mul_pd(xs3, _mm_sub_pd(yinc, ybase)), _mm_add_pd(ys, ybase)));
__m128i pixel0 = _mm_setr_epi8(
src.at<uchar>(y, x0y0[0]), src.at<uchar>(y, x0y0[1]), src.at<uchar>(y, x0y0[2]), src.at<uchar>(y, x0y0[3]),
src.at<uchar>(y, x0y1[0]), src.at<uchar>(y, x0y1[1]), src.at<uchar>(y, x0y1[2]), src.at<uchar>(y, x0y1[3]),
src.at<uchar>(y, x1y0[0]), src.at<uchar>(y, x1y0[1]), src.at<uchar>(y, x1y0[2]), src.at<uchar>(y, x1y0[3]),
src.at<uchar>(y, x1y1[0]), src.at<uchar>(y, x1y1[1]), src.at<uchar>(y, x1y1[2]), src.at<uchar>(y, x1y1[3])
);
__m128i pixel1 = _mm_setr_epi8(
src.at<uchar>(y, x2y0[0]), src.at<uchar>(y, x2y0[1]), src.at<uchar>(y, x2y0[2]), src.at<uchar>(y, x2y0[3]),
src.at<uchar>(y, x2y1[0]), src.at<uchar>(y, x2y1[1]), src.at<uchar>(y, x2y1[2]), src.at<uchar>(y, x2y1[3]),
src.at<uchar>(y, x3y0[0]), src.at<uchar>(y, x3y0[1]), src.at<uchar>(y, x3y0[2]), src.at<uchar>(y, x3y0[3]),
src.at<uchar>(y, x3y1[0]), src.at<uchar>(y, x3y1[1]), src.at<uchar>(y, x3y1[2]), src.at<uchar>(y, x3y1[3])
);
_mm_storeu_si128((__m128i*)pdst, _mm_or_si128(_mm_and_si128(mask, pixel0), _mm_andnot_si128(mask, pixel1)));
pdst += 16;
}
}
}
int main() {
Mat src = imread("lena.jpg", IMREAD_GRAYSCALE);
double angle = 45.0;
double scale = 1.0 / sqrt(2.0);
Point2f center(src.cols / 2.0f, src.rows / 2.0f);
Mat M = getRotationMatrix2D(center, angle, scale);
Mat dst(src.size(), CV_8UC1);
warpAffineInverseSSE(src, dst, M);
imshow("src", src);
imshow("dst", dst);
waitKey();
return 0;
}
```
在上述代码中,warpAffineInverseSSE函数使用SSE指令集加速了warpAffine算子参数WARP_INVERSE_MAP的效果。该函数首先检查输入图像src和变换矩阵M的类型和大小是否正确,然后使用SSE指令集进行图像像素的插值操作。
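具体而言,该函数首先读取变换矩阵M。在WARP_INVERSE_MAP语义下,M直接表示从目标图像坐标 $(x, y)$ 到源图像坐标的映射,即 $x_{src} = M_{00}x + M_{01}y + M_{02}$,$y_{src} = M_{10}x + M_{11}y + M_{12}$,无需再对M求逆。然后,该函数使用SSE指令集为目标图像中的每个像素计算其在源图像中的采样位置,并进行双线性插值操作。最后,该函数将插值结果存储到目标图像中。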
需要注意的是,由于一个128位的SSE寄存器一次最多只能容纳16个8位像素,因此该函数需要在循环中对图像进行分块处理。在上述代码中,每次处理16个像素,因此需要在循环中增加一个步长为16的循环变量x。
另外,该函数只支持单通道8位无符号灰度图像(CV_8UC1),这是本例程实现上的限制,而不是SSE指令集本身的限制。如果需要处理其他类型的图像,请根据需要进行修改。