使用SIMD指令优化的warpAffine函数例程（带完整参数）
时间: 2023-11-18 10:03:13 浏览: 135
以下是使用SIMD指令优化的warpAffine带完整参数函数例程:
```cpp
#include <emmintrin.h>

#include <chrono>
#include <iostream>

#include <opencv2/opencv.hpp>
#include <opencv2/core/hal/hal.hpp>
using namespace cv;
using namespace std;
void warpAffine_SIMD(const Mat& src, Mat& dst, const Mat& M, const Size& dsize, int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar())
{
CV_Assert(src.type() == CV_8UC1);
dst.create(dsize, CV_8UC1);
dst.setTo(borderValue);
int x = 0, y = 0;
int width = dsize.width;
int height = dsize.height;
int x1, y1, x2, y2, x3, y3, x4, y4;
float fx, fy, fx1, fy1, fx2, fy2, fx3, fy3;
const uchar* srcData = src.data;
int srcStep = src.step;
uchar* dstData = dst.data;
int dstStep = dst.step;
const float* m = reinterpret_cast<const float*>(M.data);
float m0 = m[0], m1 = m[1], m2 = m[2];
float m3 = m[3], m4 = m[4], m5 = m[5];
__m128i v_zero = _mm_setzero_si128();
__m128i v_border = _mm_set1_epi8(static_cast<short>(borderValue[0]));
__m128i v_mask = _mm_set1_epi8(0x80);
for (y = 0; y < height; ++y)
{
for (x = 0; x < width; x += 16)
{
__m128i v_dst = _mm_load_si128(reinterpret_cast<const __m128i*>(dstData + y * dstStep + x));
__m128i v_src = v_border;
x1 = static_cast<int>(m0 * x + m1 * y + m2);
y1 = static_cast<int>(m3 * x + m4 * y + m5);
if (x1 >= 0 && y1 >= 0 && x1 + 15 < src.cols && y1 < src.rows)
{
v_src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y1 * srcStep + x1));
}
x2 = static_cast<int>(m0 * (x + 1) + m1 * y + m2);
y2 = static_cast<int>(m3 * (x + 1) + m4 * y + m5);
if (x2 >= 0 && y2 >= 0 && x2 + 15 < src.cols && y2 < src.rows)
{
__m128i v_src2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y2 * srcStep + x2));
v_src = _mm_blend_epi16(v_src, v_src2, 0x01);
}
x3 = static_cast<int>(m0 * x + m1 * (y + 1) + m2);
y3 = static_cast<int>(m3 * x + m4 * (y + 1) + m5);
if (x3 >= 0 && y3 >= 0 && x3 + 15 < src.cols && y3 < src.rows)
{
__m128i v_src2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y3 * srcStep + x3));
v_src = _mm_blend_epi16(v_src, v_src2, 0x02);
}
x4 = static_cast<int>(m0 * (x + 1) + m1 * (y + 1) + m2);
y4 = static_cast<int>(m3 * (x + 1) + m4 * (y + 1) + m5);
if (x4 >= 0 && y4 >= 0 && x4 + 15 < src.cols && y4 < src.rows)
{
__m128i v_src2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y4 * srcStep + x4));
v_src = _mm_blend_epi16(v_src, v_src2, 0x03);
}
__m128i v_result = _mm_blendv_epi8(v_dst, v_src, _mm_cmpgt_epi8(v_mask, v_dst));
_mm_store_si128(reinterpret_cast<__m128i*>(dstData + y * dstStep + x), v_result);
}
}
}
int main()
{
Mat src = imread("test.jpg", IMREAD_GRAYSCALE);
if (src.empty())
{
cout << "Failed to read image" << endl;
return -1;
}
double angle = 45.0;
double scale = 1.0;
Point2f center(static_cast<float>(src.cols / 2), static_cast<float>(src.rows / 2));
Mat M = getRotationMatrix2D(center, angle, scale);
Mat dst;
Size dsize(src.cols, src.rows);
auto start = chrono::high_resolution_clock::now();
warpAffine_SIMD(src, dst, M, dsize);
auto end = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(end - start);
cout << "Time taken: " << duration.count() << "ms" << endl;
imshow("Original Image", src);
imshow("Rotated Image", dst);
waitKey(0);
return 0;
}
```
该函数使用了 x86 的 SSE 系列 SIMD 指令，需要在支持相应指令集的 x86/x64 平台上编译运行；实际的加速效果取决于图像尺寸、变换类型和编译选项，建议与 cv::warpAffine 对比实测后再下结论。
阅读全文