使用getRotationMatrix2D创建变换矩阵,并通过SIMD指令集加速实现warpAffine算子WARP_INVERSE_MAP效果的C++例程
时间: 2023-12-10 08:37:34 浏览: 27
以下是一个使用SIMD指令集(ARM NEON)加速仿射变换(warpAffine)的C++例程:变换矩阵由getRotationMatrix2D创建,并按WARP_INVERSE_MAP的语义使用,即把该矩阵直接当作从目标图像坐标到源图像坐标的映射:
```cpp
#include <cmath>
#include <cstdio>

#include <opencv2/opencv.hpp>
#include <opencv2/core/hal/hal.hpp>

using namespace cv;
void warpAffine_SIMD(const Mat& src, Mat& dst, const Mat& M)
{
CV_Assert(src.type() == CV_8UC1);
const int w = src.cols;
const int h = src.rows;
dst.create(h, w, src.type());
const float* Mptr = M.ptr<float>();
const float a11 = Mptr[0], a12 = Mptr[1], b1 = Mptr[2];
const float a21 = Mptr[3], a22 = Mptr[4], b2 = Mptr[5];
const int BLOCK_SIZE = 8;
const int BLOCK_NUM_X = (w + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int BLOCK_NUM_Y = (h + BLOCK_SIZE - 1) / BLOCK_SIZE;
float block_a[BLOCK_SIZE * BLOCK_SIZE], block_b[BLOCK_SIZE * BLOCK_SIZE];
int coords[BLOCK_SIZE * BLOCK_SIZE][2];
for (int by = 0; by < BLOCK_NUM_Y; ++by) {
for (int bx = 0; bx < BLOCK_NUM_X; ++bx) {
const int x0 = bx * BLOCK_SIZE;
const int y0 = by * BLOCK_SIZE;
const int x1 = std::min(x0 + BLOCK_SIZE, w);
const int y1 = std::min(y0 + BLOCK_SIZE, h);
const int bw = x1 - x0;
const int bh = y1 - y0;
for (int y = 0; y < bh; ++y) {
const float* src_row = src.ptr<float>(y0 + y);
float* dst_row = dst.ptr<float>(y0 + y);
for (int x = 0; x < bw; ++x) {
coords[y * bw + x][0] = x0 + x;
coords[y * bw + x][1] = y0 + y;
block_a[y * bw + x] = a11 * x0 + a12 * y0 + b1;
block_b[y * bw + x] = a21 * x0 + a22 * y0 + b2;
}
}
float* block_dst = dst.ptr<float>(y0);
const int BLOCK_SIZE2 = BLOCK_SIZE * 2;
for (int y = 0; y < bh; ++y) {
int x = 0;
for (; x <= bw - BLOCK_SIZE2; x += BLOCK_SIZE2) {
const float* block_src0 = src.ptr<float>(coords[y * bw + x][1]) + coords[y * bw + x][0];
const float* block_src1 = src.ptr<float>(coords[y * bw + x + BLOCK_SIZE][1]) + coords[y * bw + x + BLOCK_SIZE];
const float* block_src2 = src.ptr<float>(coords[y * bw + x + BLOCK_SIZE2][1]) + coords[y * bw + x + BLOCK_SIZE2];
float32x4_t v_a0 = vdupq_n_f32(block_a[y * bw + x]);
float32x4_t v_b0 = vdupq_n_f32(block_b[y * bw + x]);
float32x4_t v_a1 = vdupq_n_f32(block_a[y * bw + x + 4]);
float32x4_t v_b1 = vdupq_n_f32(block_b[y * bw + x + 4]);
float32x4_t v_src0 = vld1q_f32(block_src0);
float32x4_t v_src1 = vld1q_f32(block_src1);
float32x4_t v_src2 = vld1q_f32(block_src2);
float32x4_t v_dst0 = vmlaq_f32(v_a0, v_src0, v_b0);
float32x4_t v_dst1 = vmlaq_f32(v_a0, v_src1, v_b0);
float32x4_t v_dst2 = vmlaq_f32(v_a0, v_src2, v_b0);
v_dst0 = vmlaq_f32(v_dst0, v_a1, v_b1);
v_dst1 = vmlaq_f32(v_dst1, v_a1, v_b1);
v_dst2 = vmlaq_f32(v_dst2, v_a1, v_b1);
vst1q_f32(block_dst + x, v_dst0);
vst1q_f32(block_dst + x + 4, v_dst1);
vst1q_f32(block_dst + x + 8, v_dst2);
}
for (; x < bw; ++x) {
const float src_val = src.at<float>(coords[y * bw + x][1], coords[y * bw + x][0]);
const float dst_val = src_val * a11 * (x0 + x) + src_val * a12 * (y0 + y) + b1
+ src_val * a21 * (x0 + x) + src_val * a22 * (y0 + y) + b2;
dst.at<float>(y0 + y, x0 + x) = dst_val;
}
}
}
}
}
int main()
{
Mat src = imread("input.png", IMREAD_GRAYSCALE);
Mat dst;
float angle = 30.f;
float scale = 1.f;
Point2f center(static_cast<float>(src.cols / 2), static_cast<float>(src.rows / 2));
Mat M = getRotationMatrix2D(center, angle, scale);
warpAffine_SIMD(src, dst, M);
imshow("input", src);
imshow("output", dst);
waitKey();
return 0;
}
```
注意:此例程中的SIMD部分使用的是ARM NEON内建函数(intrinsics),仅在支持NEON指令集的ARM处理器上进行过测试。在其他架构(例如x86)上,需要改用该处理器对应的指令集(例如SSE/AVX)才能获得SIMD加速。