SIMD指令优化warpAffine带完整参数函数例程

以下是使用SIMD指令优化的warpAffine带完整参数函数例程： ```cpp #include <opencv2/opencv.hpp> #include <opencv2/core/hal/hal.hpp> #include <iostream> #include <chrono> using namespace cv; using namespace std; void warpAffine_SIMD(const Mat& src, Mat& dst, const Mat& M, const Size& dsize, int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar()) { CV_Assert(src.type() == CV_8UC1); dst.create(dsize, CV_8UC1); dst.setTo(borderValue); int x = 0, y = 0; int width = dsize.width; int height = dsize.height; int x1, y1, x2, y2, x3, y3, x4, y4; float fx, fy, fx1, fy1, fx2, fy2, fx3, fy3; const uchar* srcData = src.data; int srcStep = src.step; uchar* dstData = dst.data; int dstStep = dst.step; const float* m = reinterpret_cast<const float*>(M.data); float m0 = m[0], m1 = m[1], m2 = m[2]; float m3 = m[3], m4 = m[4], m5 = m[5]; __m128i v_zero = _mm_setzero_si128(); __m128i v_border = _mm_set1_epi8(static_cast<short>(borderValue[0])); __m128i v_mask = _mm_set1_epi8(0x80); for (y = 0; y < height; ++y) { for (x = 0; x < width; x += 16) { __m128i v_dst = _mm_load_si128(reinterpret_cast<const __m128i*>(dstData + y * dstStep + x)); __m128i v_src = v_border; x1 = static_cast<int>(m0 * x + m1 * y + m2); y1 = static_cast<int>(m3 * x + m4 * y + m5); if (x1 >= 0 && y1 >= 0 && x1 + 15 < src.cols && y1 < src.rows) { v_src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y1 * srcStep + x1)); } x2 = static_cast<int>(m0 * (x + 1) + m1 * y + m2); y2 = static_cast<int>(m3 * (x + 1) + m4 * y + m5); if (x2 >= 0 && y2 >= 0 && x2 + 15 < src.cols && y2 < src.rows) { __m128i v_src2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y2 * srcStep + x2)); v_src = _mm_blend_epi16(v_src, v_src2, 0x01); } x3 = static_cast<int>(m0 * x + m1 * (y + 1) + m2); y3 = static_cast<int>(m3 * x + m4 * (y + 1) + m5); if (x3 >= 0 && y3 >= 0 && x3 + 15 < src.cols && y3 < src.rows) { __m128i v_src2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y3 * srcStep + x3)); v_src = _mm_blend_epi16(v_src, v_src2, 0x02); } x4 = static_cast<int>(m0 * (x + 1) + m1 * (y + 1) + m2); y4 = static_cast<int>(m3 * (x + 1) + m4 * (y + 1) + m5); if (x4 >= 0 && y4 >= 0 && x4 + 15 < src.cols && y4 < src.rows) { __m128i v_src2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcData + y4 * srcStep + x4)); v_src = _mm_blend_epi16(v_src, v_src2, 0x03); } __m128i v_result = _mm_blendv_epi8(v_dst, v_src, _mm_cmpgt_epi8(v_mask, v_dst)); _mm_store_si128(reinterpret_cast<__m128i*>(dstData + y * dstStep + x), v_result); } } } int main() { Mat src = imread("test.jpg", IMREAD_GRAYSCALE); if (src.empty()) { cout << "Failed to read image" << endl; return -1; } double angle = 45.0; double scale = 1.0; Point2f center(static_cast<float>(src.cols / 2), static_cast<float>(src.rows / 2)); Mat M = getRotationMatrix2D(center, angle, scale); Mat dst; Size dsize(src.cols, src.rows); auto start = chrono::high_resolution_clock::now(); warpAffine_SIMD(src, dst, M, dsize); auto end = chrono::high_resolution_clock::now(); auto duration = chrono::duration_cast<chrono::milliseconds>(end - start); cout << "Time taken: " << duration.count() << "ms" << endl; imshow("Original Image", src); imshow("Rotated Image", dst); waitKey(0); return 0; } ``` 该函数使用了SSE2指令集，能够对warpAffine函数进行优化，从而提高计算速度。

阅读全文

SIMD指令优化warpAffine带完整参数函数例程

相关推荐

ARM M4core SIMD指令内联函数详解

SIMD指令优化TD-SCDMA系统联合检测算法

Intel SIMD指令深度解析与视频编码优化指南

SIMD指令优化warpAffine函数例程

SIMD指令优化warpAffine例程

使用SIMD指令加速warpAffine带参数WARP_INVERSE_MAP例程C++

使用SIMD指令加速warpAffine带参数WARP_INVERSE_MAP效果变换矩阵为getRotationMatrix2D例程C++

使用getRotationMatrix2D创建变换矩阵通过SIMD指令加速warpAffine带参数WARP_INVERSE_MAP效果例程C++

使用getRotationMatrix2D创建变换矩阵通过检查CPU适合的SIMD指令加速warpAffine带参数WARP_INVERSE_MAP效果例程C++

使用getRotationMatrix2D创建变换矩阵，通过SIMD指令集加速warpAffine算子参数WARP_INVERSE_MAP效果的C++例程

CPU优化warpAffine函数例程

使用getRotationMatrix2D创建变换矩阵，通过SIMD指令集加速warpAffine算子并带参数WARP_INVERSE_MAP效果的C++例程

SIMD指令优化复杂数值条件：实操与多核并行研究

Vectorize.js: 用SIMD向量优化JavaScript函数

精选毕设项目-微笑话.zip

在线教育系统-springboot毕业项目，适合计算机毕-设、实训项目、大作业学习.zip

基于智能推荐的卫生健康系统-springboot毕业项目，适合计算机毕-设、实训项目、大作业学习.zip

精选毕设项目-课程预约.zip

同步机(VSG)三相并网仿真模型 有功功率从20k突变到10k再恢复至20k 系统始终稳定运行 该仿真主要用于基础原理的学习

南京理工大学毕业论文overleaf LaTex模板，微调版

大家在看

alertmanager-0.19.0.linux-amd64.tar.gz

5G分组核心网专题.pptx

LTE Signaling & Protocol Analysis Focus: E-UTRAN and UE

r3epthook-master.zip

LITE-ON FW spec PS-2801-9L rev A01_20161118.pdf

最新推荐

aarch64 完整汇编指令集

精选毕设项目-微笑话.zip

在线教育系统-springboot毕业项目，适合计算机毕-设、实训项目、大作业学习.zip

基于智能推荐的卫生健康系统-springboot毕业项目，适合计算机毕-设、实训项目、大作业学习.zip

免安装JDK 1.8.0_241：即刻配置环境运行

管理建模和仿真的文件

【提升效率与稳定性】：深入掌握单相整流器的控制策略

你看这是ashx映射的cs文件初始代码,你看这里边根本就没有写对action参数进行任何操作但你.ashx?action=submit这样去做他就能返回出数据这是为什么

机器学习预测葡萄酒评分：二值化品尝笔记的应用

"互动学习：行动中的多样性与论文攻读经历"

同步机(VSG)三相并网仿真模型有功功率从20k突变到10k再恢复至20k 系统始终稳定运行该仿真主要用于基础原理的学习