void Extract1DEdgeCircle::GetProfieMat() { if (m_mInputMat.empty()) { return; } if (m_mInputMat.channels() > 1) { cvtColor(m_mInputMat, m_mInputMat, COLOR_BGR2GRAY); } //Get ROI mat. RotatedRect rMaskRegion(m_pdCenter, Size2f(GetPPDistance(m_pdStart, m_pdEnd) + 10, m_dLength + 10), m_dAngle); Point2f rRegionPoints[4]; rMaskRegion.points(rRegionPoints); Mat mask = Mat::zeros(m_mInputMat.size(), CV_8UC1); Point ppt[] = { rRegionPoints[0], rRegionPoints[1], rRegionPoints[2], rRegionPoints[3] }; const Point* pts[] = { ppt }; int npt[] = { 4 }; fillPoly(mask, pts, npt, 1, Scalar::all(255), 8); Mat RoiMat = Mat::zeros(m_mInputMat.size(), m_mInputMat.type()); bitwise_and(m_mInputMat, m_mInputMat, RoiMat, mask); Mat RotateMat = getRotationMatrix2D(m_pdCenter, -m_dAngle, 1); warpAffine(RoiMat, RoiMat, RotateMat, m_mInputMat.size(), WARP_INVERSE_MAP); Mat newCenter = RotateMat * (Mat_<double>(3, 1) << m_pdCenter.x, m_pdCenter.y, 1); double x = newCenter.at<double>(0, 0); double y = newCenter.at<double>(1, 0); Mat M = (Mat_<double>(2, 3) << 1, 0, x - m_dLength * 0.5, 0, 1, y - m_dHeight * 0.5); warpAffine(RoiMat, m_mInputMat, M, Size2d(m_dLength, m_dHeight), WARP_INVERSE_MAP); }这段代码如何使用AVX2指令集加速
时间: 2024-02-27 11:56:13 浏览: 203
To use AVX2 instructions to accelerate this code, we need to identify the parts of the code that can be parallelized and vectorized. One potential candidate is the image warping operations (i.e., `warpAffine` function calls).
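AVX2 vectorization happens within a single thread, so it can be combined with multithreading: we can use the `cv::parallel_for_` function to split the rows of the output image across threads, and then vectorize the per-pixel work inside each row range with AVX2 instructions.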
Next, we need to vectorize the code inside the loop using AVX2 instructions. We can use the `cv::v_load` function to load consecutive pixels into a vector register (a 256-bit AVX2 register holds 8 values once each pixel is widened to 32 bits), and the `cv::v_gather` function to gather non-consecutive pixels into an AVX2 register. We can then perform the necessary arithmetic operations using AVX2 instructions and store the results back to memory using the `cv::v_store` function.
Here is an example of how the code inside the loop can be vectorized using AVX2 instructions:
```cpp
// Illustrative sketch only: it walks every row in steps of 8 pixels and shows
// the intended stages (load, coordinate computation, gather, pack, store).
// NOTE(review): src, dst, vx_step, vx_offset, vy_step, vy_offset and src_step
// are used below but never declared in this fragment; the surrounding code
// must provide them before this can compile.
// Lane indices 0..7 (the last argument of _mm256_set_epi32 is the lowest lane).
__m256i vindex = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
for (int i = 0; i < src.rows; i++)
{
// Row base pointers for the current source and destination rows.
uchar* src_ptr = src.ptr<uchar>(i);
uchar* dst_ptr = dst.ptr<uchar>(i);
// NOTE(review): there is no scalar tail loop, so when src.cols is not a
// multiple of 8 the last iteration addresses pixels past the end of the row.
for (int j = 0; j < src.cols; j += 8)
{
// NOTE(review): this vsrc is never read afterwards, and the name is declared
// again in the same scope at the pack step below, which is a redeclaration
// error. cv::v_load also presumably returns an OpenCV vector type rather than
// a raw __m256i -- confirm.
__m256i vsrc = cv::v_load(src_ptr + j);
// Per-lane coordinates: index * step + offset, for x and for y.
// NOTE(review): _mm256_cvtepu8_epi32 is documented as taking a __m128i of
// bytes, while vindex is already a __m256i of 32-bit lanes; _mm256_mul_epu32
// multiplies only the low 32 bits of each 64-bit lane (_mm256_mullo_epi32 is
// the per-32-bit-lane multiply). Neither j nor i enters the computation, so
// every block of 8 pixels gets the same coordinates -- confirm intent.
__m256i vx = _mm256_add_epi32(_mm256_mul_epu32(_mm256_cvtepu8_epi32(vindex), vx_step), vx_offset);
__m256i vy = _mm256_add_epi32(_mm256_mul_epu32(_mm256_cvtepu8_epi32(vindex), vy_step), vy_offset);
// Split each coordinate vector into its low and high 128-bit halves and widen
// the four 32-bit values of each half to 64 bits.
__m256i vx_lo = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(vx, 0));
__m256i vx_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(vx, 1));
__m256i vy_lo = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(vy, 0));
__m256i vy_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(vy, 1));
// Narrow the widened halves back to 32 bits.
// NOTE(review): _mm256_cvtepi64_epi32 appears to be an AVX-512 (F + VL)
// intrinsic that returns __m128i, so it is not available with AVX2 alone and
// does not match the __m256i targets here; the widen/narrow round trip also
// looks redundant -- confirm.
__m256i vx_lo_32 = _mm256_cvtepi64_epi32(vx_lo);
__m256i vx_hi_32 = _mm256_cvtepi64_epi32(vx_hi);
__m256i vy_lo_32 = _mm256_cvtepi64_epi32(vy_lo);
__m256i vy_hi_32 = _mm256_cvtepi64_epi32(vy_hi);
// Four gathers, one per combination of low/high x half with low/high y half.
// NOTE(review): a cv::v_gather with this signature could not be located in
// OpenCV's universal intrinsics; verify it exists or substitute a real gather
// such as _mm256_i32gather_epi32 on precomputed byte offsets.
__m256i vsrc00 = cv::v_gather(src_ptr, src_step, vx_lo_32, vy_lo_32, _mm256_setzero_si256(), 1);
__m256i vsrc01 = cv::v_gather(src_ptr, src_step, vx_hi_32, vy_lo_32, _mm256_setzero_si256(), 1);
__m256i vsrc10 = cv::v_gather(src_ptr, src_step, vx_lo_32, vy_hi_32, _mm256_setzero_si256(), 1);
__m256i vsrc11 = cv::v_gather(src_ptr, src_step, vx_hi_32, vy_hi_32, _mm256_setzero_si256(), 1);
// Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned
// saturation). NOTE(review): both pack instructions work per 128-bit lane, so
// the resulting byte order is interleaved rather than sequential -- confirm.
__m256i vsrc0 = _mm256_packs_epi32(vsrc00, vsrc01);
__m256i vsrc1 = _mm256_packs_epi32(vsrc10, vsrc11);
__m256i vsrc = _mm256_packus_epi16(vsrc0, vsrc1);
// NOTE(review): the packed register holds 32 bytes while j advances by only 8,
// so a full-width store here would overlap the following iterations -- confirm.
cv::v_store(dst_ptr + j, vsrc);
}
}
```
Note that this is just an example, and the actual implementation may depend on the specifics of the code and the hardware platform.
阅读全文