for (int i = 0; i < grid_y; i++) { for (int j = 0; j < grid_x; j++) { float box_score = sigmoid_x(pdata[4]); ;//获取每一行的box框中含有某个物体的概率 if (box_score >= boxThreshold) { cv::Mat scores(1, className.size(), CV_32FC1, pdata + 5); Point classIdPoint; double max_class_socre; minMaxLoc(scores, 0, &max_class_socre, 0, &classIdPoint); max_class_socre = sigmoid_x(max_class_socre); if (max_class_socre >= classThreshold) { float x = (sigmoid_x(pdata[0]) * 2.f - 0.5f + j) * netStride[stride]; //x float y = (sigmoid_x(pdata[1]) * 2.f - 0.5f + i) * netStride[stride]; //y float w = powf(sigmoid_x(pdata[2]) * 2.f, 2.f) * anchor_w; //w float h = powf(sigmoid_x(pdata[3]) * 2.f, 2.f) * anchor_h; //h int left = (int)(x - 0.5 * w) * ratio_w + 0.5; int top = (int)(y - 0.5 * h) * ratio_h + 0.5; classIds.push_back(classIdPoint.x); confidences.push_back(max_class_socre * box_score); boxes.push_back(Rect(left, top, int(w * ratio_w), int(h * ratio_h))); } pdata += net_width;//下一行 } } }这段如何用neon优化,可以写段示例吗
时间: 2024-02-21 18:57:41 浏览: 109
可以尝试使用NEON指令集来优化该段代码。NEON是ARM体系结构的一种SIMD指令集,可以实现对向量运算的加速。
以下是一种可能的NEON优化示例:
```
#include <arm_neon.h>

#include <algorithm>
#include <cmath>
// ...
// Decodes one YOLO detection grid: every cell holds net_width floats laid out
// as [tx, ty, tw, th, objectness, class logits...]. Results are appended to
// classIds / confidences / boxes exactly like the scalar reference loop.
//
// Where NEON is used:
//   * max over the class logits (vmaxq_f32 + pairwise reduction);
//   * the x/y/w/h arithmetic of an accepted cell.
// NEON has no exp() intrinsic, so the few sigmoid evaluations stay scalar.
// sigmoid is monotonic, so the maximum is taken on raw logits and a single
// sigmoid is applied to the winner.
const auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };

const int num_classes = static_cast<int>(className.size());
const float cell_stride = static_cast<float>(netStride[stride]);

for (int i = 0; i < grid_y; i++) {
    // pdata advances by exactly one cell per iteration, also for rejected cells.
    for (int j = 0; j < grid_x; j++, pdata += net_width) {
        // Objectness: probability that this cell contains any object.
        const float box_score = sigmoid(pdata[4]);
        if (!(box_score >= boxThreshold) || num_classes <= 0) {
            continue;
        }

        // Vectorised max over the class logits, four lanes at a time.
        const float* cls = pdata + 5;
        float best_logit = cls[0];
        int k = 0;
        if (num_classes >= 4) {
            float32x4_t vbest = vld1q_f32(cls);
            for (k = 4; k + 4 <= num_classes; k += 4) {
                vbest = vmaxq_f32(vbest, vld1q_f32(cls + k));
            }
            // Horizontal max; vpmax_f32 exists on both ARMv7 and AArch64.
            float32x2_t half = vpmax_f32(vget_low_f32(vbest), vget_high_f32(vbest));
            half = vpmax_f32(half, half);
            best_logit = vget_lane_f32(half, 0);
        }
        for (; k < num_classes; ++k) {  // scalar tail (fewer than 4 left)
            best_logit = std::max(best_logit, cls[k]);
        }

        const float class_score = sigmoid(best_logit);
        if (!(class_score >= classThreshold)) {
            continue;
        }

        // The index is only needed for accepted cells: first logit equal to the max.
        int class_id = 0;
        while (class_id < num_classes - 1 && cls[class_id] != best_logit) {
            ++class_id;
        }

        // 2 * sigmoid(t) for tx, ty, tw, th in one register.
        const float sig[4] = {sigmoid(pdata[0]), sigmoid(pdata[1]),
                              sigmoid(pdata[2]), sigmoid(pdata[3])};
        const float32x4_t twice = vmulq_n_f32(vld1q_f32(sig), 2.0f);

        // centre = (2*sigmoid - 0.5 + grid index) * stride; lane 0 = x (j), lane 1 = y (i)
        const float grid_pos[2] = {static_cast<float>(j), static_cast<float>(i)};
        float32x2_t xy = vsub_f32(vget_low_f32(twice), vdup_n_f32(0.5f));
        xy = vadd_f32(xy, vld1_f32(grid_pos));
        xy = vmul_n_f32(xy, cell_stride);

        // size = (2*sigmoid)^2 * anchor; lane 0 = w, lane 1 = h
        const float anchor_wh[2] = {static_cast<float>(anchor_w),
                                    static_cast<float>(anchor_h)};
        float32x2_t wh = vget_high_f32(twice);
        wh = vmul_f32(vmul_f32(wh, wh), vld1_f32(anchor_wh));

        const float x = vget_lane_f32(xy, 0);
        const float y = vget_lane_f32(xy, 1);
        const float w = vget_lane_f32(wh, 0);
        const float h = vget_lane_f32(wh, 1);

        // Mirrors the scalar reference, which truncates before scaling.
        // NOTE(review): rounding after scaling was probably intended - confirm.
        const int left = static_cast<int>(static_cast<int>(x - 0.5 * w) * ratio_w + 0.5);
        const int top = static_cast<int>(static_cast<int>(y - 0.5 * h) * ratio_h + 0.5);

        classIds.push_back(class_id);
        confidences.push_back(class_score * box_score);
        boxes.push_back(Rect(left, top, static_cast<int>(w * ratio_w),
                             static_cast<int>(h * ratio_h)));
    }
}
```
需要注意的是,NEON指令集的优化效果取决于处理的数据类型和数据规模。上述示例只使用了NEON的一部分指令;另外,NEON本身并不提供exp、sigmoid或minMaxLoc这类运算的内建指令(intrinsic),这些运算需要自行用多项式近似或标量代码实现。具体的优化方案需要根据实际情况进行调整,并以实测性能数据为准。
阅读全文