for (int i = 0; i < grid_y; i++) { for (int j = 0; j < grid_x; j++) { float box_score = sigmoid_x(pdata[4]); ;//获取每一行的box框中含有某个物体的概率 if (box_score >= boxThreshold) { cv::Mat scores(1, className.size(), CV_32FC1, pdata + 5); Point classIdPoint; double max_class_socre; minMaxLoc(scores, 0, &max_class_socre, 0, &classIdPoint); max_class_socre = sigmoid_x(max_class_socre); if (max_class_socre >= classThreshold) { float x = (sigmoid_x(pdata[0]) * 2.f - 0.5f + j) * netStride[stride]; //x float y = (sigmoid_x(pdata[1]) * 2.f - 0.5f + i) * netStride[stride]; //y float w = powf(sigmoid_x(pdata[2]) * 2.f, 2.f) * anchor_w; //w float h = powf(sigmoid_x(pdata[3]) * 2.f, 2.f) * anchor_h; //h int left = (int)(x - 0.5 * w) * ratio_w + 0.5; int top = (int)(y - 0.5 * h) * ratio_h + 0.5; classIds.push_back(classIdPoint.x); confidences.push_back(max_class_socre * box_score); boxes.push_back(Rect(left, top, int(w * ratio_w), int(h * ratio_h))); } pdata += net_width;//下一行 } } } 这段代码如何用 NEON 优化?可以写一段示例吗?

时间: 2024-02-21 20:57:41 浏览: 19
可以尝试使用NEON指令集来优化该段代码。NEON是ARM体系结构的一种SIMD指令集,可以实现对向量运算的加速。 以下是一种可能的NEON优化示例: ``` #include <arm_neon.h> // ... for (int i = 0; i < grid_y; i++) { for (int j = 0; j < grid_x; j++) { // Load data into NEON registers float32x4_t pdata_neon = vld1q_f32(pdata); float32x4_t scores_neon = vld1q_f32(pdata + 5); // Compute sigmoid_x for pdata_neon pdata_neon = vnegq_f32(pdata_neon); pdata_neon = vexpq_f32(pdata_neon); pdata_neon = vaddq_f32(pdata_neon, vdupq_n_f32(1)); pdata_neon = vrecpeq_f32(pdata_neon); pdata_neon = vmulq_f32(pdata_neon, vdupq_n_f32(2)); pdata_neon = vsubq_f32(pdata_neon, vdupq_n_f32(0.5f)); // Compute box_score using the first element of pdata_neon float32x2_t box_score_neon = vget_low_f32(pdata_neon); box_score_neon = vsigmoid_x_f32(box_score_neon); box_score_neon = vmul_f32(box_score_neon, vdup_n_f32(1.0f / 255)); float box_score = vget_lane_f32(box_score_neon, 0); if (box_score >= boxThreshold) { // Compute max_class_socre and classIdPoint using scores_neon float32x2_t max_class_socre_neon; int32x2_t classIdPoint_neon; vMinMaxLoc(scores_neon, &max_class_socre_neon, NULL, &classIdPoint_neon, NULL); max_class_socre_neon = vsigmoid_x_f32(max_class_socre_neon); max_class_socre_neon = vmul_f32(max_class_socre_neon, vdup_n_f32(1.0f / 255)); double max_class_socre = vgetq_lane_f32(max_class_socre_neon, 0); Point classIdPoint(classIdPoint_neon[0], 0); if (max_class_socre >= classThreshold) { // Compute x, y, w, h using pdata_neon float32x2_t xy_neon = vget_low_f32(pdata_neon); float32x2_t wh_neon = vget_high_f32(pdata_neon); xy_neon = vsigmoid_x_f32(xy_neon); wh_neon = vexp_f32(wh_neon); wh_neon = vmul_f32(wh_neon, vdup_n_f32(2)); wh_neon = vmul_f32(wh_neon, vdup_n_f32(anchor_w)); xy_neon = vsub_f32(xy_neon, vdup_n_f32(0.5f)); xy_neon = vadd_f32(xy_neon, vdup_n_f32(j)); xy_neon = vmul_f32(xy_neon, vdup_n_f32(netStride[stride])); xy_neon = vmul_f32(xy_neon, vdup_n_f32(ratio_w)); wh_neon = vmul_f32(wh_neon, vdup_n_f32(anchor_h)); wh_neon = vmul_f32(wh_neon, 
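vdup_n_f32(ratio_h)); float x = vget_lane_f32(xy_neon, 0); float y = vget_lane_f32(xy_neon, 1); float w = vget_lane_f32(wh_neon, 0); float h = vget_lane_f32(wh_neon, 1); int left = (int)(x - 0.5 * w) + 0.5; int top = (int)(y - 0.5 * h) + 0.5; classIds.push_back(classIdPoint.x); confidences.push_back(max_class_socre * box_score); boxes.push_back(Rect(left, top, int(w), int(h))); } } pdata += 4 * net_width; } } ``` 需要注意的是,NEON指令集的优化效果取决于处理的数据类型和数据大小。上述示例中只使用了NEON的部分指令,具体的优化方案需要根据实际情况进行调整和优化。另外,上述示例只能视为思路示意,不能直接编译运行:`vexpq_f32`、`vexp_f32`、`vsigmoid_x_f32` 和 `vMinMaxLoc` 都不是 arm_neon.h 提供的内建函数,需要自行实现(例如用多项式近似实现向量化的 exp/sigmoid);`vrecpeq_f32` 只给出低精度的倒数估计值,通常还需要配合 `vrecpsq_f32` 做牛顿迭代来提高精度。此外,示例与原始代码的计算并不等价,例如原始代码中没有 1/255 的缩放,并且指针每次前进的步长是 net_width 而不是 4 * net_width,使用前必须逐项对照原始代码核对结果。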

相关推荐

请你解析下列代码#include <iostream>#include <vector>#include <cstdlib>#include <ctime>#include <chrono>#include <thread>class Grid {public: Grid(int width, int height) : width_(width), height_(height) { grid_.resize(width_ * height_); for (int i = 0; i < grid_.size(); ++i) { grid_[i] = rand() % 2; } } void update() { std::vector<int> new_grid(grid_.size()); for (int i = 0; i < height_; ++i) { for (int j = 0; j < width_; ++j) { int count = live_neighbors(j, i); int index = i * width_ + j; if (count == 3 || (count == 2 && grid_[index])) { new_grid[index] = 1; } else { new_grid[index] = 0; } } } grid_ = new_grid; } void print() { for (int i = 0; i < height_; ++i) { for (int j = 0; j < width_; ++j) { int index = i * width_ + j; if (grid_[index]) { std::cout << "#"; } else { std::cout << " "; } } std::cout << std::endl; } }private: int live_neighbors(int x, int y) { int count = 0; for (int j = -1; j <= 1; ++j) { for (int i = -1; i <= 1; ++i) { int col = (x + i + width_) % width_; int row = (y + j + height_) % height_; int index = row * width_ + col; count += grid_[index]; } } count -= grid_[y * width_ + x]; return count; } int width_; int height_; std::vector<int> grid_;};int main() { srand(time(nullptr)); int width, height; std::cout << "Enter grid width: "; std::cin >> width; std::cout << "Enter grid height: "; std::cin >> height; Grid grid(width, height); while (true) { grid.print(); std::this_thread::sleep_for(std::chrono::milliseconds(500)); grid.update(); } return 0;}

解释代码 static int process(int8_t* input, int* anchor, int grid_h, int grid_w, int height, int width, int stride, std::vector<float>& boxes, std::vector<float>& objProbs, std::vector<int>& classId, float threshold, int32_t zp, float scale) { int validCount = 0; int grid_len = grid_h * grid_w; float thres = unsigmoid(threshold); int8_t thres_i8 = qnt_f32_to_affine(thres, zp, scale); for (int a = 0; a < 3; a++) { for (int i = 0; i < grid_h; i++) { for (int j = 0; j < grid_w; j++) { int8_t box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j]; if (box_confidence >= thres_i8) { int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; int8_t* in_ptr = input + offset; float box_x = sigmoid(deqnt_affine_to_f32(*in_ptr, zp, scale)) * 2.0 - 0.5; float box_y = sigmoid(deqnt_affine_to_f32(in_ptr[grid_len], zp, scale)) * 2.0 - 0.5; float box_w = sigmoid(deqnt_affine_to_f32(in_ptr[2 * grid_len], zp, scale)) * 2.0; float box_h = sigmoid(deqnt_affine_to_f32(in_ptr[3 * grid_len], zp, scale)) * 2.0; box_x = (box_x + j) * (float)stride; box_y = (box_y + i) * (float)stride; box_w = box_w * box_w * (float)anchor[a * 2]; box_h = box_h * box_h * (float)anchor[a * 2 + 1]; box_x -= (box_w / 2.0); box_y -= (box_h / 2.0); boxes.push_back(box_x); //push_back() 在Vector最后添加一个元素 boxes.push_back(box_y); boxes.push_back(box_w); boxes.push_back(box_h); int8_t maxClassProbs = in_ptr[5 * grid_len]; int maxClassId = 0; for (int k = 1; k < OBJ_CLASS_NUM; ++k) { int8_t prob = in_ptr[(5 + k) * grid_len]; if (prob > maxClassProbs) { maxClassId = k; maxClassProbs = prob; } } objProbs.push_back(sigmoid(deqnt_affine_to_f32(maxClassProbs, zp, scale))); classId.push_back(maxClassId); validCount++; } } } } return validCount; }

解释代码:static int process(int8_t* input, int* anchor, int grid_h, int grid_w, int height, int width, int stride, std::vector<float>& boxes, std::vector<float>& objProbs, std::vector<int>& classId, float threshold, int32_t zp, float scale) { int validCount = 0; int grid_len = grid_h * grid_w; float thres = unsigmoid(threshold); int8_t thres_i8 = qnt_f32_to_affine(thres, zp, scale); for (int a = 0; a < 3; a++) { for (int i = 0; i < grid_h; i++) { for (int j = 0; j < grid_w; j++) { int8_t box_confidence = input[(PROP_BOX_SIZE * a + 4) * grid_len + i * grid_w + j]; if (box_confidence >= thres_i8) { int offset = (PROP_BOX_SIZE * a) * grid_len + i * grid_w + j; int8_t* in_ptr = input + offset; float box_x = sigmoid(deqnt_affine_to_f32(*in_ptr, zp, scale)) * 2.0 - 0.5; float box_y = sigmoid(deqnt_affine_to_f32(in_ptr[grid_len], zp, scale)) * 2.0 - 0.5; float box_w = sigmoid(deqnt_affine_to_f32(in_ptr[2 * grid_len], zp, scale)) * 2.0; float box_h = sigmoid(deqnt_affine_to_f32(in_ptr[3 * grid_len], zp, scale)) * 2.0; box_x = (box_x + j) * (float)stride; box_y = (box_y + i) * (float)stride; box_w = box_w * box_w * (float)anchor[a * 2]; box_h = box_h * box_h * (float)anchor[a * 2 + 1]; box_x -= (box_w / 2.0); box_y -= (box_h / 2.0); int8_t maxClassProbs = in_ptr[5 * grid_len]; int maxClassId = 0; for (int k = 1; k < OBJ_CLASS_NUM; ++k) { int8_t prob = in_ptr[(5 + k) * grid_len]; if (prob > maxClassProbs) { maxClassId = k; maxClassProbs = prob; } } if (maxClassProbs>thres_i8){ objProbs.push_back(sigmoid(deqnt_affine_to_f32(maxClassProbs, zp, scale))* sigmoid(deqnt_affine_to_f32(box_confidence, zp, scale))); classId.push_back(maxClassId); validCount++; boxes.push_back(box_x); boxes.push_back(box_y); boxes.push_back(box_w); boxes.push_back(box_h); } } } } } return validCount; }

最新推荐

recommend-type

华中科技大学电信专业 课程资料 作业 代码 实验报告-数据结构-内含源码和说明书.zip

华中科技大学电信专业 课程资料 作业 代码 实验报告-数据结构-内含源码和说明书.zip
recommend-type

java 游戏飞翔的小鸟

java 制作游戏 飞翔的小鸟
recommend-type

zigbee-cluster-library-specification

最新的zigbee-cluster-library-specification说明文档。
recommend-type

管理建模和仿真的文件

管理 Boualem Benatallah。引用此版本:布阿利姆·贝纳塔拉(Boualem Benatallah)。管理建模和仿真。约瑟夫-傅立叶大学-格勒诺布尔第一大学,1996年。法语。NNT:tel-00345357。HAL Id:tel-00345357。https://theses.hal.science/tel-00345357 。2008年12月9日提交。HAL是一个多学科的开放存取档案库,用于存放和传播科学研究论文,无论它们是否已发表。论文可以来自法国或国外的教学和研究机构,也可以来自公共或私人研究中心。L’archive ouverte pluridisciplinaire
recommend-type

实现实时数据湖架构:Kafka与Hive集成

![实现实时数据湖架构:Kafka与Hive集成](https://img-blog.csdnimg.cn/img_convert/10eb2e6972b3b6086286fc64c0b3ee41.jpeg) # 1. 实时数据湖架构概述 实时数据湖是一种现代数据管理架构,它允许企业以低延迟的方式收集、存储和处理大量数据。与传统数据仓库不同,实时数据湖不依赖于预先定义的模式,而是采用灵活的架构,可以处理各种数据类型和格式。这种架构为企业提供了以下优势: - **实时洞察:**实时数据湖允许企业访问最新的数据,从而做出更明智的决策。 - **数据民主化:**实时数据湖使各种利益相关者都可
recommend-type

用 Python 画一个可以动的爱心

好的,我可以帮助您使用Python来绘制一个动态的爱心。您可以使用turtle库来实现。以下是实现代码: ```python import turtle import math # 设置画布和画笔 canvas = turtle.Screen() canvas.bgcolor("black") pencil = turtle.Turtle() pencil.speed(0) pencil.color("red", "pink") pencil.pensize(3) # 定义爱心函数 def draw_love(heart_size, x_offset=0, y_offset=0):
recommend-type

JSBSim Reference Manual

JSBSim参考手册,其中包含JSBSim简介,JSBSim配置文件xml的编写语法,编程手册以及一些应用实例等。其中有部分内容还没有写完,估计有生之年很难看到完整版了,但是内容还是很有参考价值的。
recommend-type

"互动学习:行动中的多样性与论文攻读经历"

多样性她- 事实上SCI NCES你的时间表ECOLEDO C Tora SC和NCESPOUR l’Ingén学习互动,互动学习以行动为中心的强化学习学会互动,互动学习,以行动为中心的强化学习计算机科学博士论文于2021年9月28日在Villeneuve d'Asq公开支持马修·瑟林评审团主席法布里斯·勒菲弗尔阿维尼翁大学教授论文指导奥利维尔·皮耶昆谷歌研究教授:智囊团论文联合主任菲利普·普雷教授,大学。里尔/CRISTAL/因里亚报告员奥利维耶·西格德索邦大学报告员卢多维奇·德诺耶教授,Facebook /索邦大学审查员越南圣迈IMT Atlantic高级讲师邀请弗洛里安·斯特鲁布博士,Deepmind对于那些及时看到自己错误的人...3谢谢你首先,我要感谢我的两位博士生导师Olivier和Philippe。奥利维尔,"站在巨人的肩膀上"这句话对你来说完全有意义了。从科学上讲,你知道在这篇论文的(许多)错误中,你是我可以依
recommend-type

实现实时监控告警系统:Kafka与Grafana整合

![实现实时监控告警系统:Kafka与Grafana整合](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9tbWJpei5xcGljLmNuL21tYml6X2pwZy9BVldpY3ladXVDbEZpY1pLWmw2bUVaWXFUcEdLT1VDdkxRSmQxZXB5R1lxaWNlUjA2c0hFek5Qc3FyRktudFF1VDMxQVl3QTRXV2lhSWFRMEFRc0I1cW1ZOGcvNjQw?x-oss-process=image/format,png) # 1.1 Kafka集群架构 Kafka集群由多个称为代理的服务器组成,这
recommend-type

已知n个人(以编号0,1,2,3...n-1分别表示)围坐在一张圆桌周围。从编号为0的人开始报数1,数到m的那个人出列;他的下一个人又从1开始报数,数到m+1的那个人又出列(每次报数值加1);依此规律重复下去,直到圆桌周围的人全部出列。用递归方法解决

这个问题可以使用递归方法解决。下面是一个思路: 1. 定义一个函数,接收三个参数:n、m、i,表示还剩下n个人,每次数到m时出列,当前报数的人是i; 2. 如果n=1,返回i,即最后留下的那个人的编号; 3. 否则,计算出下一个出列的人的编号j,通过递归调用函数解决n-1个人的问题,其结果为k; 4. 如果k < j,即当前i之后出列的人的编号为k,需要将k转换为在i之前出列的编号,返回值为 k+(n-1); 5. 如果k>=j,即当前i之后出列的人的编号为k,返回值为 k-(j-1); 下面是对应的Python代码: ```python def josephus(n, m, i):