MADDPG with prioritized experience replay: TensorFlow code

Below is example code that implements the MADDPG algorithm combined with prioritized experience replay using TensorFlow (1.x graph API):

```python
import random

import numpy as np
import tensorflow as tf


# Decentralized actor network (one per agent)
class ActorNetwork:
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        # online actor network
        num_vars = len(tf.trainable_variables())
        self.inputs, self.out, self.scaled_out = self.create_actor_network()
        self.network_params = tf.trainable_variables()[num_vars:]

        # target actor network
        self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network()
        self.target_network_params = tf.trainable_variables()[num_vars + len(self.network_params):]

        # soft update: theta_target <- tau * theta + (1 - tau) * theta_target
        self.update_target_network_params = [
            self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau) +
                tf.multiply(self.target_network_params[i], 1. - self.tau))
            for i in range(len(self.target_network_params))]

        # gradient of the critic's Q value w.r.t. this agent's action, supplied from outside
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])
        # current batch size, used to average the policy gradient
        self.batch_size = tf.placeholder(tf.float32, None)

        # deterministic policy gradient: dQ/da * da/dtheta, averaged over the batch
        self.unnormalized_actor_gradients = tf.gradients(
            self.scaled_out, self.network_params, -self.action_gradient)
        self.actor_gradients = [tf.div(g, self.batch_size) for g in self.unnormalized_actor_gradients]

        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(
            zip(self.actor_gradients, self.network_params))

    # small fully connected policy network
    def create_actor_network(self):
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
        w1 = tf.Variable(tf.random_normal([self.s_dim, 64]))
        b1 = tf.Variable(tf.random_normal([64]))
        net = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        w2 = tf.Variable(tf.random_normal([64, 32]))
        b2 = tf.Variable(tf.random_normal([32]))
        net = tf.nn.relu(tf.matmul(net, w2) + b2)
        w3 = tf.Variable(tf.random_normal([32, self.a_dim]))
        b3 = tf.Variable(tf.random_normal([self.a_dim]))
        # tanh bounds the output so scaling by action_bound yields actions in [-bound, bound]
        out = tf.tanh(tf.matmul(net, w3) + b3)
        scaled_out = tf.multiply(out, self.action_bound)
        return inputs, out, scaled_out

    # apply the sampled policy gradient for one batch
    def actor_gradient(self, inputs, action_gradients, batch_size):
        self.sess.run(self.optimizer, feed_dict={
            self.inputs: inputs,
            self.action_gradient: action_gradients,
            self.batch_size: batch_size
        })

    # action from the online network
    def predict(self, inputs):
        return self.sess.run(self.scaled_out, feed_dict={self.inputs: inputs})

    # action from the target network
    def predict_target(self, inputs):
        return self.sess.run(self.target_scaled_out, feed_dict={self.target_inputs: inputs})

    # soft-update the target network
    def update_target_network(self):
        self.sess.run(self.update_target_network_params)

    # read the target network parameters
    def get_target_network_params(self):
        return self.sess.run(self.target_network_params)

    # save the network parameters
    def save_network(self, save_path):
        saver = tf.train.Saver()
        saver.save(self.sess, save_path)

    # load the network parameters
    def load_network(self, load_path):
        saver = tf.train.Saver()
        saver.restore(self.sess, load_path)


# Centralized critic network (one per agent, conditioned on the joint action)
class CriticNetwork:
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_agents):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim          # joint action dimension (num_agents * per-agent action_dim)
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma
        self.num_agents = num_agents

        # online critic network
        num_vars = len(tf.trainable_variables())
        self.inputs, self.action, self.out = self.create_critic_network()
        self.network_params = tf.trainable_variables()[num_vars:]

        # target critic network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
        self.target_network_params = tf.trainable_variables()[num_vars + len(self.network_params):]

        # soft update of the target network
        self.update_target_network_params = [
            self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau) +
                tf.multiply(self.target_network_params[i], 1. - self.tau))
            for i in range(len(self.target_network_params))]

        # TD target and per-sample importance-sampling weights (from prioritized replay)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])
        self.is_weights = tf.placeholder(tf.float32, [None, 1])
        self.loss = tf.reduce_mean(self.is_weights * tf.square(self.predicted_q_value - self.out))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # gradient of Q w.r.t. the action input, used by the actor update
        self.action_gradients = tf.gradients(self.out, self.action)

    # small fully connected Q network; the action enters at the second hidden layer
    def create_critic_network(self):
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
        action = tf.placeholder(tf.float32, [None, self.a_dim])
        w1 = tf.Variable(tf.random_normal([self.s_dim, 64]))
        b1 = tf.Variable(tf.random_normal([64]))
        net = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        w2 = tf.Variable(tf.random_normal([64, 32]))
        w2a = tf.Variable(tf.random_normal([self.a_dim, 32]))
        b2 = tf.Variable(tf.random_normal([32]))
        net = tf.nn.relu(tf.matmul(net, w2) + tf.matmul(action, w2a) + b2)
        w3 = tf.Variable(tf.random_normal([32, 1]))
        b3 = tf.Variable(tf.random_normal([1]))
        out = tf.matmul(net, w3) + b3
        return inputs, action, out

    # dQ/da, fed back to the actor update
    def critic_gradient(self, inputs, action):
        return self.sess.run(self.action_gradients, feed_dict={
            self.inputs: inputs,
            self.action: action
        })[0]

    # one gradient step on the (importance-weighted) TD loss
    def train(self, inputs, action, predicted_q_value, is_weights):
        self.sess.run(self.optimizer, feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.predicted_q_value: predicted_q_value,
            self.is_weights: is_weights
        })

    # Q value from the online network
    def predict(self, inputs, action):
        return self.sess.run(self.out, feed_dict={self.inputs: inputs, self.action: action})

    # Q value from the target network
    def predict_target(self, inputs, action):
        return self.sess.run(self.target_out, feed_dict={
            self.target_inputs: inputs, self.target_action: action})

    # soft-update the target network
    def update_target_network(self):
        self.sess.run(self.update_target_network_params)

    # read the target network parameters
    def get_target_network_params(self):
        return self.sess.run(self.target_network_params)

    # save the network parameters
    def save_network(self, save_path):
        saver = tf.train.Saver()
        saver.save(self.sess, save_path)

    # load the network parameters
    def load_network(self, load_path):
        saver = tf.train.Saver()
        saver.restore(self.sess, load_path)


# Proportional prioritized experience replay buffer
class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.buffer = []
        self.priorities = np.zeros((buffer_size,), dtype=np.float32)
        self.pos = 0

    def __len__(self):
        return len(self.buffer)

    # store a transition with the current maximum priority so it is sampled at least once
    def add(self, state, action, reward, next_state, done):
        max_prio = self.priorities[:len(self.buffer)].max() if self.buffer else 1.0
        if len(self.buffer) < self.buffer_size:
            self.buffer.append((state, action, reward, next_state, done))
        else:
            self.buffer[self.pos] = (state, action, reward, next_state, done)
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.buffer_size

    # importance-sampling weights w_i = (N * P(i))^(-beta), normalized to at most 1
    def _get_weights(self, prob, beta):
        weights = (len(self.buffer) * prob) ** (-beta)
        return weights / weights.max()

    # sample a batch with probability proportional to the stored priorities
    def sample(self, beta):
        prios = self.priorities[:len(self.buffer)].astype(np.float64)
        prob = prios / prios.sum()
        indices = np.random.choice(len(self.buffer), self.batch_size, p=prob)
        samples = [self.buffer[idx] for idx in indices]
        weights = self._get_weights(prob[indices], beta)
        states, actions, rewards, next_states, dones = zip(*samples)
        return states, actions, rewards, next_states, dones, indices, weights

    # refresh priorities with the latest absolute TD errors
    def update_priorities(self, indices, td_errors):
        for idx, td_error in zip(indices, td_errors):
            self.priorities[idx] = abs(float(td_error)) + 1e-6


# MADDPG: one decentralized actor and one centralized critic per agent
class MADDPG:
    def __init__(self, sess, state_dim, action_dim, action_bound,
                 learning_rate_actor, learning_rate_critic, tau, gamma,
                 memory_size, batch_size, num_agents, prioritized_replay=False):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.tau = tau
        self.gamma = gamma
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.num_agents = num_agents
        self.prioritized_replay = prioritized_replay

        # one actor per agent
        self.actors = [ActorNetwork(sess, state_dim, action_dim, action_bound,
                                    learning_rate_actor, tau)
                       for _ in range(num_agents)]
        # one critic per agent; each critic sees the joint action of all agents
        self.critics = [CriticNetwork(sess, state_dim, action_dim * num_agents,
                                      learning_rate_critic, tau, gamma, num_agents)
                        for _ in range(num_agents)]

        # replay memory: prioritized buffer or a plain FIFO list
        if prioritized_replay:
            self.memory = ReplayBuffer(memory_size, batch_size)
        else:
            self.memory = []

        # initialize all network parameters once the whole graph has been built
        self.sess.run(tf.global_variables_initializer())

    # per-agent actions for a list of per-agent observation batches of shape [batch, state_dim]
    def predict(self, inputs):
        return [self.actors[i].predict(inputs[i]) for i in range(self.num_agents)]

    # sample a batch and update every actor and critic
    def update(self):
        if len(self.memory) < self.batch_size:
            return
        if self.prioritized_replay:
            states, actions, rewards, next_states, dones, indices, weights = self.memory.sample(beta=1.0)
            is_weights = np.reshape(weights, [-1, 1])
        else:
            samples = random.sample(self.memory, self.batch_size)
            states, actions, rewards, next_states, dones = zip(*samples)
            is_weights = np.ones((self.batch_size, 1), dtype=np.float32)

        # expected shapes after conversion:
        # states / next_states: [batch, num_agents, state_dim]
        # actions:              [batch, num_agents, action_dim]
        # rewards:              [batch, num_agents], dones: [batch]
        states = np.asarray(states)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        next_states = np.asarray(next_states)
        dones = np.asarray(dones, dtype=np.float32).reshape(-1, 1)

        # joint actions from the buffer and from the target policies at the next states
        joint_actions = np.concatenate([actions[:, i] for i in range(self.num_agents)], axis=1)
        joint_target_next_actions = np.concatenate(
            [self.actors[i].predict_target(next_states[:, i]) for i in range(self.num_agents)], axis=1)

        # critic update: one-step TD target per agent
        new_priorities = np.zeros((self.batch_size, 1), dtype=np.float32)
        for i in range(self.num_agents):
            next_q = self.critics[i].predict_target(next_states[:, i], joint_target_next_actions)
            td_target = rewards[:, i:i + 1] + self.gamma * next_q * (1. - dones)
            q_value = self.critics[i].predict(states[:, i], joint_actions)
            new_priorities += np.abs(td_target - q_value)
            self.critics[i].train(states[:, i], joint_actions, td_target, is_weights)
        if self.prioritized_replay:
            # priority of a transition = mean absolute TD error over all agents
            self.memory.update_priorities(indices, (new_priorities / self.num_agents).flatten())

        # actor update: gradient of each critic w.r.t. its own agent's action block
        joint_current_actions = np.concatenate(
            [self.actors[i].predict(states[:, i]) for i in range(self.num_agents)], axis=1)
        for i in range(self.num_agents):
            grads = self.critics[i].critic_gradient(states[:, i], joint_current_actions)
            grads_i = grads[:, i * self.action_dim:(i + 1) * self.action_dim]
            self.actors[i].actor_gradient(states[:, i], grads_i, self.batch_size)

        # soft-update all target networks
        for i in range(self.num_agents):
            self.actors[i].update_target_network()
            self.critics[i].update_target_network()

    # store one transition (per-agent states/actions/rewards, shared done flag)
    def add_experience(self, state, action, reward, next_state, done):
        if self.prioritized_replay:
            self.memory.add(state, action, reward, next_state, done)
        else:
            if len(self.memory) >= self.memory_size:
                self.memory.pop(0)
            self.memory.append((state, action, reward, next_state, done))

    # save every agent's actor and critic
    def save_network(self, save_path):
        for i in range(self.num_agents):
            self.actors[i].save_network(save_path + '_actor' + str(i))
            self.critics[i].save_network(save_path + '_critic' + str(i))

    # load every agent's actor and critic
    def load_network(self, load_path):
        for i in range(self.num_agents):
            self.actors[i].load_network(load_path + '_actor' + str(i))
            self.critics[i].load_network(load_path + '_critic' + str(i))
```

Here, `ActorNetwork` and `CriticNetwork` implement the per-agent actor and critic networks, `ReplayBuffer` implements the prioritized experience replay buffer, and `MADDPG` ties them together. The code is written against the TensorFlow 1.x graph API (under TensorFlow 2 it would need `tf.compat.v1` with eager execution disabled) and can be adapted to your own environment and requirements.
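As a minimal usage sketch, the training loop below shows how the `MADDPG` class above could be driven. It assumes a hypothetical multi-agent environment created by a placeholder `make_multiagent_env()` whose `reset()` returns a list of per-agent observations and whose `step(actions)` returns `(next_obs, rewards, done, info)` with per-agent rewards and a shared done flag; the dimensions, hyperparameters, and Gaussian exploration noise are also illustrative choices, not part of the code above.

```python
import numpy as np
import tensorflow as tf

# Hypothetical environment factory; replace with your own multi-agent environment.
env = make_multiagent_env()
num_agents, state_dim, action_dim, action_bound = 2, 8, 2, 1.0

with tf.Session() as sess:
    agent = MADDPG(sess, state_dim, action_dim, action_bound,
                   learning_rate_actor=1e-4, learning_rate_critic=1e-3,
                   tau=0.01, gamma=0.95, memory_size=100000, batch_size=64,
                   num_agents=num_agents, prioritized_replay=True)

    for episode in range(1000):
        obs = env.reset()
        for step in range(200):
            # one [1, state_dim] observation per agent -> one [1, action_dim] action per agent
            actions = agent.predict([np.reshape(o, (1, state_dim)) for o in obs])
            # simple Gaussian exploration noise, clipped to the action range
            actions = [np.clip(a[0] + 0.1 * np.random.randn(action_dim),
                               -action_bound, action_bound) for a in actions]
            next_obs, rewards, done, _ = env.step(actions)
            agent.add_experience(obs, actions, rewards, next_obs, done)
            agent.update()
            obs = next_obs
            if done:
                break
```

Calling `agent.update()` after every step is deliberately simple; updating every few steps and annealing the exploration noise are common refinements.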
