Multi-Agent DQN Code in PyTorch
Below is PyTorch code for a multi-agent DQN:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import namedtuple, deque

# Hyperparameters
BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 64         # mini-batch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # soft-update coefficient for the target network
LR = 5e-4               # learning rate
UPDATE_EVERY = 4        # number of steps between learning updates

# Use the GPU if one is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Multi-agent DQN agent
class MADQNAgent:
    def __init__(self, state_size, action_size, num_agents, seed):
        """Initialise the multi-agent DQN agent.

        Args:
            state_size: dimension of each agent's state
            action_size: number of discrete actions available to each agent
            num_agents: number of agents
            seed: seed for the random number generators
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        # Local (online) Q-network and target Q-network, shared by all agents
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # Shared replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Step counter used to trigger learning every UPDATE_EVERY steps
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Store the experience in the replay buffer
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY steps, once enough samples are available
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if len(self.memory) > BATCH_SIZE and self.t_step == 0:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Return an epsilon-greedy action index for every agent.

        `state` has shape (num_agents, state_size); the result has shape (num_agents,).
        """
        state = torch.from_numpy(state).float().to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        action_values = action_values.cpu().data.numpy()
        # One discrete action index per agent (greedy, with epsilon-random exploration)
        actions = np.zeros(self.num_agents, dtype=np.int64)
        for i in range(self.num_agents):
            actions[i] = np.argmax(action_values[i])
            if random.random() < eps:
                actions[i] = random.choice(np.arange(self.action_size))
        return actions

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        # TD target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal states
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Current Q estimates for the actions that were actually taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Minimise the TD error and update the local Q-network
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Soft-update the target network towards the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """theta_target <- tau * theta_local + (1 - tau) * theta_target"""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# Q-network
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed):
        """Initialise the Q-network.

        Args:
            state_size: dimension of the state
            action_size: number of discrete actions
            seed: seed for the random number generator
        """
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        """Forward pass: return the Q-value of every action for the given state."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Replay buffer
class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialise a fixed-size buffer for storing experience tuples.

        Args:
            action_size: number of discrete actions
            buffer_size: maximum number of stored experiences
            batch_size: size of each training mini-batch
            seed: seed for the random number generator
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add an experience to the buffer."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a mini-batch of experiences and convert it to tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Current number of stored experiences."""
        return len(self.memory)
```
Note: this multi-agent DQN differs slightly from the single-agent version. All agents share one Q-network and one replay buffer: act evaluates every agent's state in a single forward pass and returns one epsilon-greedy action per agent, while each agent's transition is stored individually and mixed into the shared mini-batches used for learning.
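As a rough usage sketch, the agent can be driven much like a single-agent DQN agent, except that act takes one state per agent and step is called once per agent. The DummyMultiAgentEnv below is a made-up placeholder that emits random transitions purely so the loop runs end to end; it is not part of the code above, and in practice it would be replaced by a real multi-agent environment.
```python
# Training-loop sketch. DummyMultiAgentEnv is a hypothetical stand-in
# environment (random transitions) used only to make the example runnable.
import numpy as np

class DummyMultiAgentEnv:
    def __init__(self, num_agents, state_size, episode_length=50):
        self.num_agents, self.state_size, self.episode_length = num_agents, state_size, episode_length
        self.t = 0

    def reset(self):
        self.t = 0
        return np.random.randn(self.num_agents, self.state_size).astype(np.float32)

    def step(self, actions):
        self.t += 1
        next_states = np.random.randn(self.num_agents, self.state_size).astype(np.float32)
        rewards = np.random.randn(self.num_agents)
        dones = np.full(self.num_agents, self.t >= self.episode_length)
        return next_states, rewards, dones

num_agents, state_size, action_size = 2, 24, 4
env = DummyMultiAgentEnv(num_agents, state_size)
agent = MADQNAgent(state_size, action_size, num_agents, seed=0)

eps = 1.0  # start fully exploratory and decay towards greedy behaviour
for episode in range(200):
    states = env.reset()
    done = False
    while not done:
        actions = agent.act(states, eps)          # one action index per agent
        next_states, rewards, dones = env.step(actions)
        # Store each agent's transition; learning triggers every UPDATE_EVERY additions
        for i in range(num_agents):
            agent.step(states[i], actions[i], rewards[i], next_states[i], dones[i])
        states = next_states
        done = bool(np.any(dones))
    eps = max(0.01, eps * 0.995)
```
Because every agent feeds the same network and buffer, this setup implicitly assumes homogeneous agents; heterogeneous agents would need a separate network (or a separate MADQNAgent) per agent.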