maddpg pytorch
MADDPG (Multi-Agent Deep Deterministic Policy Gradient) is a reinforcement learning algorithm for cooperative multi-agent settings: each agent has its own actor, while training uses a centralized critic that sees the joint observations and actions of all agents. To implement MADDPG in PyTorch, you can follow these steps:
1. Install PyTorch and OpenAI Gym:
```
pip install torch gym
```
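Optionally, a quick import check confirms that both packages are available (a minimal sketch; the printed version numbers will vary by machine):
```python
import torch
import gym

print(torch.__version__)
print(gym.__version__)
```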
2. Create the Actor and Critic networks for the MADDPG model. These networks can be defined as subclasses of PyTorch's `nn.Module`.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # torch.tanh keeps actions in [-1, 1]; F.tanh is deprecated
        x = torch.tanh(self.fc3(x))
        return x


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, states, actions):
        # Concatenate states and actions along the feature dimension
        x = torch.cat([states, actions], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
```
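As a quick sanity check, the networks can be run on random tensors; the dimensions below (`state_dim=8`, `action_dim=2`, `num_agents=2`) are assumed purely for illustration:
```python
import torch

state_dim, action_dim, num_agents = 8, 2, 2   # assumed dimensions for illustration
actor = Actor(state_dim, action_dim)
critic = Critic(state_dim * num_agents, action_dim * num_agents)

states = torch.randn(4, state_dim)                        # batch of 4 single-agent observations
joint_states = torch.randn(4, state_dim * num_agents)     # concatenated observations of all agents
joint_actions = torch.randn(4, action_dim * num_agents)   # concatenated actions of all agents

print(actor(states).shape)                        # torch.Size([4, 2]), values in [-1, 1]
print(critic(joint_states, joint_actions).shape)  # torch.Size([4, 1])
```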
3. Create a `ReplayBuffer` class that stores the agents' experience and randomly samples from it for training.
```python
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        experiences = random.sample(self.buffer, batch_size)
        # Stack with numpy first to avoid the slow list-of-arrays -> tensor conversion
        states = torch.tensor(np.array([e.state for e in experiences]), dtype=torch.float32)
        actions = torch.tensor(np.array([e.action for e in experiences]), dtype=torch.float32)
        rewards = torch.tensor(np.array([e.reward for e in experiences]), dtype=torch.float32)
        next_states = torch.tensor(np.array([e.next_state for e in experiences]), dtype=torch.float32)
        dones = torch.tensor(np.array([e.done for e in experiences]), dtype=torch.float32)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)
```
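A minimal usage sketch of the buffer with random data; all shapes below are assumed for illustration and follow the `(num_agents, dim)` layout used later by the agent:
```python
import numpy as np

buffer = ReplayBuffer(capacity=1000)
for _ in range(10):
    buffer.push(Experience(
        state=np.random.randn(2, 8),        # (num_agents, state_dim)
        action=np.random.randn(2, 2),       # (num_agents, action_dim)
        reward=np.random.randn(2),          # per-agent rewards
        next_state=np.random.randn(2, 8),
        done=np.zeros(2)))

states, actions, rewards, next_states, dones = buffer.sample(batch_size=4)
print(states.shape)   # torch.Size([4, 2, 8])
print(rewards.shape)  # torch.Size([4, 2])
```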
4. Create a MADDPG agent class and implement its training and action-selection methods.
```python
import numpy as np
import torch
import torch.nn.functional as F


class MADDPG:
    def __init__(self, state_dim, action_dim, num_agents, buffer_capacity=100000,
                 batch_size=128, gamma=0.99, tau=0.01, lr=0.001):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.num_agents = num_agents
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        # One actor (and actor target) per agent; a single centralized critic over the joint state-action
        self.actor_local = [Actor(state_dim, action_dim) for _ in range(num_agents)]
        self.actor_target = [Actor(state_dim, action_dim) for _ in range(num_agents)]
        self.critic_local = Critic(state_dim * num_agents, action_dim * num_agents)
        self.critic_target = Critic(state_dim * num_agents, action_dim * num_agents)
        # Initialize the target networks with the local networks' weights
        for local, target in zip(self.actor_local, self.actor_target):
            target.load_state_dict(local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.buffer = ReplayBuffer(buffer_capacity)
        self.actor_optimizers = [torch.optim.Adam(actor.parameters(), lr=lr) for actor in self.actor_local]
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), lr=lr)

    def act(self, states, noise=0.0):
        actions = []
        for i, state in enumerate(states):
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            action = self.actor_local[i](state).squeeze(0).detach().numpy()
            action += noise * np.random.randn(self.action_dim)
            actions.append(np.clip(action, -1, 1))
        return actions

    def step(self, state, action, reward, next_state, done):
        experience = Experience(state, action, reward, next_state, done)
        self.buffer.push(experience)
        if len(self.buffer) > self.batch_size:
            self.learn()

    def learn(self):
        states, actions, rewards, next_states, dones = self.buffer.sample(self.batch_size)
        # This simplified single-critic variant uses the mean reward across agents
        # and treats a step as terminal if any agent is done
        rewards = rewards.view(self.batch_size, -1).mean(dim=1, keepdim=True)
        dones = dones.view(self.batch_size, -1).max(dim=1, keepdim=True).values
        # Critic update
        next_actions = torch.cat(
            [self.actor_target[i](next_states[:, i, :]) for i in range(self.num_agents)], dim=1)
        Q_targets_next = self.critic_target(
            next_states.view(-1, self.state_dim * self.num_agents), next_actions)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states.view(-1, self.state_dim * self.num_agents),
                                       actions.view(-1, self.action_dim * self.num_agents))
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # Actor update: agent i's action comes from its current policy, the other
        # agents' actions are detached so that only actor i receives gradients
        for i in range(self.num_agents):
            joint_actions = torch.cat(
                [self.actor_local[j](states[:, j, :]) if j == i
                 else self.actor_local[j](states[:, j, :]).detach()
                 for j in range(self.num_agents)], dim=1)
            actor_loss = -self.critic_local(
                states.view(-1, self.state_dim * self.num_agents), joint_actions).mean()
            self.actor_optimizers[i].zero_grad()
            actor_loss.backward()
            self.actor_optimizers[i].step()
        # Soft-update the target networks towards the local networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        for i in range(self.num_agents):
            self.soft_update(self.actor_local[i], self.actor_target[i], self.tau)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
```
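Before wiring the agent into an environment, its interface can be exercised with random observations; the dimensions here are assumed for illustration only:
```python
import numpy as np

agent = MADDPG(state_dim=8, action_dim=2, num_agents=2)
obs = np.random.randn(2, 8)          # one observation per agent
actions = agent.act(obs, noise=0.1)  # list of 2 arrays, each of shape (2,), clipped to [-1, 1]
print([a.shape for a in actions])    # [(2,), (2,)]
```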
5. Train and evaluate the MADDPG agent in an OpenAI Gym environment. For example, to train 2 agents in the `MultiAgentPendulum-v0` environment, you could use the following code:
```python
import gym
import numpy as np

env = gym.make('MultiAgentPendulum-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
num_agents = 2
agent = MADDPG(state_dim, action_dim, num_agents)

max_episodes = 1000
max_steps = 1000
for episode in range(max_episodes):
    state = env.reset()
    for step in range(max_steps):
        action = agent.act(state, noise=0.1)  # add exploration noise during training
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        if np.any(done):
            break
    if episode % 100 == 0:
        print('Episode {}/{}'.format(episode, max_episodes))

# Test: evaluate the trained agents without exploration noise
state = env.reset()
for step in range(max_steps):
    action = agent.act(state, noise=0.0)
    next_state, _, done, _ = env.step(action)
    state = next_state
    env.render()
    if np.any(done):
        break
env.close()
```