Write a DQN reinforcement learning program in Python
Sure, here is a basic DQN (Deep Q-Network) reinforcement learning program implemented with Python and PyTorch.
First, import the required libraries and modules:
```python
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym  # this example assumes the classic Gym API (gym < 0.26)
```
Next, define a neural network class that represents the Q-value function:
```python
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        # Two hidden layers of 64 units, one output per discrete action
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)  # raw Q-values, no activation on the output layer
        return x
```
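As a quick sanity check (not part of the original example), the network can be instantiated with CartPole-v1's dimensions and run on a dummy state; the 4-dimensional observation and 2 discrete actions are assumptions taken from that environment:
```python
# Hypothetical sanity check: CartPole-v1 has a 4-dim observation and 2 actions
net = QNetwork(input_dim=4, output_dim=2)
dummy_state = torch.zeros(1, 4)   # batch of one observation
print(net(dummy_state).shape)     # expected: torch.Size([1, 2]), one Q-value per action
```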
Then, define a DQNAgent class that implements the DQN algorithm:
```python
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        # Online network and target network start with identical weights
        self.q_net = QNetwork(self.obs_dim, self.action_dim)
        self.target_q_net = QNetwork(self.obs_dim, self.action_dim)
        self.target_q_net.load_state_dict(self.q_net.state_dict())
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=0.001)
        # Hyperparameters
        self.gamma = 0.99            # discount factor
        self.epsilon = 1.0           # initial exploration rate
        self.epsilon_decay = 0.995   # per-episode epsilon decay
        self.min_epsilon = 0.01
        self.batch_size = 32
        self.target_update_freq = 100  # sync target net every N environment steps
        self.step_count = 0
        # Simple FIFO replay buffer
        self.replay_buffer = []
        self.replay_buffer_max_size = 10000

    def act(self, state):
        # Epsilon-greedy action selection
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        state = torch.tensor(state, dtype=torch.float32)
        with torch.no_grad():
            q_values = self.q_net(state)
        return q_values.argmax().item()

    def update_replay_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))
        # Drop the oldest transition once the buffer is full
        if len(self.replay_buffer) > self.replay_buffer_max_size:
            self.replay_buffer.pop(0)

    def sample_from_replay_buffer(self):
        samples = random.sample(self.replay_buffer, self.batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*samples))
        return state, action, reward, next_state, done

    def update_q_net(self):
        state, action, reward, next_state, done = self.sample_from_replay_buffer()
        state = torch.tensor(state, dtype=torch.float32)
        action = torch.tensor(action, dtype=torch.long).unsqueeze(1)
        reward = torch.tensor(reward, dtype=torch.float32).unsqueeze(1)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        done = torch.tensor(done, dtype=torch.float32).unsqueeze(1)
        # Q(s, a) for the actions actually taken
        q_values = self.q_net(state).gather(1, action)
        # Bootstrapped target from the (frozen) target network
        next_q_values = self.target_q_net(next_state).max(1)[0].unsqueeze(1)
        target_q_values = reward + (1 - done) * self.gamma * next_q_values
        # Huber loss between predicted and target Q-values
        loss = nn.functional.smooth_l1_loss(q_values, target_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_q_net(self):
        self.target_q_net.load_state_dict(self.q_net.state_dict())

    def train(self, num_episodes):
        for i in range(num_episodes):
            state = self.env.reset()  # classic Gym API: reset() returns the observation
            episode_reward = 0
            done = False
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)  # classic 4-value step()
                episode_reward += reward
                self.update_replay_buffer(state, action, reward, next_state, done)
                state = next_state
                self.step_count += 1
                # Learn once the buffer holds at least one batch
                if len(self.replay_buffer) >= self.batch_size:
                    self.update_q_net()
                # Periodically sync the target network with the online network
                if self.step_count % self.target_update_freq == 0:
                    self.update_target_q_net()
            # Decay exploration after each episode
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.min_epsilon)
            print(f"Episode {i + 1}: Reward = {episode_reward}")
```
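The core of `update_q_net` is the one-step TD target: for each sampled transition (s, a, r, s′, done), the online network's Q(s, a) is regressed toward

y = r + γ · (1 − done) · max_a′ Q_target(s′, a′)

using the Smooth L1 (Huber) loss. The target network is held fixed between periodic syncs so that the regression target does not shift on every gradient step.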
Finally, we can use the DQNAgent class defined above to train the model:
```python
env = gym.make("CartPole-v1")
agent = DQNAgent(env)
agent.train(num_episodes=1000)
```
The code above creates a CartPole-v1 environment and trains the agent with DQN for 1000 episodes. Note that `env.reset()` and `env.step()` follow the classic Gym API (gym < 0.26); with Gymnasium or newer gym releases, `reset()` also returns an info dict and `step()` returns separate terminated/truncated flags, so the training loop would need small adjustments.
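After training, a common next step (not shown in the original) is to run the learned policy greedily and save the weights. The snippet below is a minimal sketch using only the classes defined above and standard PyTorch calls; the output filename is a placeholder:
```python
# Hypothetical evaluation: act greedily by disabling exploration
agent.epsilon = 0.0
state = env.reset()
total_reward, done = 0, False
while not done:
    action = agent.act(state)
    state, reward, done, _ = env.step(action)  # classic Gym API
    total_reward += reward
print(f"Evaluation reward: {total_reward}")

# Save the learned Q-network weights for later reuse
torch.save(agent.q_net.state_dict(), "dqn_cartpole.pt")
```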