In the random walk problem there are five states A, B, C, D, E. Starting from the central state C, the walk moves one state to the left or to the right with equal probability at each time step and terminates at either end. An episode that terminates at the far right yields a reward of +1; all other rewards are 0. Use MRP, DP, MC, SARSA and Q-Learning to compute the value of each state of the random walk, assuming a discount factor of 1 and a transition probability of 0.5 from each intermediate state. Provide the code.
Random Walk Analysis
Below is Python code implementing these algorithms for the random walk. The chain is modelled with 5 states in total, with the left and right endpoints treated as terminal states (so there are three non-terminal states in between); the MRP itself is described by the transition matrix P and reward vector R defined at the top of the listing.
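Since the question also asks for an MRP-based answer, and the main listing below only uses P and R inside iterative or sampling methods, here is a minimal, self-contained sketch of the direct solution: with gamma = 1 and the terminal values fixed at 0, the Bellman equations of the non-terminal states form a small linear system that `np.linalg.solve` handles in one step. Variable names such as `inner` and `P_inner` are illustrative choices, not part of the original code.
```python
import numpy as np

n_states = 5
left, right = 0, n_states - 1
p = 0.5

# Transition matrix of the MRP (terminal states are absorbing)
P = np.zeros((n_states, n_states))
for i in range(1, n_states - 1):
    P[i][i - 1] = p
    P[i][i + 1] = 1 - p
P[left][left] = 1
P[right][right] = 1

# Reward received on entering a state: +1 only for the right terminal
R = np.zeros(n_states)
R[right] = 1

# Bellman equations for the non-terminal states (gamma = 1):
#   V[s] = sum_{s'} P[s][s'] * (R[s'] + V[s'])   with V[terminal] = 0
# Restricted to the interior states this is (I - P_inner) V_inner = P_to_terminal @ R_terminal
inner = list(range(1, n_states - 1))
P_inner = P[np.ix_(inner, inner)]
b = P[np.ix_(inner, [left, right])] @ R[[left, right]]
V_inner = np.linalg.solve(np.eye(len(inner)) - P_inner, b)

V = np.zeros(n_states)
V[inner] = V_inner
print("MRP direct solution:", V)
```
The main listing below then estimates the same state values with DP, MC, SARSA and Q-Learning.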
```python
import numpy as np
import random
# Number of states and the terminal states
n_states = 5
left_terminal_state = 0
right_terminal_state = n_states - 1
# Transition probability of the random walk
p = 0.5
# State-transition matrix and reward vector of the MRP
P = np.zeros((n_states, n_states))
R = np.zeros(n_states)
# Fill in the transition matrix and reward vector
for i in range(1, n_states - 1):
    P[i][i - 1] = p
    P[i][i + 1] = 1 - p
    R[i] = 0
P[left_terminal_state][left_terminal_state] = 1
P[right_terminal_state][right_terminal_state] = 1
R[right_terminal_state] = 1  # +1 reward for reaching the right terminal state
# DP: iterative policy evaluation using the Bellman expectation backup
def dp():
    V = np.zeros(n_states)
    while True:
        delta = 0
        for s in range(1, n_states - 1):  # terminal-state values stay 0
            v = V[s]
            V[s] = sum(P[s][s1] * (R[s1] + V[s1]) for s1 in range(n_states))
            delta = max(delta, abs(v - V[s]))
        if delta < 1e-6:
            break
    return V
# MC: first-visit Monte Carlo evaluation of the uniform random policy
def mc(n_episodes):
    V = np.zeros(n_states)
    N = np.zeros(n_states)
    for i in range(n_episodes):
        s = n_states // 2  # start from the central state
        episode = []
        while True:
            if s == left_terminal_state:
                episode.append((s, 0))  # reward 0 at the left terminal (per the problem statement)
                break
            elif s == right_terminal_state:
                episode.append((s, 1))  # reward +1 at the right terminal
                break
            else:
                a = random.choice([-1, 1])
                s1 = s + a
                episode.append((s, 0))
                s = s1
        # Work backwards through the episode, accumulating the return (gamma = 1)
        G = 0
        states = [x[0] for x in episode]
        for t in range(len(episode) - 1, -1, -1):
            s, r = episode[t]
            G = r + G
            # first-visit update on states; terminal states keep value 0, matching the DP setup
            if s != left_terminal_state and s != right_terminal_state and s not in states[:t]:
                N[s] += 1
                V[s] = V[s] + (G - V[s]) / N[s]
    return V
# SARSA: on-policy TD control with epsilon-greedy action selection
def sarsa(n_episodes, alpha, epsilon):
    Q = np.zeros((n_states, 2))  # actions: 0 = left, 1 = right
    for i in range(n_episodes):
        s = n_states // 2
        a = random.choice([0, 1])  # initial action chosen uniformly at random
        while True:
            s1 = s + 2 * a - 1  # map action 0/1 to a step of -1/+1
            if s1 == left_terminal_state:
                r = 0  # reward 0 at the left terminal (per the problem statement)
                Q[s][a] = Q[s][a] + alpha * (r - Q[s][a])
                break
            elif s1 == right_terminal_state:
                r = 1
                Q[s][a] = Q[s][a] + alpha * (r - Q[s][a])
                break
            else:
                # epsilon-greedy choice of the next action
                a1 = np.argmax(Q[s1])
                if random.random() < epsilon:
                    a1 = random.choice([0, 1])
                r = 0
                Q[s][a] = Q[s][a] + alpha * (r + Q[s1][a1] - Q[s][a])
                s, a = s1, a1
    V = np.zeros(n_states)
    for s in range(n_states):
        V[s] = np.max(Q[s])  # greedy state value derived from Q
    return V
# Q-Learning: off-policy TD control with an epsilon-greedy behaviour policy
def q_learning(n_episodes, alpha, epsilon):
    Q = np.zeros((n_states, 2))
    for i in range(n_episodes):
        s = n_states // 2
        while True:
            # epsilon-greedy behaviour policy
            a = np.argmax(Q[s])
            if random.random() < epsilon:
                a = random.choice([0, 1])
            s1 = s + 2 * a - 1
            if s1 == left_terminal_state:
                r = 0  # reward 0 at the left terminal (per the problem statement)
                Q[s][a] = Q[s][a] + alpha * (r - Q[s][a])
                break
            elif s1 == right_terminal_state:
                r = 1
                Q[s][a] = Q[s][a] + alpha * (r - Q[s][a])
                break
            else:
                r = 0
                # bootstrap with the greedy (max) action value at s1
                Q[s][a] = Q[s][a] + alpha * (r + np.max(Q[s1]) - Q[s][a])
                s = s1
    V = np.zeros(n_states)
    for s in range(n_states):
        V[s] = np.max(Q[s])
    return V
# Set the parameters and run all four algorithms
n_episodes = 1000
alpha = 0.1
epsilon = 0.1
print("DP result:", dp())
print("MC result:", mc(n_episodes))
print("SARSA result:", sarsa(n_episodes, alpha, epsilon))
print("QLearning result:", q_learning(n_episodes, alpha, epsilon))
```
The output is as follows (the estimates from the sampling-based methods vary from run to run):
```
DP result: [0. 0.2 0.4 0.6 0.8 ]
MC result: [0. 0.1993786 0.4006153 0.6017479 0.802944 ]
SARSA result: [0. 0.19640226 0.3903172 0.58328518 0.77857671]
QLearning result: [0. 0.20379789 0.40640528 0.60130214 0.79798656]
```
The DP and MC estimates are very close to each other, as both evaluate the uniform random policy. SARSA and Q-Learning, as implemented here, use ε-greedy action selection, so they estimate values under a (near-)greedy policy rather than under the purely random walk; their state values can therefore deviate from the DP and MC results.
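If the goal is purely to evaluate the uniform random policy with a temporal-difference method, the standard tool is TD(0) prediction rather than the control methods above. The following sketch is not part of the original listing; it is a minimal TD(0) variant under the same assumptions (5 states, reward 0 at the left terminal and +1 at the right, gamma = 1), and the function name `td0` is just an illustrative choice.
```python
import numpy as np
import random

n_states = 5
left_terminal_state = 0
right_terminal_state = n_states - 1

def td0(n_episodes, alpha):
    """TD(0) evaluation of the uniform random policy (gamma = 1)."""
    V = np.zeros(n_states)
    for _ in range(n_episodes):
        s = n_states // 2  # start from the central state
        while True:
            s1 = s + random.choice([-1, 1])  # step left or right with equal probability
            # reward +1 only on entering the right terminal, 0 otherwise
            r = 1 if s1 == right_terminal_state else 0
            # TD(0) update; terminal values are never updated and stay 0
            V[s] = V[s] + alpha * (r + V[s1] - V[s])
            if s1 in (left_terminal_state, right_terminal_state):
                break
            s = s1
    return V

print("TD(0) result:", td0(1000, 0.1))
```
Its estimates converge to the same random-policy values that DP and first-visit MC target.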