Random walk problem: there are five states A, B, C, D, E. Starting from the center state C, the process moves one state to the left or to the right with equal probability at every time step and terminates at either end. A reward of +1 is received when the episode terminates at the far right; all other rewards are 0. Write MRP, DP, MC, SARSA, and Q-Learning code to compute the value of every state of this random walk. Assume a discount factor of 1 and a transition probability of 0.5 from each intermediate state.
Here is a Python implementation:
```python
import random
import numpy as np
# Define the random-walk MDP: states 1..5 are A..E, states 0 and 6 are the
# left and right terminal states, and the walk starts in the center state C (index 3).
class RandomWalkMDP:
    def __init__(self):
        self.num_states = 5
        self.start_state = 3
        self.left_state = 0
        self.right_state = 6
    def step(self, state, action):
        # Move one state left or right; the +1 reward is received on the
        # transition into the right terminal state, all other rewards are 0.
        next_state = state - 1 if action == "left" else state + 1
        reward = 1 if next_state == self.right_state else 0
        return next_state, reward
    def get_states(self):
        return [i for i in range(self.num_states + 2)]

    def get_actions(self, state):
        if state == self.left_state or state == self.right_state:
            return []
        else:
            return ["left", "right"]
# DP: iterative policy evaluation of the uniform random policy,
# V(s) = sum_a pi(a|s) * (r + gamma * V(s')), with pi(a|s) = 0.5 for both actions.
def dp(mdp, gamma=1.0, theta=1e-5):
    V = [0.0] * (mdp.num_states + 2)
    while True:
        delta = 0.0
        for state in mdp.get_states():
            if state == mdp.left_state or state == mdp.right_state:
                continue
            v = V[state]
            V[state] = sum(
                0.5 * (r + gamma * V[s])
                for s, r in (mdp.step(state, a) for a in mdp.get_actions(state))
            )
            delta = max(delta, abs(v - V[state]))
        if delta < theta:
            break
    return V
# MC: every-visit Monte Carlo evaluation of the uniform random policy.
def mc(mdp, num_episodes, gamma=1.0):
    N = [0] * (mdp.num_states + 2)    # visit counts
    G = [0.0] * (mdp.num_states + 2)  # accumulated returns
    V = [0.0] * (mdp.num_states + 2)
    for _ in range(num_episodes):
        # Generate one episode under the random policy.
        episode = []
        state = mdp.start_state
        while True:
            action = random.choice(mdp.get_actions(state))
            next_state, reward = mdp.step(state, action)
            episode.append((state, action, reward))
            state = next_state
            if state == mdp.left_state or state == mdp.right_state:
                break
        # Walk backwards through the episode, accumulating the return.
        G_return = 0.0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G_return = gamma * G_return + reward
            N[state] += 1
            G[state] += G_return
    for state in mdp.get_states():
        if N[state] != 0:
            V[state] = G[state] / N[state]
    return V
ACTIONS = ["left", "right"]  # map the action index used in Q to the MDP's action names

# SARSA: on-policy TD control with an epsilon-greedy behaviour policy.
def sarsa(mdp, num_episodes, alpha=0.5, epsilon=0.1, gamma=1.0):
    Q = np.zeros((mdp.num_states + 2, 2))
    for _ in range(num_episodes):
        state = mdp.start_state
        action = epsilon_greedy(Q, state, mdp, epsilon)
        while True:
            next_state, reward = mdp.step(state, ACTIONS[action])
            next_action = epsilon_greedy(Q, next_state, mdp, epsilon)
            Q[state][action] += alpha * (reward + gamma * Q[next_state][next_action] - Q[state][action])
            state, action = next_state, next_action
            if state == mdp.left_state or state == mdp.right_state:
                break
    V = [max(Q[state]) for state in mdp.get_states()]
    return V

# Q-Learning: off-policy TD control.
def q_learning(mdp, num_episodes, alpha=0.5, epsilon=0.1, gamma=1.0):
    Q = np.zeros((mdp.num_states + 2, 2))
    for _ in range(num_episodes):
        state = mdp.start_state
        while True:
            action = epsilon_greedy(Q, state, mdp, epsilon)
            next_state, reward = mdp.step(state, ACTIONS[action])
            Q[state][action] += alpha * (reward + gamma * max(Q[next_state]) - Q[state][action])
            state = next_state
            if state == mdp.left_state or state == mdp.right_state:
                break
    V = [max(Q[state]) for state in mdp.get_states()]
    return V

# epsilon-greedy selection of an action index (0 = left, 1 = right).
def epsilon_greedy(Q, state, mdp, epsilon):
    if not mdp.get_actions(state):
        return 0  # terminal state: its Q row stays zero, so the bootstrap term is 0
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, len(ACTIONS) - 1)
    return int(np.argmax(Q[state]))
# Run all four algorithms.
mdp = RandomWalkMDP()
print("State values from DP:", dp(mdp))
print("State values from MC:", mc(mdp, 100000))
print("State values from SARSA:", sarsa(mdp, 100000))
print("State values from Q-Learning:", q_learning(mdp, 100000))
```
Expected results: DP converges to the exact values of the uniform random policy, approximately `[0, 0.167, 0.333, 0.5, 0.667, 0.833, 0]` over the states `[left terminal, A, B, C, D, E, right terminal]`, i.e. V(A)..V(E) = 1/6..5/6, and MC approaches the same values as the number of episodes grows. Note that SARSA and Q-Learning as written are control methods with an epsilon-greedy policy and report `max_a Q(s, a)`, so they estimate the value of a (near-)optimal policy rather than of the random policy: with gamma = 1 these values approach 1 for every non-terminal state (somewhat lower for SARSA, whose epsilon-greedy target policy can still wander into the left terminal). The sampling-based estimates vary from run to run.
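The question also asks for an MRP (Markov reward process) solution, which the code above does not cover. As a minimal sketch (assuming the five non-terminal states A..E are indexed 0..4 here, with transitions into the terminals folded into the expected-reward vector), the value function of the 0.5/0.5 random walk can be obtained in closed form from the Bellman equation V = R + gamma * P * V, i.e. V = (I - gamma * P)^(-1) * R:

```python
import numpy as np

# MRP view of the random walk: only the 5 non-terminal states A..E are kept.
# Transitions into a terminal state drop out of P (V(terminal) = 0), and R holds
# the expected immediate reward: from E the right terminal (+1) is reached with
# probability 0.5, so R[E] = 0.5.
gamma = 1.0
n = 5
P = np.zeros((n, n))
R = np.zeros(n)
for i in range(n):
    if i - 1 >= 0:
        P[i, i - 1] = 0.5   # step left to a non-terminal neighbour
    if i + 1 < n:
        P[i, i + 1] = 0.5   # step right to a non-terminal neighbour
R[n - 1] = 0.5              # expected reward from state E

V = np.linalg.solve(np.eye(n) - gamma * P, R)
print("MRP closed-form values for A..E:", V)   # ~[0.167, 0.333, 0.5, 0.667, 0.833]
```

Because the walk is absorbed with probability 1, I - gamma * P is invertible even with gamma = 1, and the solution matches the values that the DP and MC estimates converge to above.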