时间: 2023-09-15 14:15:20 浏览: 192
python q-learning
4星 · 用户满意度95%
import numpy as np
# Tabular Q-learning on a 3x3 grid maze.
# Cells are numbered 0..8 row by row. Every episode starts in cell 0
# (top-left) and ends as soon as the agent reaches cell 8 (bottom-right).

# State space of the maze: one state per grid cell.
states = [0, 1, 2, 3, 4, 5, 6, 7, 8]
# Action space: 0 = up, 1 = down, 2 = left, 3 = right.
actions = [0, 1, 2, 3]
# Reward for ending a step in each cell, indexed as [row, column].
rewards = np.array([
    [-1, -1, -1],
    [-1, 0, -1],
    [-1, -1, 1],
])
# Q-table: one row per state, one column per action, initialised to zero.
q = np.zeros((len(states), len(actions)))
# Learning rate and discount factor.
alpha = 0.5
gamma = 0.9

# (row, column) displacement for each action, in the same order as `actions`.
_MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]


def step(state, action):
    """Apply *action* in *state* and return ``(next_state, reward)``.

    A move that would leave the grid keeps the agent in its current cell.
    Working in (row, column) coordinates instead of adding a flat offset
    prevents left/right moves from wrapping onto a neighbouring row and
    prevents up/down moves from producing an out-of-range state.
    The reward is the `rewards` entry of the cell the agent ends up in.
    """
    n_rows, n_cols = rewards.shape
    row, col = divmod(state, n_cols)
    d_row, d_col = _MOVES[action]
    new_row, new_col = row + d_row, col + d_col
    if 0 <= new_row < n_rows and 0 <= new_col < n_cols:
        row, col = new_row, new_col
    return row * n_cols + col, rewards[row, col]


# Train for 1000 episodes.
for _ in range(1000):
    # Every episode starts in the top-left cell.
    state = 0
    # Keep acting until the goal state is reached.
    while state != 8:
        # Epsilon-greedy selection: explore with probability 0.1,
        # otherwise exploit the current Q estimates.
        if np.random.uniform() < 0.1:
            action = np.random.choice(actions)
        else:
            action = np.argmax(q[state])
        # Take the action and observe the next state and the reward.
        next_state, reward = step(state, action)
        # Q-learning update towards reward + discounted best next value.
        q[state, action] += alpha * (
            reward + gamma * np.max(q[next_state]) - q[state, action]
        )
        # Move on to the next state.
        state = next_state

# Replay the learned greedy policy from the start state.
state = 0
# A deterministic policy either reaches the goal without revisiting a cell
# or loops forever, so len(states) moves are enough to decide which.
steps_left = len(states)
while state != 8 and steps_left > 0:
    action = np.argmax(q[state])
    next_state, _ = step(state, action)
    state = next_state
    steps_left -= 1
if state == 8:
    print("Reached the goal!")
else:
    print("Greedy policy did not reach the goal.")
import numpy as np
# Tabular Q-learning on a 3x3 grid maze.
# Cells are numbered 0..8 row by row. Every episode starts in cell 0
# (top-left) and ends as soon as the agent reaches cell 8 (bottom-right).

# State space of the maze: one state per grid cell.
states = [0, 1, 2, 3, 4, 5, 6, 7, 8]
# Action space: 0 = up, 1 = down, 2 = left, 3 = right.
actions = [0, 1, 2, 3]
# Reward for ending a step in each cell, indexed as [row, column].
rewards = np.array([
    [-1, -1, -1],
    [-1, 0, -1],
    [-1, -1, 1],
])
# Q-table: one row per state, one column per action, initialised to zero.
q = np.zeros((len(states), len(actions)))
# Learning rate and discount factor.
alpha = 0.5
gamma = 0.9

# (row, column) displacement for each action, in the same order as `actions`.
_MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]


def step(state, action):
    """Apply *action* in *state* and return ``(next_state, reward)``.

    A move that would leave the grid keeps the agent in its current cell.
    Working in (row, column) coordinates instead of adding a flat offset
    prevents left/right moves from wrapping onto a neighbouring row and
    prevents up/down moves from producing an out-of-range state.
    The reward is the `rewards` entry of the cell the agent ends up in.
    """
    n_rows, n_cols = rewards.shape
    row, col = divmod(state, n_cols)
    d_row, d_col = _MOVES[action]
    new_row, new_col = row + d_row, col + d_col
    if 0 <= new_row < n_rows and 0 <= new_col < n_cols:
        row, col = new_row, new_col
    return row * n_cols + col, rewards[row, col]


# Train for 1000 episodes.
for _ in range(1000):
    # Every episode starts in the top-left cell.
    state = 0
    # Keep acting until the goal state is reached.
    while state != 8:
        # Epsilon-greedy selection: explore with probability 0.1,
        # otherwise exploit the current Q estimates.
        if np.random.uniform() < 0.1:
            action = np.random.choice(actions)
        else:
            action = np.argmax(q[state])
        # Take the action and observe the next state and the reward.
        next_state, reward = step(state, action)
        # Q-learning update towards reward + discounted best next value.
        q[state, action] += alpha * (
            reward + gamma * np.max(q[next_state]) - q[state, action]
        )
        # Move on to the next state.
        state = next_state

# Replay the learned greedy policy from the start state.
state = 0
# A deterministic policy either reaches the goal without revisiting a cell
# or loops forever, so len(states) moves are enough to decide which.
steps_left = len(states)
while state != 8 and steps_left > 0:
    action = np.argmax(q[state])
    next_state, _ = step(state, action)
    state = next_state
    steps_left -= 1
if state == 8:
    print("Reached the goal!")
else:
    print("Greedy policy did not reach the goal.")