2.1 Based on the literature, implement the following multi-armed bandit (MAB) algorithms in Python: Softmax (or Epsilon-Greedy), Beta Thompson sampling, UCB, and LinUCB. 2.2 In Python, write evaluation routines that visualize (1) each algorithm's cumulative reward, (2) each algorithm's per-round reward, (3) how each algorithm's probability of selecting the optimal arm changes over time, and (4) the variance of each algorithm's arm selections. 2.3 Write a program that generates a dataset saved as data.txt. The dataset is a text file of numbers with 10000 rows and 110 columns: the first column contains random integers from 1-10, the second column random integers from 0-1, and columns 3-110 form a sparse matrix of values from 0-99 in which 80% of the entries are 0. Then test the above algorithms on this dataset: compute the average reward of the four algorithms and plot the cumulative reward per round for a single simulation, with the number of test rounds set to 1000.
2.1 Python implementations of the MAB algorithms:
1. Softmax
```python
import numpy as np

def softmax(q, tau):
    # Turn value estimates into selection probabilities; tau is the temperature
    # (small tau -> nearly greedy, large tau -> nearly uniform).
    # Subtracting the max before exponentiating keeps np.exp numerically stable.
    z = (q - np.max(q)) / tau
    prob = np.exp(z) / np.sum(np.exp(z))
    return prob

def softmax_select(q, tau):
    # Sample an arm index according to the softmax probabilities.
    prob = softmax(q, tau)
    action = np.random.choice(len(q), p=prob)
    return action
```
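A quick illustrative demo (not part of the assignment, with made-up values) of how the temperature controls exploration: with a small tau the best arm dominates, with a large tau the selections spread out.
```python
import numpy as np

# Hypothetical value estimates, used only for this demo.
q_demo = np.array([0.1, 0.5, 0.9])
for tau_demo in (0.05, 1.0):
    picks = [softmax_select(q_demo, tau_demo) for _ in range(1000)]
    freq = np.bincount(picks, minlength=len(q_demo)) / 1000
    print(f"tau={tau_demo}: selection frequencies {freq}")
```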
2. Epsilon-Greedy
```python
import numpy as np

def epsilon_greedy(q, epsilon):
    # Explore a random arm with probability epsilon,
    # otherwise exploit the arm with the highest estimated value.
    if np.random.random() < epsilon:
        action = np.random.choice(len(q))
    else:
        action = np.argmax(q)
    return action
```
3. Beta Thompson sampling
```python
import numpy as np

def beta_thompson(alpha, beta):
    # Draw one sample from each arm's Beta posterior and play the arm
    # with the largest sample.
    samples = np.random.beta(alpha, beta)
    action = np.argmax(samples)
    return action

def beta_thompson_update(alpha, beta, action, reward):
    # Beta-Bernoulli posterior update; reward is expected to be 0 or 1.
    alpha[action] += reward
    beta[action] += 1 - reward
    return alpha, beta
```
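A minimal, self-contained sketch (not required by the assignment) of how the posterior parameters evolve on a toy Bernoulli bandit; the success probabilities in `p_true` are made up for illustration.
```python
import numpy as np

# Hypothetical 3-armed Bernoulli bandit used only for illustration.
p_true = np.array([0.2, 0.5, 0.8])
a, b = np.ones(3), np.ones(3)            # uniform Beta(1, 1) priors
for _ in range(500):
    arm = beta_thompson(a, b)            # sample-and-argmax selection
    reward = np.random.binomial(1, p_true[arm])
    a, b = beta_thompson_update(a, b, arm, reward)
print("posterior means:", a / (a + b))   # should concentrate near p_true
```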
4. UCB
```python
import numpy as np

def ucb(q, n, c):
    # UCB1-style score: value estimate plus an exploration bonus that shrinks
    # as an arm is pulled more often. The log uses the total number of pulls;
    # the 1e-6 term avoids division by zero for arms that were never pulled.
    total = np.sum(n) + 1
    ucb_values = q + c * np.sqrt(np.log(total) / (n + 1e-6))
    action = np.argmax(ucb_values)
    return action

def ucb_update(q, n, action, reward):
    # Incremental mean update of the chosen arm's value estimate.
    q[action] = q[action] + (reward - q[action]) / (n[action] + 1)
    n[action] += 1
    return q, n
```
5. LinUCB
```python
import numpy as np

def linucb(x, A, b, alpha):
    # Disjoint LinUCB: each arm a keeps its own ridge-regression state (A[a], b[a]);
    # its score is the predicted reward for context x plus alpha times the confidence width.
    p = np.zeros(len(A))
    for a in range(len(A)):
        A_inv = np.linalg.inv(A[a])
        theta_a = A_inv @ b[a]
        p[a] = x @ theta_a + alpha * np.sqrt(x @ A_inv @ x)
    action = np.argmax(p)
    return action

def linucb_update(x, A, b, action, reward):
    # Update only the chosen arm's sufficient statistics.
    A[action] += np.outer(x, x)
    b[action] += reward * x
    return A, b
```
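A minimal usage sketch for the two functions above; the dimensions and the random context are illustrative only. Each arm starts with an identity matrix `A` (the ridge regularizer) and a zero vector `b`, which is also how the simulation below initializes them.
```python
import numpy as np

n_arms, dim = 5, 5                               # illustrative sizes
A = [np.eye(dim) for _ in range(n_arms)]         # one ridge matrix per arm
b = [np.zeros(dim) for _ in range(n_arms)]       # one reward vector per arm
x = np.random.rand(dim)                          # a single context vector
arm = linucb(x, A, b, alpha=1.0)                 # pick an arm for this context
A, b = linucb_update(x, A, b, arm, reward=1.0)   # update the chosen arm
```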
2.2 Python implementations of the evaluation methods:
1. Cumulative reward of each algorithm
```python
import numpy as np

def simulate(algo, data, epsilon=None, tau=None, c=None, alpha=None):
    # Run one pass of the chosen algorithm over the data.
    # Each column of `data` is an arm and data[i][action] is the reward at round i;
    # `alpha` is only used as the LinUCB exploration parameter.
    n_arms = data.shape[1]
    q = np.zeros(n_arms)                              # value estimates (Softmax / Epsilon-Greedy / UCB)
    n = np.zeros(n_arms)                              # pull counts
    a_ts = np.ones(n_arms)                            # Beta posterior parameters (Thompson)
    b_ts = np.ones(n_arms)
    A = [np.eye(n_arms) for _ in range(n_arms)]       # LinUCB per-arm state
    b = [np.zeros(n_arms) for _ in range(n_arms)]
    rewards = []
    total_reward = 0
    for i in range(data.shape[0]):
        # Arm selection.
        if algo == 'softmax':
            action = softmax_select(q, tau)
        elif algo == 'epsilon-greedy':
            action = epsilon_greedy(q, epsilon)
        elif algo == 'beta-thompson':
            action = beta_thompson(a_ts, b_ts)
        elif algo == 'ucb':
            action = ucb(q, n, c)
        elif algo == 'linucb':
            action = linucb(data[i], A, b, alpha)
        reward = data[i][action]
        rewards.append(reward)
        total_reward += reward
        # Model update.
        if algo in ('softmax', 'epsilon-greedy'):
            q[action] += (reward - q[action]) / (n[action] + 1)
            n[action] += 1
        elif algo == 'beta-thompson':
            # The Beta-Bernoulli model needs a 0/1 reward, so binarize it here.
            a_ts, b_ts = beta_thompson_update(a_ts, b_ts, action, float(reward > 0))
        elif algo == 'ucb':
            q, n = ucb_update(q, n, action, reward)
        elif algo == 'linucb':
            A, b = linucb_update(data[i], A, b, action, reward)
    return rewards, total_reward
```
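`simulate` only returns the reward sequence; a small plotting helper such as the following sketch (assuming matplotlib is available) covers the cumulative-reward visualization of requirement (1).
```python
import numpy as np
import matplotlib.pyplot as plt

def plot_cumulative_rewards(results):
    # `results` maps an algorithm label to the per-round reward list
    # returned by simulate(); np.cumsum turns it into cumulative reward.
    for label, rewards in results.items():
        plt.plot(np.cumsum(rewards), label=label)
    plt.xlabel('Round')
    plt.ylabel('Cumulative reward')
    plt.legend()
    plt.show()
```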
2. Per-round reward of each algorithm
```python
def simulate_round_rewards(algo, data, epsilon=None, tau=None, c=None, alpha=None, num_simulations=10):
    # Average the per-round reward over several independent simulations.
    round_rewards = np.zeros((num_simulations, data.shape[0]))
    for i in range(num_simulations):
        rewards, _ = simulate(algo, data, epsilon, tau, c, alpha)
        round_rewards[i] = rewards
    avg_round_rewards = np.mean(round_rewards, axis=0)
    return avg_round_rewards
```
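A short plotting sketch for requirement (2), assuming matplotlib and the function above:
```python
import matplotlib.pyplot as plt

def plot_round_rewards(avg_round_rewards, label):
    # Plot the per-round reward averaged over simulations.
    plt.plot(avg_round_rewards, label=label)
    plt.xlabel('Round')
    plt.ylabel('Average reward per round')
    plt.legend()
    plt.show()
```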
3. How each algorithm's probability of selecting the optimal arm changes over time
```python
def simulate_optimal_selection(algo, data, epsilon=None, tau=None, c=None, alpha=None, num_simulations=10):
    # Track how often the algorithm picks the per-round optimal arm,
    # averaged over several independent simulations.
    n_arms = data.shape[1]
    optimal_selections = np.zeros((num_simulations, data.shape[0]))
    optimal_action = np.argmax(data, axis=1)          # best arm at each round
    for i in range(num_simulations):
        q = np.zeros(n_arms)
        n = np.zeros(n_arms)
        a_ts = np.ones(n_arms)
        b_ts = np.ones(n_arms)
        A = [np.eye(n_arms) for _ in range(n_arms)]
        b = [np.zeros(n_arms) for _ in range(n_arms)]
        selections = []
        for j in range(data.shape[0]):
            if algo == 'softmax':
                action = softmax_select(q, tau)
            elif algo == 'epsilon-greedy':
                action = epsilon_greedy(q, epsilon)
            elif algo == 'beta-thompson':
                action = beta_thompson(a_ts, b_ts)
            elif algo == 'ucb':
                action = ucb(q, n, c)
            elif algo == 'linucb':
                action = linucb(data[j], A, b, alpha)
            selections.append(action == optimal_action[j])
            reward = data[j][action]
            if algo in ('softmax', 'epsilon-greedy'):
                q[action] += (reward - q[action]) / (n[action] + 1)
                n[action] += 1
            elif algo == 'beta-thompson':
                a_ts, b_ts = beta_thompson_update(a_ts, b_ts, action, float(reward > 0))
            elif algo == 'ucb':
                q, n = ucb_update(q, n, action, reward)
            elif algo == 'linucb':
                A, b = linucb_update(data[j], A, b, action, reward)
        optimal_selections[i] = selections
    avg_optimal_selections = np.mean(optimal_selections, axis=0)
    return avg_optimal_selections
```
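A plotting sketch for requirement (3), assuming matplotlib; the moving-average window is an arbitrary illustrative choice that smooths the 0/1 indicator into a visible probability trend.
```python
import numpy as np
import matplotlib.pyplot as plt

def plot_optimal_selection(avg_optimal_selections, label, window=100):
    # Smooth the averaged 0/1 optimal-selection indicator with a moving
    # average so the probability trend over rounds is visible.
    kernel = np.ones(window) / window
    smoothed = np.convolve(avg_optimal_selections, kernel, mode='valid')
    plt.plot(smoothed, label=label)
    plt.xlabel('Round')
    plt.ylabel('Probability of selecting the optimal arm')
    plt.legend()
    plt.show()
```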
4. Visualizing the variance of each algorithm's arm selections
```python
import matplotlib.pyplot as plt

def plot_arm_variance(algo, data, epsilon=None, tau=None, c=None, alpha=None, num_simulations=10):
    # Count how often each arm is pulled in every simulation, then plot the
    # variance of those counts across simulations for each arm.
    n_arms = data.shape[1]
    arm_counts = np.zeros((num_simulations, n_arms))
    for i in range(num_simulations):
        q = np.zeros(n_arms)
        n = np.zeros(n_arms)
        a_ts = np.ones(n_arms)
        b_ts = np.ones(n_arms)
        A = [np.eye(n_arms) for _ in range(n_arms)]
        b = [np.zeros(n_arms) for _ in range(n_arms)]
        for j in range(data.shape[0]):
            if algo == 'softmax':
                action = softmax_select(q, tau)
            elif algo == 'epsilon-greedy':
                action = epsilon_greedy(q, epsilon)
            elif algo == 'beta-thompson':
                action = beta_thompson(a_ts, b_ts)
            elif algo == 'ucb':
                action = ucb(q, n, c)
            elif algo == 'linucb':
                action = linucb(data[j], A, b, alpha)
            reward = data[j][action]
            if algo in ('softmax', 'epsilon-greedy'):
                q[action] += (reward - q[action]) / (n[action] + 1)
                n[action] += 1
            elif algo == 'beta-thompson':
                a_ts, b_ts = beta_thompson_update(a_ts, b_ts, action, float(reward > 0))
            elif algo == 'ucb':
                q, n = ucb_update(q, n, action, reward)
            elif algo == 'linucb':
                A, b = linucb_update(data[j], A, b, action, reward)
            arm_counts[i, action] += 1
    plt.bar(range(n_arms), np.var(arm_counts, axis=0))
    plt.xlabel('Arm')
    plt.ylabel('Variance of selection counts across simulations')
    plt.show()
```
2.3 Generating the dataset and running the tests
Generate the dataset:
```python
import numpy as np

np.random.seed(123)
n_rows = 10000
n_cols = 110
data = np.zeros((n_rows, n_cols))
# Column 1: random integers in 1-10; column 2: random integers in 0-1.
data[:, 0] = np.random.randint(1, 11, size=n_rows)
data[:, 1] = np.random.randint(2, size=n_rows)
# Columns 3-110: sparse values in 0-99, with 0 occurring 80% of the time.
for i in range(2, n_cols):
    data[:, i] = np.random.choice(100, size=n_rows, p=[0.8] + [0.2 / 99] * 99)
# Save as integers so data.txt contains whole numbers only.
np.savetxt('data.txt', data, fmt='%d')
```
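An optional sanity check (not part of the assignment) that the sparse columns really are roughly 80% zeros:
```python
import numpy as np

check = np.loadtxt('data.txt')
zero_fraction = np.mean(check[:, 2:] == 0)
print(f"fraction of zeros in columns 3-110: {zero_fraction:.3f}")  # expect about 0.8
```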
Test the algorithms:
```python
data = np.loadtxt('data.txt')
# Hyperparameters
epsilon = 0.1
tau = 0.1
c = 1
alpha = 1.0           # LinUCB exploration parameter
num_simulations = 10
num_rounds = 1000     # the task sets the number of test rounds to 1000
data = data[:num_rounds]
# Softmax
rewards_softmax, total_reward_softmax = simulate('softmax', data, tau=tau)
avg_round_rewards_softmax = simulate_round_rewards('softmax', data, tau=tau, num_simulations=num_simulations)
avg_optimal_selections_softmax = simulate_optimal_selection('softmax', data, tau=tau, num_simulations=num_simulations)
plot_arm_variance('softmax', data, tau=tau, num_simulations=num_simulations)
# Epsilon-Greedy (the remaining algorithms follow the same pattern)
rewards_epsilon_greedy, total_reward_epsilon_greedy = simulate('epsilon-greedy', data, epsilon=epsilon)
```
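To produce the deliverables asked for in the task (the average reward of the four algorithms and the per-round cumulative reward for a single simulation over 1000 rounds), a sketch along the following lines should work; it reuses the hyperparameter values and functions defined above, and Softmax can be added to the dictionary in the same way if desired.
```python
import numpy as np
import matplotlib.pyplot as plt

# One simulation per algorithm; each entry is (per-round rewards, total reward).
runs = {
    'Epsilon-Greedy': simulate('epsilon-greedy', data, epsilon=epsilon),
    'Beta Thompson': simulate('beta-thompson', data),
    'UCB': simulate('ucb', data, c=c),
    'LinUCB': simulate('linucb', data, alpha=alpha),
}

# Average reward of each algorithm over the test rounds.
for name, (rewards, total_reward) in runs.items():
    print(f"{name}: average reward = {total_reward / len(rewards):.3f}")

# Cumulative reward per round for a single simulation.
for name, (rewards, _) in runs.items():
    plt.plot(np.cumsum(rewards), label=name)
plt.xlabel('Round')
plt.ylabel('Cumulative reward')
plt.legend()
plt.show()
```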