Python实现推荐系统案例
时间: 2023-12-04 21:53:36 浏览: 77
以下是一个基于Python的简单推荐系统案例，使用了基于用户的协同过滤算法：
步骤1:收集数据
我们使用MovieLens数据集，该数据集包含大量用户对电影的评分。代码读取其中的 `u.data` 文件，每行由制表符分隔的四个字段组成：用户ID、电影ID、评分和时间戳（时间戳在本例中不使用）。
步骤2:数据预处理
首先，我们将数据集读入一个字典，其中键是用户ID，值是一个嵌套字典（键是电影ID，值是该用户对该电影的评分）。然后，我们按给定比例将评分随机划分为训练集和测试集。
步骤3:计算相似度
我们使用皮尔逊相关系数计算用户之间的相似度。
步骤4:预测评分
对于每个用户，我们找到与其最相似的若干用户（近邻），并使用这些近邻的评分来预测该用户尚未评分的电影的评分。
步骤5:评估模型
我们使用均方根误差(RMSE)评估模型的性能。
下面是完整的Python代码:
```python
import math
import operator
import random
# 1. Collect data
def load_data(file_path, encoding='utf-8'):
    """Load tab-separated ratings into a nested ``{user: {movie: rating}}`` dict.

    Each non-blank line must hold at least three tab-separated fields:
    user ID, movie ID and rating. Any further fields on the line are ignored.
    User and movie IDs are kept as strings; ratings are converted to float.

    Args:
        file_path: Path of the ratings file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A dict mapping each user ID to a dict of ``{movie ID: rating}``.

    Raises:
        ValueError: If a non-blank line has fewer than three fields or its
            rating field is not a valid number.
    """
    data = {}
    with open(file_path, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.rstrip('\r\n')
            # Blank lines (typically a trailing newline at end of file)
            # carry no rating and must not abort the whole load.
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) < 3:
                raise ValueError(
                    'line %d: expected at least 3 tab-separated fields, got %d'
                    % (line_no, len(fields))
                )
            user, movie, rating = fields[:3]
            data.setdefault(user, {})[movie] = float(rating)
    return data
# 2. Preprocess data
def split_data(data, ratio=0.8, seed=None):
    """Randomly split individual ratings into a training and a test set.

    Every (user, movie, rating) triple is assigned independently: it goes to
    the training set with probability ``ratio`` and to the test set
    otherwise. A user may therefore appear in either or both results.

    Args:
        data: Nested ``{user: {movie: rating}}`` dict. It is not modified.
        ratio: Probability that a rating lands in the training set.
        seed: Optional seed for a private random generator, giving a
            reproducible split. When ``None`` the module-level ``random``
            state is used, so a prior ``random.seed(...)`` is honoured.

    Returns:
        A ``(train_data, test_data)`` tuple of nested dicts with the same
        layout as ``data``.
    """
    rng = random if seed is None else random.Random(seed)
    train_data = {}
    test_data = {}
    for user, movies in data.items():
        for movie, rating in movies.items():
            # random() is in [0, 1), so ratio=1.0 keeps everything in the
            # training set and ratio=0.0 sends everything to the test set.
            target = train_data if ratio > rng.random() else test_data
            target.setdefault(user, {})[movie] = rating
    return train_data, test_data
# 3. 计算相似度
def pearson_sim(user1, user2):
shared_items = {}
for item in user1:
if item in user2:
shared_items[item] = 1
n = len(shared_items)
if n == 0:
return 0
sum1 = sum([user1[item] for item in shared_items])
sum2 = sum([user2[item] for item in shared_items])
sum1_sq = sum([pow(user1[item], 2) for item in shared_items])
sum2_sq = sum([pow(user2[item], 2) for item in shared_items])
p_sum = sum([user1[item] * user2[item] for item in shared_items])
num = p_sum - (sum1 * sum2 / n)
den = math.sqrt((sum1_sq - pow(sum1, 2) / n) * (sum2_sq - pow(sum2, 2) / n))
if den == 0:
return 0
return num / den
# 4. Predict ratings
def get_recommendations(data, user, k=10, n=10, similarity=None):
    """Recommend items ``user`` has not rated yet, with predicted ratings.

    The ``k`` most similar other users with a strictly positive similarity
    are used as neighbours. For every item that at least one neighbour has
    rated and ``user`` has not, the predicted rating is the
    similarity-weighted average of the neighbours' ratings for that item.

    Args:
        data: Nested ``{user: {item: rating}}`` dict.
        user: ID of the user to recommend for.
        k: Maximum number of neighbours to use.
        n: Maximum number of recommendations to return.
        similarity: Callable ``(ratings_a, ratings_b) -> float``. Defaults
            to ``pearson_sim``.

    Returns:
        A list of at most ``n`` ``(item, predicted_rating)`` tuples sorted by
        predicted rating, highest first. The list is empty when ``user`` is
        unknown, has no ratings, or has no positively similar neighbour.
    """
    if similarity is None:
        similarity = pearson_sim

    own_ratings = data.get(user)
    if not own_ratings:
        # Cold start: nothing to compare against.
        return []

    sim_scores = {
        other: similarity(own_ratings, ratings)
        for other, ratings in data.items()
        if other != user
    }
    # Neighbours with zero or negative similarity are dropped: they would
    # contribute zero or negative weights and corrupt the weighted average.
    positive = [pair for pair in sim_scores.items() if pair[1] > 0]
    positive.sort(key=operator.itemgetter(1), reverse=True)
    neighbours = positive[:k]

    weighted_sums = {}
    weight_totals = {}
    for other, weight in neighbours:
        for item, rating in data[other].items():
            # Only items the user has NOT rated are candidates.
            if item in own_ratings:
                continue
            weighted_sums[item] = weighted_sums.get(item, 0.0) + rating * weight
            weight_totals[item] = weight_totals.get(item, 0.0) + weight

    # Dividing by the total weight keeps predictions on the rating scale,
    # regardless of how many neighbours rated the item.
    recommendations = {
        item: weighted_sums[item] / weight_totals[item]
        for item in weighted_sums
    }
    sorted_recommendations = sorted(
        recommendations.items(), key=operator.itemgetter(1), reverse=True
    )
    return sorted_recommendations[:n]
# 5. Evaluate the model
def rmse(predictions, targets):
    """Return the root-mean-square error between two equal-length sequences.

    Args:
        predictions: Sequence of predicted values.
        targets: Sequence of true values, aligned with ``predictions``.

    Returns:
        The RMSE as a float.

    Raises:
        ValueError: If the sequences differ in length or are empty.
    """
    if len(predictions) != len(targets):
        raise ValueError(
            'predictions and targets must have the same length (%d != %d)'
            % (len(predictions), len(targets))
        )
    if not predictions:
        raise ValueError('cannot compute RMSE of empty sequences')
    squared_error = sum((p - t) ** 2 for p, t in zip(predictions, targets))
    return math.sqrt(squared_error / len(predictions))


def _top_neighbours(train_data, user, k, similarity):
    """Return up to ``k`` ``(other_user, similarity)`` pairs, most similar first."""
    own_ratings = train_data[user]
    scored = [
        (other, similarity(own_ratings, ratings))
        for other, ratings in train_data.items()
        if other != user
    ]
    # Non-positive similarities would yield zero or negative weights.
    positive = [pair for pair in scored if pair[1] > 0]
    positive.sort(key=operator.itemgetter(1), reverse=True)
    return positive[:k]


def _predict_rating(train_data, movie, neighbours, fallback):
    """Return the similarity-weighted mean neighbour rating of ``movie``."""
    weighted_sum = 0.0
    weight_total = 0.0
    for other, weight in neighbours:
        rating = train_data[other].get(movie)
        if rating is None:
            continue
        weighted_sum += rating * weight
        weight_total += weight
    if weight_total == 0:
        # No neighbour rated this movie.
        return fallback
    return weighted_sum / weight_total


def evaluate_model(train_data, test_data, k=10, similarity=None):
    """Return the RMSE of predicted ratings against the held-out ratings.

    For each user in ``test_data`` that also has training ratings, the ``k``
    most similar training users are computed once. Each held-out movie is
    then predicted as the similarity-weighted average of those neighbours'
    ratings for that movie. When no neighbour rated the movie, the user's
    mean training rating is used instead. Users without any training
    ratings are skipped.

    Args:
        train_data: Nested ``{user: {movie: rating}}`` dict used for
            similarity and prediction.
        test_data: Nested dict of held-out ratings with the same layout.
        k: Maximum number of neighbours per user.
        similarity: Callable ``(ratings_a, ratings_b) -> float``. Defaults
            to ``pearson_sim``.

    Returns:
        The RMSE over all predicted held-out ratings.

    Raises:
        ValueError: If no held-out rating could be predicted (for example
            when no test user appears in ``train_data``).
    """
    if similarity is None:
        similarity = pearson_sim

    predictions = []
    targets = []
    for user, held_out in test_data.items():
        own_ratings = train_data.get(user)
        if not own_ratings:
            continue
        # Neighbours depend only on the user, so compute them once rather
        # than once per held-out movie.
        neighbours = _top_neighbours(train_data, user, k, similarity)
        user_mean = sum(own_ratings.values()) / len(own_ratings)
        for movie, target in held_out.items():
            predictions.append(
                _predict_rating(train_data, movie, neighbours, user_mean)
            )
            targets.append(target)
    return rmse(predictions, targets)
if __name__ == '__main__':
    # Script entry point: load the ratings from 'u.data' (resolved relative
    # to the current working directory), hold out part of them, and print
    # the error measured on the held-out part.
    data = load_data('u.data')
    train_data, test_data = split_data(data)
    error = evaluate_model(train_data, test_data)
    print('RMSE:', error)
```
该代码输出RMSE(均方根误差)作为模型性能的度量。
阅读全文