物品推荐算法(代码修正版)
1. 原理
协同过滤算法一般需要以下几个步骤
- 收集用户偏好
- 找到相似的用户或物品
- 计算推荐
协同过滤算法主要有基于用户的协同过滤和基于物品的协同过滤。
user_based_CF就是把与你有相同爱好的用户所喜欢的物品(并且你还没有评过分)推荐给你
item_based_CF则从物品出发,把与你之前喜欢的物品相似的物品(并且你还没有评过分)推荐给你
2. 代码
数据下载:https://grouplens.org/datasets/movielens/
使用其中的 ml-100k 数据集(下面的代码读取 u1.base 作为训练集、u1.test 作为测试集)
import numpy as np


def load_data(path):
    """Load a tab-separated ratings file into an integer array.

    Every non-blank line must contain at least three tab-separated integer
    fields: ``user_id``, ``movie_id`` and ``rating``.  Any further fields
    (for example a timestamp) are ignored.  Blank lines are skipped.

    Args:
        path: Path of the ratings file to read.

    Returns:
        A ``numpy.ndarray`` of dtype ``uint16`` and shape ``(n_rows, 3)``
        whose rows are ``[user_id, movie_id, rating]``.  An empty file yields
        an array of shape ``(0, 3)``.

    Raises:
        ValueError: If a non-blank line has fewer than three fields or a
            field cannot be parsed as an integer.
    """
    rows = []
    with open(path, "rb") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            user_id, movie_id, rating = line.split(b"\t")[:3]
            rows.append([int(user_id), int(movie_id), int(rating)])
    # NOTE: the uint16 dtype is kept for compatibility with existing callers;
    # values above 65535 do not fit in it.
    return np.array(rows, dtype=np.int64).astype(np.uint16).reshape(-1, 3)


class item_based_cf:
    """Item-based collaborative filter using Pearson similarity of movies.

    Attributes set by ``__init__``:
        train_data: The training rows as a ``numpy.ndarray``.
        movie_user: ``{movie_id: {user_id: rating}}``.
        user_movie: ``{user_id: {movie_id: rating}}``.
        avg: Mean of all training ratings; used as the fallback prediction.
        similarity: Symmetric cache ``{m1: {m2: similarity}}`` filled lazily
            by ``sim_cal``.
    """

    def __init__(self, train_data):
        """Index the training ratings by movie and by user.

        Args:
            train_data: 2-D array-like whose first three columns are
                ``user_id``, ``movie_id`` and ``rating``.
        """
        self.train_data = np.array(train_data)
        print("the input data size is ", self.train_data.shape)
        self.movie_user = {}
        self.user_movie = {}
        # Global mean rating: returned when no usable neighbour exists.
        self.avg = np.mean(self.train_data[:, 2])
        for user_id, movie_id, rating in self.train_data[:, :3]:
            self.movie_user.setdefault(movie_id, {})[user_id] = rating
            self.user_movie.setdefault(user_id, {})[movie_id] = rating
        self.similarity = {}

    def _remember(self, m1, m2, value):
        """Store ``value`` symmetrically in the similarity cache and return it."""
        self.similarity.setdefault(m1, {})[m2] = value
        self.similarity.setdefault(m2, {})[m1] = value
        return value

    def sim_cal(self, m1, m2):
        """Return the similarity between movies ``m1`` and ``m2``.

        The value is the Pearson correlation of the ratings given by users
        who rated both movies.  Two special cases:

        * no user rated both movies -> ``1``;
        * the common ratings of either movie have zero spread (this includes
          the case of a single common user) -> ``0``.

        Results are cached in ``self.similarity`` for both ``(m1, m2)`` and
        ``(m2, m1)``, so each pair is computed at most once.

        Args:
            m1: First movie id.
            m2: Second movie id.

        Returns:
            The similarity as a float in ``[-1, 1]``.
        """
        # Membership (not a -1 sentinel) marks "already computed": -1 is a
        # legitimate Pearson value and must be cacheable too.
        cached = self.similarity.get(m1, {}).get(m2)
        if cached is not None:
            return cached

        # .get() keeps unseen movies out of movie_user.
        raters_m1 = self.movie_user.get(m1, {})
        raters_m2 = self.movie_user.get(m2, {})
        common = [user for user in raters_m1 if user in raters_m2]
        n = len(common)
        if n == 0:
            return self._remember(m1, m2, 1.0)

        # Cast to float so products/squares cannot overflow a small
        # unsigned integer dtype.
        s1 = np.array([raters_m1[user] for user in common], dtype=float)
        s2 = np.array([raters_m2[user] for user in common], dtype=float)
        sum1 = np.sum(s1)
        sum2 = np.sum(s2)

        # n * variance of each rating vector; zero means "all ratings equal".
        spread1 = np.sum(s1 ** 2) - sum1 ** 2 / n
        spread2 = np.sum(s2 ** 2) - sum2 ** 2 / n
        if spread1 <= 0 or spread2 <= 0:
            # Correlation is undefined without spread; treat as unrelated.
            return self._remember(m1, m2, 0.0)

        num = np.sum(s1 * s2) - (sum1 * sum2 / n)
        den = np.sqrt(spread1 * spread2)
        return self._remember(m1, m2, num / den)

    def predict(self, user_id, moive_id):
        """Predict the rating ``user_id`` would give to ``moive_id``.

        The prediction is the similarity-weighted average of the user's own
        ratings, using only movies whose similarity to the target is
        non-negative.

        Args:
            user_id: Id of the user.
            moive_id: Id of the movie to score (parameter spelling kept for
                keyword-argument compatibility).

        Returns:
            The predicted rating, or ``self.avg`` when the user is unknown or
            the accumulated similarity weight is zero.
        """
        sim_accumulate = 0.0
        rat_acc = 0.0
        # Unknown users have no ratings, so they fall through to the average.
        for item, rating in self.user_movie.get(user_id, {}).items():
            sim = self.sim_cal(item, moive_id)
            if sim < 0:
                continue
            rat_acc += sim * rating
            sim_accumulate += sim
        if sim_accumulate == 0:
            return self.avg
        return rat_acc / sim_accumulate

    def test(self, test_X):
        """Predict every row of ``test_X`` and print the RMSE.

        Args:
            test_X: 2-D array-like whose first three columns are
                ``user_id``, ``movie_id`` and the true ``rating``.

        Returns:
            A list with one predicted rating per row of ``test_X``.  For an
            empty input the list is empty and the printed RMSE is ``nan``.
        """
        test_X = np.array(test_X)
        print("the test data size is ", test_X.shape)
        output = [self.predict(row[0], row[1]) for row in test_X]
        sums = sum((pre - row[2]) ** 2 for pre, row in zip(output, test_X))
        count = len(output)
        # Avoid dividing by zero on an empty test set.
        rmse = np.sqrt(sums / count) if count else float("nan")
        print("the rmse on test data is ", rmse)
        return output


if __name__ == "__main__":
    train_data = load_data('/home/ubuntu/音乐/Collaborative Filtering/ml-100k/u1.base')
    test_data = load_data('/home/ubuntu/音乐/Collaborative Filtering/ml-100k/u1.test')
    ibc = item_based_cf(train_data)
    pred = ibc.test(test_data)