物体推荐算法(代码修正版)

1. 原理

  
   协同过滤算法一般包含以下几个步骤:

  • 收集用户偏好
  • 找到相似的用户或物品
  • 计算推荐

协同过滤算法主要有基于用户的协同过滤和基于物品的协同过滤。

   基于用户的协同过滤(user_based_CF):先找到与你兴趣相似的用户,再把他们喜欢、而你还没有评过分的物品推荐给你。

物体推荐算法(代码修正版)

   基于物品的协同过滤(item_based_CF)则从物品出发:把与你之前喜欢的物品相似的物品推荐给你。

物体推荐算法(代码修正版)

2. 代码

数据下载:https://grouplens.org/datasets/movielens/

本文使用其中的 ml-100k 数据集:下面的代码读取 u1.base 作为训练集,u1.test 作为测试集。

import numpy as np

def load_data(path):
    """Load a tab-separated ratings file into an ``(n, 3)`` uint16 array.

    Every non-blank line must hold exactly four tab-separated integer
    fields: ``user_id``, ``movie_id``, ``rating`` and ``timestamp``.
    The timestamp is discarded.

    Args:
        path: Path of the ratings file.

    Returns:
        ``numpy.ndarray`` of dtype ``uint16`` and shape ``(n, 3)`` whose
        columns are ``user_id``, ``movie_id`` and ``rating``.  An input
        without data lines yields shape ``(0, 3)``.

    Raises:
        ValueError: If a line does not have four fields, a field is not an
            integer, or a value does not fit into ``uint16``.
    """
    rows = []
    with open(path, 'rb') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline) carry no record.
                continue
            user_id, movie_id, rating, _timestamp = line.split(b'\t')
            rows.append((int(user_id), int(movie_id), int(rating)))

    # Parse into a wide integer type first so that out-of-range values are
    # reported instead of being narrowed unchecked.
    table = np.array(rows, dtype=np.int64).reshape(-1, 3)
    bounds = np.iinfo(np.uint16)
    if table.size and (table.min() < bounds.min or table.max() > bounds.max):
        raise ValueError(
            "%s contains values outside the uint16 range [%d, %d]"
            % (path, bounds.min, bounds.max)
        )
    return table.astype(np.uint16)

class item_based_cf:
    """Item-based collaborative filtering rating predictor.

    A rating for ``(user, movie)`` is predicted as the similarity-weighted
    average of the ratings that user gave to other movies.  Similarity
    between two movies is the Pearson correlation of the ratings given by
    the users who rated both.

    Attributes:
        train_data: The training rows as a NumPy array; columns 0, 1 and 2
            are read as user id, movie id and rating.
        movie_user: ``{movie_id: {user_id: rating}}``.
        user_movie: ``{user_id: {movie_id: rating}}``.
        avg: Mean of all training ratings; the fallback prediction.
        similarity: Symmetric cache ``{m1: {m2: similarity}}`` filled
            lazily by ``sim_cal``.
    """

    def __init__(self, train_data):
        """Index the training ratings by movie and by user.

        Args:
            train_data: 2-D array-like whose first three columns are
                user id, movie id and rating.
        """
        self.train_data = np.array(train_data)
        print("the input data size is ", self.train_data.shape)
        self.movie_user = {}
        self.user_movie = {}
        # Global mean rating, returned by predict() when nothing better
        # is available.
        self.avg = np.mean(self.train_data[:, 2])
        for user_id, moive_id, rating in self.train_data[:, :3]:
            self.movie_user.setdefault(moive_id, {})[user_id] = rating
            self.user_movie.setdefault(user_id, {})[moive_id] = rating
        self.similarity = {}

    def _remember(self, m1, m2, value):
        """Store ``value`` as the similarity of ``m1``/``m2`` in both directions."""
        self.similarity.setdefault(m1, {})[m2] = value
        self.similarity.setdefault(m2, {})[m1] = value
        return value

    def sim_cal(self, m1, m2):
        """Return the similarity of movies ``m1`` and ``m2``.

        The result is cached in ``self.similarity`` (symmetrically), so
        each pair is computed at most once.

        Args:
            m1: First movie id.
            m2: Second movie id.

        Returns:
            The Pearson correlation of the ratings from users who rated
            both movies; ``1`` when no user rated both; ``0`` when the
            correlation is undefined because either rating set has zero
            variance.
        """
        # Membership test rather than a -1 sentinel: -1 is itself a valid
        # Pearson correlation and must be cacheable.
        cached = self.similarity.get(m1, {}).get(m2)
        if cached is not None:
            return cached

        # .get() keeps unknown movies from being inserted into movie_user.
        raters_1 = self.movie_user.get(m1, {})
        raters_2 = self.movie_user.get(m2, {})
        shared = [user for user in raters_1 if user in raters_2]

        n = len(shared)
        if n == 0:
            # NOTE(review): movies without any common rater are treated as
            # fully similar (1).  Kept as-is; confirm this is intended
            # rather than 0.
            return self._remember(m1, m2, 1)

        # float64 avoids any narrow-integer arithmetic on the ratings.
        s1 = np.array([raters_1[u] for u in shared], dtype=np.float64)
        s2 = np.array([raters_2[u] for u in shared], dtype=np.float64)

        sum1 = s1.sum()
        sum2 = s2.sum()
        sum1_sq = (s1 ** 2).sum()
        sum2_sq = (s2 ** 2).sum()
        p_sum = (s1 * s2).sum()

        # Pearson correlation: covariance over the product of the spreads.
        num = p_sum - sum1 * sum2 / n
        spread = (sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n)

        # Zero spread is common (e.g. a single shared rater, or identical
        # ratings).  ``not spread > 0`` also rejects tiny negative values
        # caused by rounding, and NaN, which would otherwise yield NaN.
        if not spread > 0:
            return self._remember(m1, m2, 0)

        return self._remember(m1, m2, float(num / np.sqrt(spread)))

    def predict(self, user_id, moive_id):
        """Predict the rating ``user_id`` would give to ``moive_id``.

        Args:
            user_id: Id of the user.
            moive_id: Id of the movie to score.

        Returns:
            The similarity-weighted average of the user's known ratings,
            using only movies with non-negative similarity to
            ``moive_id``.  Falls back to the global mean ``self.avg`` when
            the user is unknown or the similarities sum to zero.
        """
        sim_accumulate = 0.0
        rat_acc = 0.0
        # .get(): a user absent from the training data has no ratings and
        # falls through to the global-mean fallback instead of KeyError.
        for item, rating in self.user_movie.get(user_id, {}).items():
            sim = self.sim_cal(item, moive_id)
            if sim < 0:
                # Negatively correlated movies are ignored.
                continue
            rat_acc += sim * rating
            sim_accumulate += sim
        if sim_accumulate == 0:
            return self.avg
        return rat_acc / sim_accumulate

    def test(self, test_X):
        """Predict every row of ``test_X`` and print the RMSE.

        Args:
            test_X: 2-D array-like whose first three columns are user id,
                movie id and true rating.  May be empty.

        Returns:
            List with one predicted rating per row of ``test_X``.
        """
        test_X = np.array(test_X)
        output = []
        sums = 0.0
        print("the test data size is ", test_X.shape)
        for row in test_X:
            pre = self.predict(row[0], row[1])
            output.append(pre)
            sums += (pre - float(row[2])) ** 2
        count = test_X.shape[0]
        # An empty test set has no defined error; avoid dividing by zero.
        rmse = np.sqrt(sums / count) if count else float("nan")
        print("the rmse on test data is ", rmse)
        return output


# Load the MovieLens 100k u1 split: training rows first, then test rows.
train_data, test_data = map(load_data, (
    '/home/ubuntu/音乐/Collaborative Filtering/ml-100k/u1.base',
    '/home/ubuntu/音乐/Collaborative Filtering/ml-100k/u1.test',
))

# Fit on the training rows, then predict the test rows (prints the RMSE).
ibc = item_based_cf(train_data)
pred = ibc.test(test_data)