基于TensorFlow的FM实现

520jefferson 2021-07-13

展开全文

在之前的文章「从矩阵分解到FM的演进、FM如何用于召回和排序以及实现说明」中介绍了FM算法的演化史，主要是从协同过滤CF到矩阵分解MF，再到线性模型LR，最后介绍了FM用于召回和精排的情况。这一篇文章主要是FM的代码实现，以MovieLens数据集为例进行说明。

MovieLens数据集（ml-100k）包含了10万条评分记录，其中涉及了943个用户和1682个item，这里使用的是<user, item, rate>这样的数据形式。

我这里导入的是tf2.x的环境，但代码是按tf1.x的写法写的，因为工作中还是用1.x比较多。那么如何在tf2.x的环境中使用1.x的功能呢？先关闭eager execution，再通过tf.compat.v1调用1.x的接口：

tf.compat.v1.disable_eager_execution()

另外一个需要注意的点是，每个epoch结束后都要保存一次模型：

for epoch in range(epochs):
    ... ...
    # 保存模型
    self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath))

OK，看代码实现，首先定义工具类，主要包含了三个部分的功能：

加载数据

   def load_dataset(self, train_path, test_path, mode):
        cols = ['user', 'item', 'rating', 'timestamp']
        train = pd.read_csv(train_path, delimiter='\t', names=cols)
        test = pd.read_csv(test_path, delimiter='\t', names=cols)
        print(train.user.values)
        X_train, ix = self.vectorize_dic({'users': train.user.values, 'items': train.item.values})
        X_test, ix = self.vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])
        y1 = train.rating.values
        y_train = np.zeros((len(y1), 1))
        y2 = test.rating.values
        y_test = np.zeros((len(y2), 1))
        if mode == 'regression':
            y_train = y1.copy()
            y_test = y2.copy()
        elif mode == 'classification':
            y_train[np.where(y1 == 5)] = 1
            y_train[np.where(y1 < 5)] = -1
            y_test[np.where(y2 == 5)] = 1
            y_test[np.where(y2 < 5)] = -1
        return X_train, y_train, X_test, y_test

创建一个scipy csr matrix

    def vectorize_dic(self, dic, ix=None, p=None):
        '''
        Creates a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature)
        :param dic: dictionay of feature lists. Keys are the name of features
        :param ix:  index generator(default None)
        :param p: dimension of feature space (number of columns in the sparse matrix)
        :return:
        '''

        if (ix == None):
            d = count(0)
            ix = defaultdict(lambda: next(d))
        # 样本数
        n = len(list(dic.values())[0])
        # 特征数
        g = len(list(dic.keys()))
        # 生成矩阵拆平之后的总长度
        nz = n * g

        col_ix = np.empty(nz, dtype=int)

        i = 0
        for k, lis in dic.items():
            # 从i位置开始，间隔 g
            col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
            i += 1
            
        # np.repeat(np.arange(0, 10), 3)
        # array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9])
        row_ix = np.repeat(np.arange(0, n), g)
        data = np.ones(nz)

        if (p == None):
            p = len(ix)
        ixx = np.where(col_ix < p)
        # 关于矩阵压缩 csr.csr_matrix参考：https://cloud.tencent.com/developer/article/1099721
        return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix

batch数据的返回

  def get_batcher(self, X_, y_=None, batch_size=None):
        n_samples = X_.shape[0]

        if batch_size is None:
            batch_size = n_samples

        for i in range(0, n_samples, batch_size):
            upper_bound = min(i + batch_size, n_samples)
            ret_x = X_[i:upper_bound]
            ret_y = None
            if y_ is not None:
                ret_y = y_[i:i + batch_size]
                yield (ret_x, ret_y)

然后是定义FM模型，FM模型主要包含了几个部分：

加载数据

    def load_data(self):
      self.X_train, self.y_train, self.X_test, self.y_test = self.util.load_dataset(self.trainPath, self.testPath, self.mode)
      self.X_train = self.X_train.todense()
      self.X_test = self.X_test.todense()
      print('Train data shape: ', self.X_train.shape)
      print(self.X_train[:3])
      print('Test data shape: ', self.X_test.shape)
      print(self.X_test[:3])

创建模型

    def build_model(self):
      self.row_num, self.col_num = self.X_train.shape

      # design matrix
      self.X = tf.compat.v1.placeholder('float', shape=[None, self.col_num])
      # target vector
      self.y = tf.compat.v1.placeholder('float', shape=[None, 1])

      # 偏置和权重
      w0 = tf.Variable(tf.zeros([1]))
      W = tf.Variable(tf.zeros([self.col_num]))

      # 初始化向量矩阵
      self.V = tf.Variable(tf.random.normal([self.k, self.col_num], stddev=0.01))

      # 创建结果值
      y_hat = tf.Variable(tf.zeros([self.row_num, 1]))

      # 线性部分
      linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, self.X), 1, keepdims=True))
      # 特征交叉部分 参考 https://mp.weixin.qq.com/s/mJpNwEDGqS7u-vtZ54zV6A 推导过程
      pair_interaction = (tf.multiply(0.5,
                                       tf.reduce_sum(
                                           tf.subtract(
                                               tf.pow(tf.matmul(self.X, tf.transpose(self.V)), 2),
                                               tf.matmul(tf.pow(self.X, 2), tf.transpose(tf.pow(self.V, 2)))
                                           ),
                                           1, keepdims=True)))
      self.y_hat = tf.add(linear_terms, pair_interaction)

      # lambda_w = tf.constant(0.001, name='lambda_w')
      # lambda_v = tf.constant(0.001, name='lambda_v')
      lambda_w = tf.constant(0.00, name='lambda_w')
      lambda_v = tf.constant(0.00, name='lambda_v')
      l2_norm = tf.add(
          tf.reduce_sum(tf.multiply(lambda_w, tf.pow(W, 2))),
          tf.reduce_sum(tf.multiply(lambda_v, tf.pow(self.V, 2)))
      )

      if self.mode == 'regression':
          self.error = tf.reduce_mean(tf.square(tf.subtract(self.y, self.y_hat)))
          self.loss = tf.add(self.error, l2_norm)
      elif self.mode == 'classification':
          print(self.y.get_shape().as_list())
          print(self.y_hat.get_shape().as_list())
          self.error = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=self.y_hat))
          self.loss = tf.add(self.error, l2_norm)
          print(self.loss.get_shape().as_list())
          print(l2_norm.get_shape().as_list())

      # self.optimizer = tf.train.AdamOptimizer(beta1=0.9, beta2=0.5).minimize(self.loss)
      self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(self.loss)

模型训练

    def train(self):
      epochs = 5
      batch_size = 256
      self.init = tf.compat.v1.global_variables_initializer()
      self.sess = tf.compat.v1.Session()
      self.saver = tf.compat.v1.train.Saver()  # 在构建网络后使用
      self.sess.run(self.init)
      for epoch in range(epochs):
          perm = np.random.permutation(self.X_train.shape[0])
          cnt = 0
          for batchX, batchY in self.util.get_batcher(self.X_train[perm], self.y_train[perm], batch_size):
              _, loss = self.sess.run((self.optimizer, self.loss), feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)})
              if cnt == 1:
                  print('Epoch: %d, Loss: %.3f' % (epoch + 1, loss))
              cnt += 1
          # 保存模型
          self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath))

模型评估

    def evaluate(self):
      # 加载模型
      with tf.compat.v1.Session() as sess:
          sess.run(self.init)
          self.saver.restore(sess, '{}/tf_with_1x'.format(self.modelpath))
          print('模型加载成功 ...')
          if self.mode == 'regression':
              errors = []
              for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                  errors.append(sess.run(self.error, feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)}))
              RMSE = np.sqrt(np.array(errors).mean())
              print('RMSE: ', RMSE)
          elif self.mode == 'classification':
              pred = np.zeros((len(self.X_test), 1))
              for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                  logits = sess.run(self.y_hat, feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)})
                  y_hat = self.util.sigmoid(logits)
                  pred[np.where(y_hat > 0.5)] = 1
                  pred[np.where(y_hat < 0.5)] = -1
              print('Accuracy: ', np.mean(self.y_test == pred))
      sess.close()