"Deep & Cross Network for Ad Click Predictions": the DCN model and Python code
Paper: https://arxiv.org/pdf/1708.05123.pdf
1 DEEP & CROSS NETWORK (DCN)
1.1 Embedding and Stacking Layer
In CTR prediction, one-hot encoding the categorical features makes the model input extremely sparse and very high-dimensional. A common remedy is therefore to embed each one-hot vector into a dense embedding of fixed length, which also reduces the dimensionality. The embedding is computed as follows:
$$x_{\text{embed},i} = W_{\text{embed},i}\,x_i$$
where $x_{\text{embed},i} \in \mathbb{R}^{n_e}$ is the embedding vector, $x_i$ is the binary input in the i-th category, $W_{\text{embed},i} \in \mathbb{R}^{n_e \times n_v}$ is the corresponding embedding matrix that is optimized together with the other parameters in the network, and $n_e$, $n_v$ are the embedding size and vocabulary size, respectively.
The embeddings are then stacked (concatenated) together with the dense features into a single vector
$$x_0 = \left[x_{\text{embed},1}^\top, \dots, x_{\text{embed},k}^\top, x_{\text{dense}}^\top\right],$$
which is the shared input of the cross network and the deep network.
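As a rough sketch of this step (plain NumPy, with made-up field and embedding sizes; the actual TensorFlow implementation appears in Section 2), embedding two categorical indices and stacking them with the dense features looks like this:

import numpy as np

# Hypothetical sizes: two categorical fields with vocabularies of 4 and 6,
# embedding size 3, and two dense (numeric) features.
vocab_sizes, n_e = [4, 6], 3
W_embed = [np.random.randn(n_e, v) * 0.01 for v in vocab_sizes]  # one matrix per field

def embed_and_stack(cat_indices, dense):
    # selecting column cat_indices[i] of W_embed[i] is equivalent to W_embed[i] @ one_hot
    embeds = [W_embed[i][:, idx] for i, idx in enumerate(cat_indices)]
    return np.concatenate(embeds + [dense])  # x0, length k*n_e + n_dense

x0 = embed_and_stack(cat_indices=[2, 5], dense=np.array([0.7, -1.2]))
print(x0.shape)  # (8,)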
1.2 Cross Network
The l-th cross layer computes
$$x_{l+1} = x_0\,x_l^\top w_l + b_l + x_l = f(x_l, w_l, b_l) + x_l$$
where $x_l, x_{l+1} \in \mathbb{R}^d$ are column vectors denoting the outputs from the l-th and (l+1)-th cross layers, respectively; $w_l, b_l \in \mathbb{R}^d$ are the weight and bias parameters of the l-th layer. Each cross layer adds back its input after the feature crossing $f$, so the mapping function $f(x_l, w_l, b_l)$ fits the residual $x_{l+1} - x_l$.
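Note that $x_0 x_l^\top w_l$ equals $x_0\,(x_l^\top w_l)$, so a cross layer can be evaluated in O(d) time and adds only 2d parameters. A minimal NumPy sketch of a stack of cross layers (vectors kept 1-D; this mirrors the formula, not the explicit-outer-product TensorFlow code below):

import numpy as np

d = 8                      # length of x0 (stacked embeddings + dense features)
x0 = np.random.randn(d)

def cross_layer(x_l, w_l, b_l):
    # x0 @ x_l.T @ w_l == x0 * (x_l . w_l): a rank-one update of cost O(d)
    return x0 * np.dot(x_l, w_l) + b_l + x_l

x_l = x0
for _ in range(3):         # three cross layers, matching cross_layer_num below
    w_l, b_l = np.random.randn(d) * 0.01, np.zeros(d)
    x_l = cross_layer(x_l, w_l, b_l)
print(x_l.shape)           # (8,)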
1.3 Deep Network
Each hidden layer of the deep network is a standard fully-connected layer:
$$h_{l+1} = f(W_l h_l + b_l)$$
where $h_l \in \mathbb{R}^{n_l}$ and $h_{l+1} \in \mathbb{R}^{n_{l+1}}$ are the l-th and (l+1)-th hidden layers, respectively; $W_l \in \mathbb{R}^{n_{l+1} \times n_l}$ and $b_l \in \mathbb{R}^{n_{l+1}}$ are the parameters of the l-th deep layer; and $f(\cdot)$ is the ReLU function.
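The deep branch is just a plain fully-connected stack; for completeness, the same kind of sketch (sizes are illustrative):

import numpy as np

def deep_network(x0, layer_sizes=(32, 32)):
    h = x0
    for n_out in layer_sizes:
        n_in = h.shape[0]
        W = np.random.randn(n_out, n_in) * np.sqrt(2.0 / (n_in + n_out))
        b = np.zeros(n_out)
        h = np.maximum(W @ h + b, 0.0)   # ReLU
    return h

h_L2 = deep_network(np.random.randn(8))
print(h_L2.shape)  # (32,)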
1.4 Combination Layer
The outputs of the two networks are concatenated and fed to a standard logistic layer:
$$p = \sigma\!\left(\left[x_{L_1}^\top,\; h_{L_2}^\top\right] w_{\text{logits}}\right)$$
where $x_{L_1} \in \mathbb{R}^d$ and $h_{L_2} \in \mathbb{R}^m$ are the outputs from the cross network and the deep network, respectively, $w_{\text{logits}} \in \mathbb{R}^{d+m}$ is the weight vector for the combination layer, and $\sigma(x) = 1/(1 + \exp(-x))$.
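The combination layer is a single logistic unit over the concatenated cross and deep outputs; a sketch under the same assumptions as above:

import numpy as np

def combine(x_L1, h_L2, w_logits, bias=0.0):
    z = np.concatenate([x_L1, h_L2]) @ w_logits + bias
    return 1.0 / (1.0 + np.exp(-z))      # predicted click probability

d, m = 8, 32
p = combine(np.random.randn(d), np.random.randn(m), np.random.randn(d + m) * 0.01)
print(0.0 < p < 1.0)  # True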
The loss is the log loss with an L2 penalty on the network weights:
$$\text{loss} = -\frac{1}{N}\sum_{i=1}^{N}\Big[y_i \log(p_i) + (1 - y_i)\log(1 - p_i)\Big] + \lambda \sum_{l} \lVert w_l \rVert^2$$
where $p_i$ is the predicted probability of the i-th sample, $y_i$ is its true label, $N$ is the number of samples, and $\lambda$ is the L2 regularization coefficient.
2 PYTHON IMPLEMENTATION
The code below has three parts: feature preprocessing (FeatureDictionary and DataParser), the DCN model itself (written against the TensorFlow 1.x API), and a small training script that runs stratified K-fold training on a CSV dataset with the columns listed further down.
import numpy as np
import pandas as pd
class FeatureDictionary():
def __init__(self, trainfile=None, testfile=None, numeric_cols=[], ignore_cols=[], cate_cols=[]):
self.trainfile = trainfile
self.testfile = testfile
self.numeric_cols = numeric_cols
self.ignore_cols = ignore_cols
self.cate_cols = cate_cols
self.gen_feat_dict()
def gen_feat_dict(self):
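# Build one global index over every distinct value of every categorical column;
# numeric and ignored columns are skipped. trainfile/testfile are passed in as DataFrames here.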
df = pd.concat([self.trainfile, self.testfile])
self.feat_dict = {}
self.feat_len = {}
tc = 0
for col in df.columns:
if col in self.ignore_cols or col in self.numeric_cols:
continue
else:
us = df[col].unique()
self.feat_dict[col] = dict(zip(us, range(tc, len(us) + tc)))
tc += len(us)
self.feat_dim = tc  # total dimension of the one-hot encoded categorical features
class DataParser():
def __init__(self, feat_dict):
self.feat_dict = feat_dict
def parse(self, infile=None, df=None, has_label=False):
assert not ((infile is None) and (df is None))
assert not ((infile is not None) and (df is not None))
if infile is None:
dfi = df.copy()
else:
dfi = pd.read_csv(infile)
if has_label:
y = dfi['target'].values.tolist()
dfi.drop(['id', 'target'], axis=1, inplace=True)
else:
ids = dfi['id'].values.tolist()
dfi.drop(['id'], axis=1, inplace=True)
numeric_Xv = dfi[self.feat_dict.numeric_cols].values.tolist() # continuous (numeric) features
dfi.drop(self.feat_dict.numeric_cols, axis=1, inplace=True) # keep only the categorical features
dfv = dfi.copy()
for col in dfi.columns:
if col in self.feat_dict.ignore_cols:
dfi.drop(col, axis=1, inplace=True) # drop ignored columns
dfv.drop(col, axis=1, inplace=True)
continue
else:
dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
dfv[col] = 1.
cate_Xi = dfi.values.tolist() # per-sample categorical feature indices
cate_Xv = dfv.values.tolist() # per-sample categorical feature values (all 1.0)
if has_label:
return cate_Xi, cate_Xv, numeric_Xv, y
else:
return cate_Xi, cate_Xv, numeric_Xv, ids
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from time import time  # fit() calls time() directly
class DCN():
def __init__(self, cate_feature_size, field_size,numeric_feature_size,
embedding_size=8,
deep_layers=[32, 32], dropout_deep=[0.5, 0.5, 0.5],
deep_layers_activation=tf.nn.relu,
epoch=10, batch_size=256,
learning_rate=0.001, optimizer_type="adam",
batch_norm=0, batch_norm_decay=0.995,
verbose=False, random_seed=2016,
loss_type="logloss", eval_metric=roc_auc_score,
l2_reg=0.0, greater_is_better=True,cross_layer_num=3):
assert loss_type in ["logloss", "mse"], \
"loss_type can be either 'logloss' for classification task or 'mse' for regression task"
self.cate_feature_size = cate_feature_size
self.numeric_feature_size = numeric_feature_size
self.field_size = field_size
self.embedding_size = embedding_size
self.total_size = self.field_size * self.embedding_size + self.numeric_feature_size
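# total_size = length of x0: all field embeddings stacked together, plus the raw numeric features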
self.deep_layers = deep_layers
self.cross_layer_num = cross_layer_num
self.dropout_dep = dropout_deep
self.deep_layers_activation = deep_layers_activation
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer_type
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.greater_is_better = greater_is_better
self.train_result,self.valid_result = [],[]
self._init_graph()
def _init_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
tf.set_random_seed(self.random_seed)
self.feat_index = tf.placeholder(tf.int32,
shape=[None,None],
name='feat_index')
self.feat_value = tf.placeholder(tf.float32,
shape=[None,None],
name='feat_value')
self.numeric_value = tf.placeholder(tf.float32,[None,None],name='num_value')
self.label = tf.placeholder(tf.float32,shape=[None,1],name='label')
self.dropout_keep_deep = tf.placeholder(tf.float32,shape=[None],name='dropout_deep_deep')
self.train_phase = tf.placeholder(tf.bool,name='train_phase')
self.weights = self._initialize_weights()
# model
self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'],self.feat_index) # N * F * K
feat_value = tf.reshape(self.feat_value,shape=[-1,self.field_size,1])
self.embeddings = tf.multiply(self.embeddings,feat_value)
self.x0 = tf.concat([self.numeric_value,
tf.reshape(self.embeddings,shape=[-1,self.field_size * self.embedding_size])]
,axis=1)
# deep part
self.y_deep = tf.nn.dropout(self.x0,self.dropout_keep_deep[0])
for i in range(0,len(self.deep_layers)):
self.y_deep = tf.add(tf.matmul(self.y_deep,self.weights["deep_layer_%d" %i]), self.weights["deep_bias_%d"%i])
self.y_deep = self.deep_layers_activation(self.y_deep)
self.y_deep = tf.nn.dropout(self.y_deep,self.dropout_keep_deep[i+1])
# cross_part
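# x_{l+1} = x0 * x_l^T * w_l + b_l + x_l; the outer product x0 * x_l^T is built
# explicitly as a (total_size, total_size) matrix, then contracted with w_l via tensordot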
self._x0 = tf.reshape(self.x0, (-1, self.total_size, 1))
x_l = self._x0
for l in range(self.cross_layer_num):
x_l = tf.tensordot(tf.matmul(self._x0, x_l, transpose_b=True),
self.weights["cross_layer_%d" % l],1) + self.weights["cross_bias_%d" % l] + x_l
self.cross_network_out = tf.reshape(x_l, (-1, self.total_size))
# concat_part
concat_input = tf.concat([self.cross_network_out, self.y_deep], axis=1)
self.out = tf.add(tf.matmul(concat_input,self.weights['concat_projection']),self.weights['concat_bias'])
# loss
if self.loss_type == "logloss":
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
# l2 regularization on weights
if self.l2_reg > 0:
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights["concat_projection"])
for i in range(len(self.deep_layers)):
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights["deep_layer_%d" % i])
for i in range(self.cross_layer_num):
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights["cross_layer_%d" % i])
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
self.loss)
#init
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
# number of params
total_parameters = 0
for variable in self.weights.values():
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
if self.verbose > 0:
print("#params: %d" % total_parameters)
def _initialize_weights(self):
weights = dict()
#embeddings
weights['feature_embeddings'] = tf.Variable(
tf.random_normal([self.cate_feature_size,self.embedding_size],0.0,0.01),
name='feature_embeddings')
weights['feature_bias'] = tf.Variable(tf.random_normal([self.cate_feature_size,1],0.0,1.0),name='feature_bias')
#deep layers
num_layer = len(self.deep_layers)
glorot = np.sqrt(2.0/(self.total_size + self.deep_layers[0]))
weights['deep_layer_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(self.total_size,self.deep_layers[0])),dtype=np.float32
)
weights['deep_bias_0'] = tf.Variable(
np.random.normal(loc=0,scale=glorot,size=(1,self.deep_layers[0])),dtype=np.float32
)
for i in range(1,num_layer):
glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
weights["deep_layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["deep_bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
for i in range(self.cross_layer_num):
weights["cross_layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.total_size,1)),
dtype=np.float32)
weights["cross_bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.total_size,1)),
dtype=np.float32) # 1 * layer[i]
# final concat projection layer
input_size = self.total_size + self.deep_layers[-1]
glorot = np.sqrt(2.0/(input_size + 1))
weights['concat_projection'] = tf.Variable(np.random.normal(loc=0,scale=glorot,size=(input_size,1)),dtype=np.float32)
weights['concat_bias'] = tf.Variable(tf.constant(0.01),dtype=np.float32)
return weights
def get_batch(self,Xi,Xv,Xv2,y,batch_size,index):
start = index * batch_size
end = (index + 1) * batch_size
end = end if end < len(y) else len(y)
return Xi[start:end],Xv[start:end],Xv2[start:end],[[y_] for y_ in y[start:end]]
# shuffle four lists simultaneously (same permutation for each)
def shuffle_in_unison_scary(self, a, b, c,d):
rng_state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(rng_state)
np.random.shuffle(b)
np.random.set_state(rng_state)
np.random.shuffle(c)
np.random.set_state(rng_state)
np.random.shuffle(d)
def predict(self, Xi, Xv,Xv2,y):
"""
:param Xi: list of list of feature indices of each sample in the dataset
:param Xv: list of list of feature values of each sample in the dataset
:param Xv2: list of list of numeric feature values of each sample
:param y: labels (used only to evaluate the loss)
:return: loss evaluated on the given data (dropout disabled)
"""
# evaluate the loss with dropout turned off
feed_dict = {self.feat_index: Xi,
self.feat_value: Xv,
self.numeric_value: Xv2,
self.label: y,
self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
self.train_phase: False}  # evaluation mode
loss = self.sess.run([self.loss], feed_dict=feed_dict)
return loss
def fit_on_batch(self,Xi,Xv,Xv2,y):
feed_dict = {self.feat_index:Xi,
self.feat_value:Xv,
self.numeric_value:Xv2,
self.label:y,
self.dropout_keep_deep:self.dropout_dep,
self.train_phase:True}
loss,opt = self.sess.run([self.loss,self.optimizer],feed_dict=feed_dict)
return loss
def fit(self, cate_Xi_train, cate_Xv_train,numeric_Xv_train, y_train,
cate_Xi_valid=None, cate_Xv_valid=None, numeric_Xv_valid=None,y_valid=None,
early_stopping=False, refit=False):
"""
:param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
indi_j is the feature index of feature field j of sample i in the training set
:param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
vali_j is the feature value of feature field j of sample i in the training set
vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
:param y_train: label of each sample in the training set
:param Xi_valid: list of list of feature indices of each sample in the validation set
:param Xv_valid: list of list of feature values of each sample in the validation set
:param y_valid: label of each sample in the validation set
:param early_stopping: perform early stopping or not
:param refit: refit the model on the train+valid dataset or not
:return: None
"""
print(len(cate_Xi_train))
print(len(cate_Xv_train))
print(len(numeric_Xv_train))
print(len(y_train))
has_valid = cate_Xv_valid is not None
for epoch in range(self.epoch):
t1 = time()
self.shuffle_in_unison_scary(cate_Xi_train, cate_Xv_train,numeric_Xv_train, y_train)
total_batch = int(len(y_train) / self.batch_size)
for i in range(total_batch):
cate_Xi_batch, cate_Xv_batch,numeric_Xv_batch, y_batch = self.get_batch(cate_Xi_train, cate_Xv_train, numeric_Xv_train,y_train, self.batch_size, i)
self.fit_on_batch(cate_Xi_batch, cate_Xv_batch,numeric_Xv_batch, y_batch)
if has_valid:
y_valid = np.array(y_valid).reshape((-1,1))
loss = self.predict(cate_Xi_valid, cate_Xv_valid, numeric_Xv_valid, y_valid)
print("epoch",epoch,"loss",loss)
TRAIN_FILE = "./data/train.csv"
TEST_FILE = "./data/test.csv"
SUB_DIR = "output"
NUM_SPLITS = 3
RANDOM_SEED = 2017
# types of columns of the dataset dataframe
CATEGORICAL_COLS = [
'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
'ps_car_10_cat', 'ps_car_11_cat',
]
NUMERIC_COLS = [
# # binary
# "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
# "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
# "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
# "ps_ind_17_bin", "ps_ind_18_bin",
# "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
# "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
# numeric
"ps_reg_01", "ps_reg_02", "ps_reg_03",
"ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# feature engineering
"missing_feat", "ps_car_13_x_ps_reg_03",
]
IGNORE_COLS = [
"id", "target",
"ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
"ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
"ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
"ps_calc_13", "ps_calc_14",
"ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
"ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
def load_data():
dfTrain = pd.read_csv(TRAIN_FILE)
dfTest = pd.read_csv(TEST_FILE)
def preprocess(df):
cols = [c for c in df.columns if c not in ["id", "target"]]
df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
df["ps_car_13_x_ps_reg_03"] = df["ps_car_13"] * df["ps_reg_03"]
return df
dfTrain = preprocess(dfTrain)
dfTest = preprocess(dfTest)
cols = [c for c in dfTrain.columns if c not in ["id", "target"]]
cols = [c for c in cols if (not c in IGNORE_COLS)]
X_train = dfTrain[cols].values
y_train = dfTrain["target"].values
X_test = dfTest[cols].values
ids_test = dfTest["id"].values
return dfTrain, dfTest, X_train, y_train, X_test, ids_test,
def run_base_model_dcn(dfTrain, dfTest, folds, dcn_params):
fd = FeatureDictionary(dfTrain,dfTest,numeric_cols=NUMERIC_COLS,
ignore_cols=IGNORE_COLS,
cate_cols = CATEGORICAL_COLS)
print(fd.feat_dim)
print(fd.feat_dict)
data_parser = DataParser(feat_dict=fd)
cate_Xi_train, cate_Xv_train, numeric_Xv_train,y_train = data_parser.parse(df=dfTrain, has_label=True)
cate_Xi_test, cate_Xv_test, numeric_Xv_test,ids_test = data_parser.parse(df=dfTest)
dcn_params["cate_feature_size"] = fd.feat_dim
dcn_params["field_size"] = len(cate_Xi_train[0])
dcn_params['numeric_feature_size'] = len(NUMERIC_COLS)
_get = lambda x, l: [x[i] for i in l]
for i, (train_idx, valid_idx) in enumerate(folds):
cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_,y_train_ = _get(cate_Xi_train, train_idx), _get(cate_Xv_train, train_idx),_get(numeric_Xv_train, train_idx), _get(y_train, train_idx)
cate_Xi_valid_, cate_Xv_valid_, numeric_Xv_valid_,y_valid_ = _get(cate_Xi_train, valid_idx), _get(cate_Xv_train, valid_idx),_get(numeric_Xv_train, valid_idx), _get(y_train, valid_idx)
dcn = DCN(**dcn_params)
dcn.fit(cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_,y_train_, cate_Xi_valid_, cate_Xv_valid_, numeric_Xv_valid_,y_valid_)
#dfTrain = pd.read_csv(config.TRAIN_FILE,nrows=10000,index_col=None).to_csv(config.TRAIN_FILE,index=False)
#dfTest = pd.read_csv(config.TEST_FILE,nrows=2000,index_col=None).to_csv(config.TEST_FILE,index=False)
dfTrain, dfTest, X_train, y_train, X_test, ids_test = load_data()
print('load_data_over')
folds = list(StratifiedKFold(n_splits=NUM_SPLITS, shuffle=True, random_state=RANDOM_SEED).split(X_train, y_train))
print('process_data_over')
dcn_params = {
"embedding_size": 8,
"deep_layers": [32, 32],
"dropout_deep": [0.5, 0.5, 0.5],
"deep_layers_activation": tf.nn.relu,
"epoch": 30,
"batch_size": 1024,
"learning_rate": 0.001,
"optimizer_type": "adam",
"batch_norm": 1,
"batch_norm_decay": 0.995,
"l2_reg": 0.01,
"verbose": True,
"random_seed": RANDOM_SEED,
"cross_layer_num":3
}
print('start train')
run_base_model_dcn(dfTrain, dfTest, folds, dcn_params)