Deep Learning: Classifying Toutiao Articles with TextCNN
Dataset: Toutiao Articles
The data comes from a resource found online. GitHub repo: Toutiao dataset.
The preprocessing strategy for this dataset is word segmentation and keyword extraction: the keywords extracted by the original crawler are combined with keywords I extract from the article titles, and the merged set serves as the feature words for classification. A rough sketch of the title-keyword step follows.
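As a sketch of that step (jieba and its TF-IDF extractor are my assumption here; the post does not say which tools the original crawler used):

import jieba.analyse

def extract_title_keywords(title, top_k=5):
    # TF-IDF-ranked keywords from the segmented title; these would later be
    # merged with the crawler-provided keywords to form the feature words.
    # jieba is an assumed choice of segmenter, not confirmed by this post.
    return jieba.analyse.extract_tags(title, topK=top_k)

print(extract_title_keywords('京城最值得你来场文化之旅的博物馆'))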
Model
The model follows a tutorial found online:
import tensorflow as tf


class TextCNN(object):
    """
    TextCNN model: embedding layer, parallel conv + max-pool branches,
    dropout, and a softmax output layer.
    """
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
                 filter_sizes, num_filters, l2_reg_lambda=0.0):
        # Placeholders for input word ids, one-hot labels, and dropout keep probability
        self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        # Running total of the L2 regularization loss
        l2_loss = tf.constant(0.0)

        # Embedding layer: map word ids to dense vectors, then add a channel dimension
        with tf.name_scope('embedding'):
            self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0, dtype=tf.float32), name='W')
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # One convolution + max-pool branch per filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1, dtype=tf.float32), name='W')
                b = tf.Variable(tf.constant(0.1, shape=[num_filters], dtype=tf.float32), name='b')
                conv = tf.nn.conv2d(self.embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding='VALID', name='conv')
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                # Max-pool over the remaining time dimension, leaving one value per filter
                pooled = tf.nn.max_pool(h, ksize=[1, sequence_length - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name='pool')
                pooled_outputs.append(pooled)

        # Concatenate the pooled features of all branches and flatten
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        with tf.name_scope('dropout'):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, keep_prob=self.dropout_keep_prob)

        # Fully connected output layer producing unnormalized class scores
        with tf.name_scope('output'):
            W = tf.get_variable('W', shape=[num_filters_total, num_classes],
                                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b')
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name='scores')
            self.predictions = tf.argmax(self.scores, 1, name='predictions')

        # Mean cross-entropy loss plus L2 regularization
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')

        self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.loss)
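Before wiring the class into training, a quick check confirms the pooling math: each conv-maxpool branch collapses to a single num_filters-wide vector, so the flattened feature width is num_filters * len(filter_sizes). A minimal sketch with illustrative hyperparameters, run in a throwaway graph so it does not clash with the training graph built later:

# Illustrative shape check; sequence_length=20 and num_classes=15 are
# placeholder values, not fixed by the original post.
with tf.Graph().as_default():
    model = TextCNN(sequence_length=20, num_classes=15, vocab_size=1000,
                    embedding_size=64, filter_sizes=[3, 4, 5], num_filters=16)
    print(model.h_pool_flat.shape)  # (?, 48): 3 filter sizes * 16 filters each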
import os

from dl_learn.utils import split_toutiao_to_train_test
from dl_learn.utils import read_vocab


def main(batch_size=32, train_epochs=1000, dropout_prob=0.5,
         save_path='./model/toutiao/textcnn/cnn.ckpt'):
    # Load the vocabulary
    vocab = read_vocab()
    vocab_size = len(vocab)
    train_x, train_y, test_x, test_y, label = split_toutiao_to_train_test(test_size=0.02)
    num_classes = len(label)
    train_rows = train_x.shape[0]
    sequence_length = train_x.shape[-1]

    # Build the TextCNN model
    textcnn = TextCNN(sequence_length=sequence_length, num_classes=num_classes,
                      vocab_size=vocab_size, embedding_size=64,
                      filter_sizes=[3, 4, 5], num_filters=16, l2_reg_lambda=0.01)

    # Make sure the checkpoint directory exists before saving
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=3)
        for epoch in range(1, 1 + train_epochs):
            # Regenerate the data every epoch (a tf.data.Dataset pipeline
            # could handle the batching instead)
            if epoch > 1:
                train_x, train_y, test_x, test_y, label = split_toutiao_to_train_test(test_size=0.02)
            step_start = 0
            go_batch = True
            all_steps = 1
            while go_batch:
                # Slice the next mini-batch out of the training set
                x = train_x[step_start:step_start + batch_size]
                y = train_y[step_start:step_start + batch_size]
                if x.shape[0] == 0:
                    print('---------next epoch---------')
                    go_batch = False
                else:
                    step_start += batch_size
                    _, train_loss, train_accuracy = sess.run(
                        [textcnn.train_op, textcnn.loss, textcnn.accuracy],
                        feed_dict={textcnn.input_x: x, textcnn.input_y: y,
                                   textcnn.dropout_keep_prob: dropout_prob})
                    # Evaluate on the test split every 10 steps, with dropout disabled
                    if all_steps % 10 == 0:
                        test_loss, test_accuracy = sess.run(
                            [textcnn.loss, textcnn.accuracy],
                            feed_dict={textcnn.input_x: test_x, textcnn.input_y: test_y,
                                       textcnn.dropout_keep_prob: 1.0})
                        print('epoch:{},steps:{},train loss:{},train accuracy:{},test loss:{},test accuracy:{}'.format(
                            epoch, all_steps, train_loss, train_accuracy, test_loss, test_accuracy))
                    all_steps += 1
            # Checkpoint once per epoch, keeping the three most recent
            saver.save(sess, save_path=save_path, global_step=epoch)


if __name__ == '__main__':
    # The original tf.app.run(main(batch_size=128)) would call main() and then
    # hand its None return value to tf.app.run; invoke main() directly instead
    main(batch_size=128)
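Once training has saved checkpoints, inference can rebuild the graph with the same hyperparameters and restore the latest one. A minimal sketch (this predict helper is my illustration, not part of the original code; encoded_x is assumed to be an int array of shape [batch, sequence_length] produced by the same vocabulary/padding pipeline used for training):

def predict(encoded_x, num_classes, vocab_size, checkpoint_dir='./model/toutiao/textcnn'):
    # Rebuild the TextCNN graph, restore the newest checkpoint, and run the
    # argmax predictions with dropout disabled.
    with tf.Graph().as_default():
        model = TextCNN(sequence_length=encoded_x.shape[-1], num_classes=num_classes,
                        vocab_size=vocab_size, embedding_size=64,
                        filter_sizes=[3, 4, 5], num_filters=16)
        with tf.Session() as sess:
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
            return sess.run(model.predictions,
                            feed_dict={model.input_x: encoded_x,
                                       model.dropout_keep_prob: 1.0})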
split_toutiao_to_train_test() is a data-loading function: its argument is the fraction of the data to hold out as the test split, and it returns the training X, training Y, test X, test Y, and the set of class labels.
read_vocab() loads the complete vocabulary; mine contains about 600,000 words.
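For readers without the dl_learn package, here is a minimal sketch of a split function with this interface (an illustration, not the actual dl_learn.utils code; it assumes the encoded samples already sit in NumPy arrays X of word ids and Y of one-hot labels):

import numpy as np

def split_toutiao_to_train_test_sketch(X, Y, labels, test_size=0.02):
    # Shuffle before splitting so repeated calls (one per epoch in the
    # training loop above) yield a fresh train/test partition each time.
    idx = np.random.permutation(X.shape[0])
    n_test = int(len(idx) * test_size)
    test_idx, train_idx = idx[:n_test], idx[n_test:]
    return X[train_idx], Y[train_idx], X[test_idx], Y[test_idx], labels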