Named entity recognition for diseases, chemicals, and genes/proteins: code implementation

What the NER code does: it trains and evaluates a neural sequence-labeling model that tags mentions of chemicals, diseases, species, and genes/proteins in biomedical text.

Sources of the training datasets:
| Entity type | Corpus | Text genre | Text type | No. sentences | No. tokens | No. unique tokens | No. annotations | No. unique annotations |
|---|---|---|---|---|---|---|---|---|
| Chemicals | BioSemantics | Patent | Full-text | 163219 | 6608020 | 173193 | 386110 | 72782 |
| | CDR | Scientific Article | Abstract | 14166 | 326506 | 22083 | 15915 | 2623 |
| | CHEMDNER patent | Patent | Abstract | 35679 | 1495524 | 60850 | 65685 | 20630 |
| Diseases | Arizona Disease | Scientific Article | Abstract | 2804 | 74346 | 8133 | 3425 | 1266 |
| | CDR | Scientific Article | Abstract | 14166 | 326506 | 22083 | 12617 | 3113 |
| | miRNA | Scientific Article | Abstract | 2676 | 66419 | 7638 | 2159 | 606 |
| | NCBI Disease | Scientific Article | Abstract | 7645 | 173283 | 12534 | 6881 | 2136 |
| | Variome | Scientific Article | Full-text | 6274 | 177119 | 12307 | 5904 | 451 |
| Species | CellFinder | Scientific Article | Full-text | 2234 | 66519 | 7584 | 479 | 42 |
| | Linneaus | Scientific Article | Full-text | 19048 | 491253 | 33132 | 4259 | 419 |
| | LocText | Scientific Article | Abstract | 956 | 22756 | 4335 | 276 | 37 |
| | miRNA | Scientific Article | Abstract | 2676 | 66419 | 7638 | 722 | 41 |
| | S800 | Scientific Article | Abstract | 8356 | 198091 | 19992 | 3708 | 1503 |
| | Variome | Scientific Article | Full-text | 6274 | 177119 | 12307 | 182 | 8 |
| Genes/proteins | BioCreative II GM | Scientific Article | Abstract | 20384 | 514146 | 49365 | 24596 | 15841 |
| | BioInfer | Scientific Article | Abstract | 1147 | 34187 | 5200 | 4378 | 1041 |
| | CellFinder | Scientific Article | Full-text | 2234 | 66519 | 7584 | 1750 | 734 |
| | DECA | Scientific Article | Abstract | 5492 | 139771 | 14053 | 6324 | 2127 |
| | FSU-PRGE | Scientific Article | Abstract | 35361 | 914717 | 453634 | 59489 | 27363 |
| | IEPA | Scientific Article | Abstract | 241 | 15365 | 2871 | 1110 | 130 |
| | LocText | Scientific Article | Abstract | 956 | 22756 | 4335 | 1395 | 549 |
| | miRNA | Scientific Article | Abstract | 2676 | 66419 | 7638 | 1058 | 370 |
| | Variome | Scientific Article | Full-text | 6274 | 177119 | 12307 | 4617 | 458 |
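The post does not show how these corpora are preprocessed before training. A common setup (an assumption here, not something stated above) is to convert each corpus to a CoNLL-style file with one token per line and a BIO tag in the last column; a minimal reader for that format could look like the following sketch:

```python
# Minimal sketch of a CoNLL/BIO reader; the file layout and the helper name are
# assumptions for illustration, not part of the original post.
def read_conll_bio(filepath):
    """Return (sentences, labels): lists of token lists and matching BIO tag lists (e.g. B-Disease)."""
    sentences, labels = [], []
    tokens, tags = [], []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:                  # blank line = sentence boundary
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            columns = line.split()
            tokens.append(columns[0])     # surface token
            tags.append(columns[-1])      # BIO tag in the last column
        if tokens:                        # last sentence if the file lacks a trailing blank line
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels
```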
Model architecture:

The model is a BiLSTM-CRF: a character-level bidirectional LSTM builds a vector representation of each token from its characters, this vector is concatenated with the token embedding, and the concatenated vector is fed to a token-level bidirectional LSTM followed by a CRF layer.

Model code implementation:

```python
import tensorflow as tf
import re
import time
import utils_tf
import utils_nlp
import os
import pickle

def bidirectional_LSTM(input, hidden_state_dimension, initializer, sequence_length=None, output_sequence=True):
    with tf.variable_scope("bidirectional_LSTM"):
        if sequence_length is None:
            batch_size = 1
            sequence_length = tf.shape(input)[1]
            sequence_length = tf.expand_dims(sequence_length, axis=0, name='sequence_length')
        else:
            batch_size = tf.shape(sequence_length)[0]

        lstm_cell = {}
        initial_state = {}
        for direction in ["forward", "backward"]:
            with tf.variable_scope(direction):
                # LSTM cell
                lstm_cell[direction] = tf.contrib.rnn.CoupledInputForgetGateLSTMCell(hidden_state_dimension, forget_bias=1.0, initializer=initializer, state_is_tuple=True)
                # initial state: http://stackoverflow.com/questions/38441589/tensorflow-rnn-initial-state
                initial_cell_state = tf.get_variable("initial_cell_state", shape=[1, hidden_state_dimension], dtype=tf.float32, initializer=initializer)
                initial_output_state = tf.get_variable("initial_output_state", shape=[1, hidden_state_dimension], dtype=tf.float32, initializer=initializer)
                c_states = tf.tile(initial_cell_state, tf.stack([batch_size, 1]))
                h_states = tf.tile(initial_output_state, tf.stack([batch_size, 1]))
                initial_state[direction] = tf.contrib.rnn.LSTMStateTuple(c_states, h_states)

        # sequence_length must be provided for tf.nn.bidirectional_dynamic_rnn due to internal bug
        outputs, final_states = tf.nn.bidirectional_dynamic_rnn(lstm_cell["forward"],
                                                                lstm_cell["backward"],
                                                                input,
                                                                dtype=tf.float32,
                                                                sequence_length=sequence_length,
                                                                initial_state_fw=initial_state["forward"],
                                                                initial_state_bw=initial_state["backward"])
        if output_sequence:
            outputs_forward, outputs_backward = outputs
            output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence')
        else:
            # max pooling
            # outputs_forward, outputs_backward = outputs
            # output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence')
            # output = tf.reduce_max(output, axis=1, name='output')
            # last pooling
            final_states_forward, final_states_backward = final_states
            output = tf.concat([final_states_forward[1], final_states_backward[1]], axis=1, name='output')
    return output
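
# Illustrative use of bidirectional_LSTM (comment only; shapes match the character-level
# call made in EntityLSTM below, the only place where output_sequence=False is used):
#   embedded_characters: [num_tokens, max_token_length, character_embedding_dimension]
#   input_token_lengths:  [num_tokens] (true character length of each token)
#   char_repr = bidirectional_LSTM(embedded_characters, char_lstm_dim, initializer,
#                                  sequence_length=input_token_lengths, output_sequence=False)
#   -> char_repr: [num_tokens, 2 * char_lstm_dim], the concatenated final forward/backward
#      hidden states ("last pooling"); with output_sequence=True the full output sequence
#      of shape [batch, time, 2 * hidden] is returned instead.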

class EntityLSTM(object):
    """
    An LSTM architecture for named entity recognition.
    Uses a character embedding layer followed by an LSTM to generate a vector representation of each token from its characters.
    The character-based vector is then concatenated with the token embedding vector and fed to another LSTM, followed by a CRF layer.
    """
    def __init__(self, dataset, parameters):
        self.verbose = False

        # Placeholders for input, output and dropout
        self.input_token_indices = tf.placeholder(tf.int32, [None], name="input_token_indices")
        self.input_label_indices_vector = tf.placeholder(tf.float32, [None, dataset.number_of_classes], name="input_label_indices_vector")
        self.input_label_indices_flat = tf.placeholder(tf.int32, [None], name="input_label_indices_flat")
        self.input_token_character_indices = tf.placeholder(tf.int32, [None, None], name="input_token_character_indices")
        self.input_token_lengths = tf.placeholder(tf.int32, [None], name="input_token_lengths")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Internal parameters
        initializer = tf.contrib.layers.xavier_initializer()

        if parameters['use_character_lstm']:
            # Character-level LSTM
            # Idea: reshape so that we have a tensor [number_of_token, max_token_length, token_embeddings_size], which we pass to the LSTM

            # Character embedding layer
            with tf.variable_scope("character_embedding"):
                self.character_embedding_weights = tf.get_variable(
                    "character_embedding_weights",
                    shape=[dataset.alphabet_size, parameters['character_embedding_dimension']],
                    initializer=initializer)
                embedded_characters = tf.nn.embedding_lookup(self.character_embedding_weights, self.input_token_character_indices, name='embedded_characters')
                if self.verbose: print("embedded_characters: {0}".format(embedded_characters))
                utils_tf.variable_summaries(self.character_embedding_weights)

            # Character LSTM layer
            with tf.variable_scope('character_lstm') as vs:
                character_lstm_output = bidirectional_LSTM(embedded_characters, parameters['character_lstm_hidden_state_dimension'], initializer,
                                                           sequence_length=self.input_token_lengths, output_sequence=False)
                self.character_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        # Token embedding layer
        with tf.variable_scope("token_embedding"):
            self.token_embedding_weights = tf.get_variable(
                "token_embedding_weights",
                shape=[dataset.vocabulary_size, parameters['token_embedding_dimension']],
                initializer=initializer,
                trainable=not parameters['freeze_token_embeddings'])
            embedded_tokens = tf.nn.embedding_lookup(self.token_embedding_weights, self.input_token_indices)
            utils_tf.variable_summaries(self.token_embedding_weights)

        # Concatenate character LSTM outputs and token embeddings
        if parameters['use_character_lstm']:
            with tf.variable_scope("concatenate_token_and_character_vectors"):
                if self.verbose: print('embedded_tokens: {0}'.format(embedded_tokens))
                token_lstm_input = tf.concat([character_lstm_output, embedded_tokens], axis=1, name='token_lstm_input')
                if self.verbose: print("token_lstm_input: {0}".format(token_lstm_input))
        else:
            token_lstm_input = embedded_tokens

        # Add dropout
        with tf.variable_scope("dropout"):
            token_lstm_input_drop = tf.nn.dropout(token_lstm_input, self.dropout_keep_prob, name='token_lstm_input_drop')
            if self.verbose: print("token_lstm_input_drop: {0}".format(token_lstm_input_drop))
            # https://www.tensorflow.org/api_guides/python/contrib.rnn
            # Prepare data shape to match `rnn` function requirements
            # Current data input shape: (batch_size, n_steps, n_input)
            # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
            token_lstm_input_drop_expanded = tf.expand_dims(token_lstm_input_drop, axis=0, name='token_lstm_input_drop_expanded')
            if self.verbose: print("token_lstm_input_drop_expanded: {0}".format(token_lstm_input_drop_expanded))

        # Token LSTM layer
        with tf.variable_scope('token_lstm') as vs:
            token_lstm_output = bidirectional_LSTM(token_lstm_input_drop_expanded, parameters['token_lstm_hidden_state_dimension'], initializer, output_sequence=True)
            token_lstm_output_squeezed = tf.squeeze(token_lstm_output, axis=0)
            self.token_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        # Needed only if Bidirectional LSTM is used for token level
        with tf.variable_scope("feedforward_after_lstm") as vs:
            W = tf.get_variable(
                "W",
                shape=[2 * parameters['token_lstm_hidden_state_dimension'], parameters['token_lstm_hidden_state_dimension']],
                initializer=initializer)
            b = tf.Variable(tf.constant(0.0, shape=[parameters['token_lstm_hidden_state_dimension']]), name="bias")
            outputs = tf.nn.xw_plus_b(token_lstm_output_squeezed, W, b, name="output_before_tanh")
            outputs = tf.nn.tanh(outputs, name="output_after_tanh")
            utils_tf.variable_summaries(W)
            utils_tf.variable_summaries(b)
            self.token_lstm_variables += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        with tf.variable_scope("feedforward_before_crf") as vs:
            W = tf.get_variable(
                "W",
                shape=[parameters['token_lstm_hidden_state_dimension'], dataset.number_of_classes],
                initializer=initializer)
            b = tf.Variable(tf.constant(0.0, shape=[dataset.number_of_classes]), name="bias")
            scores = tf.nn.xw_plus_b(outputs, W, b, name="scores")
            self.unary_scores = scores
            self.predictions = tf.argmax(self.unary_scores, 1, name="predictions")
            utils_tf.variable_summaries(W)
            utils_tf.variable_summaries(b)
            self.feedforward_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        # CRF layer
        if parameters['use_crf']:
            with tf.variable_scope("crf") as vs:
                # Add start and end tokens
                small_score = -1000.0
                large_score = 0.0
                sequence_length = tf.shape(self.unary_scores)[0]
                unary_scores_with_start_and_end = tf.concat([self.unary_scores, tf.tile(tf.constant(small_score, shape=[1, 2]), [sequence_length, 1])], 1)
                start_unary_scores = [[small_score] * dataset.number_of_classes + [large_score, small_score]]
                end_unary_scores = [[small_score] * dataset.number_of_classes + [small_score, large_score]]
                self.unary_scores = tf.concat([start_unary_scores, unary_scores_with_start_and_end, end_unary_scores], 0)
                start_index = dataset.number_of_classes
                end_index = dataset.number_of_classes + 1
                input_label_indices_flat_with_start_and_end = tf.concat([tf.constant(start_index, shape=[1]), self.input_label_indices_flat, tf.constant(end_index, shape=[1])], 0)

                # Apply CRF layer
                sequence_length = tf.shape(self.unary_scores)[0]
                sequence_lengths = tf.expand_dims(sequence_length, axis=0, name='sequence_lengths')
                unary_scores_expanded = tf.expand_dims(self.unary_scores, axis=0, name='unary_scores_expanded')
                input_label_indices_flat_batch = tf.expand_dims(input_label_indices_flat_with_start_and_end, axis=0, name='input_label_indices_flat_batch')
                if self.verbose: print('unary_scores_expanded: {0}'.format(unary_scores_expanded))
                if self.verbose: print('input_label_indices_flat_batch: {0}'.format(input_label_indices_flat_batch))
                if self.verbose: print("sequence_lengths: {0}".format(sequence_lengths))
                # https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/crf
                # Compute the log-likelihood of the gold sequences and keep the transition params for inference at test time.
                self.transition_parameters = tf.get_variable(
                    "transitions",
                    shape=[dataset.number_of_classes + 2, dataset.number_of_classes + 2],
                    initializer=initializer)
                utils_tf.variable_summaries(self.transition_parameters)
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                    unary_scores_expanded, input_label_indices_flat_batch, sequence_lengths, transition_params=self.transition_parameters)
                self.loss = tf.reduce_mean(-log_likelihood, name='cross_entropy_mean_loss')
                self.accuracy = tf.constant(1)
                self.crf_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        # Do not use CRF layer
        else:
            with tf.variable_scope("crf") as vs:
                self.transition_parameters = tf.get_variable(
                    "transitions",
                    shape=[dataset.number_of_classes + 2, dataset.number_of_classes + 2],
                    initializer=initializer)
                utils_tf.variable_summaries(self.transition_parameters)
                self.crf_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

            # Calculate mean cross-entropy loss
            with tf.variable_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.unary_scores, labels=self.input_label_indices_vector, name='softmax')
                self.loss = tf.reduce_mean(losses, name='cross_entropy_mean_loss')

            with tf.variable_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_label_indices_vector, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')

        self.define_training_procedure(parameters)
        self.summary_op = tf.summary.merge_all()
        self.saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])  # defaults to saving all variables
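
    # Shape recap for the forward pass built in __init__ above (the model processes one
    # sentence at a time, so the effective batch size is 1):
    #   input_token_character_indices [num_tokens, max_token_len] -> embedded_characters
    #     [num_tokens, max_token_len, char_emb_dim] -> character_lstm_output [num_tokens, 2*char_lstm_dim]
    #   input_token_indices [num_tokens] -> embedded_tokens [num_tokens, token_emb_dim]
    #   token_lstm_input [num_tokens, token_emb_dim + 2*char_lstm_dim] -> (expand to a batch of 1)
    #     -> token BiLSTM -> [num_tokens, 2*token_lstm_dim] -> tanh layer [num_tokens, token_lstm_dim]
    #   unary_scores [num_tokens, num_classes]; with the CRF option, start/end rows and columns
    #     are appended, giving [num_tokens + 2, num_classes + 2] before crf_log_likelihood.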

    def define_training_procedure(self, parameters):
        # Define training procedure
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        if parameters['optimizer'] == 'adam':
            self.optimizer = tf.train.AdamOptimizer(parameters['learning_rate'])
        elif parameters['optimizer'] == 'sgd':
            self.optimizer = tf.train.GradientDescentOptimizer(parameters['learning_rate'])
        elif parameters['optimizer'] == 'adadelta':
            self.optimizer = tf.train.AdadeltaOptimizer(parameters['learning_rate'])
        else:
            raise ValueError("The 'optimizer' parameter must be either 'adadelta', 'adam' or 'sgd'.")

        grads_and_vars = self.optimizer.compute_gradients(self.loss)
        if parameters['gradient_clipping_value']:
            # Clip each gradient component element-wise into [-gradient_clipping_value, gradient_clipping_value]
            grads_and_vars = [(tf.clip_by_value(grad, -parameters['gradient_clipping_value'], parameters['gradient_clipping_value']), var)
                              for grad, var in grads_and_vars]
        # By defining a global_step variable and passing it to the optimizer, we let TensorFlow handle the counting of training steps for us.
        # The global step is automatically incremented by one every time train_op is executed.
        self.train_op = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)

    def load_pretrained_token_embeddings(self, sess, dataset, parameters, token_to_vector=None):
        if parameters['token_pretrained_embedding_filepath'] == '':
            return
        # Load embeddings
        start_time = time.time()
        print('Load token embeddings... ', end='', flush=True)
        if token_to_vector is None:
            token_to_vector = utils_nlp.load_pretrained_token_embeddings(parameters)

        initial_weights = sess.run(self.token_embedding_weights.read_value())
        number_of_loaded_word_vectors = 0
        number_of_token_original_case_found = 0
        number_of_token_lowercase_found = 0
        number_of_token_digits_replaced_with_zeros_found = 0
        number_of_token_lowercase_and_digits_replaced_with_zeros_found = 0
        for token in dataset.token_to_index.keys():
            # Look up the token itself first, then progressively more normalized variants
            # (lowercased, digits replaced with zeros, both).
            if token in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[token]
                number_of_token_original_case_found += 1
            elif parameters['check_for_lowercase'] and token.lower() in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[token.lower()]
                number_of_token_lowercase_found += 1
            elif parameters['check_for_digits_replaced_with_zeros'] and re.sub(r'\d', '0', token) in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub(r'\d', '0', token)]
                number_of_token_digits_replaced_with_zeros_found += 1
            elif parameters['check_for_lowercase'] and parameters['check_for_digits_replaced_with_zeros'] and re.sub(r'\d', '0', token.lower()) in token_to_vector.keys():
                initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub(r'\d', '0', token.lower())]
                number_of_token_lowercase_and_digits_replaced_with_zeros_found += 1
            else:
                continue
            number_of_loaded_word_vectors += 1
        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
        print("number_of_token_original_case_found: {0}".format(number_of_token_original_case_found))
        print("number_of_token_lowercase_found: {0}".format(number_of_token_lowercase_found))
        print("number_of_token_digits_replaced_with_zeros_found: {0}".format(number_of_token_digits_replaced_with_zeros_found))
        print("number_of_token_lowercase_and_digits_replaced_with_zeros_found: {0}".format(number_of_token_lowercase_and_digits_replaced_with_zeros_found))
        print('number_of_loaded_word_vectors: {0}'.format(number_of_loaded_word_vectors))
        print("dataset.vocabulary_size: {0}".format(dataset.vocabulary_size))
        sess.run(self.token_embedding_weights.assign(initial_weights))

    def load_embeddings_from_pretrained_model(self, sess, dataset, pretraining_dataset, pretrained_embedding_weights, embedding_type='token'):
        if embedding_type == 'token':
            embedding_weights = self.token_embedding_weights
            index_to_string = dataset.index_to_token
            pretraining_string_to_index = pretraining_dataset.token_to_index
        elif embedding_type == 'character':
            embedding_weights = self.character_embedding_weights
            index_to_string = dataset.index_to_character
            pretraining_string_to_index = pretraining_dataset.character_to_index
        # Load embeddings
        start_time = time.time()
        print('Load {0} embeddings from pretrained model... '.format(embedding_type), end='', flush=True)
        initial_weights = sess.run(embedding_weights.read_value())

        if embedding_type == 'token':
            initial_weights[dataset.UNK_TOKEN_INDEX] = pretrained_embedding_weights[pretraining_dataset.UNK_TOKEN_INDEX]
        elif embedding_type == 'character':
            initial_weights[dataset.PADDING_CHARACTER_INDEX] = pretrained_embedding_weights[pretraining_dataset.PADDING_CHARACTER_INDEX]

        number_of_loaded_vectors = 1
        for index, string in index_to_string.items():
            if index == dataset.UNK_TOKEN_INDEX:
                continue
            if string in pretraining_string_to_index.keys():
                initial_weights[index] = pretrained_embedding_weights[pretraining_string_to_index[string]]
                number_of_loaded_vectors += 1
        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
        print("number_of_loaded_vectors: {0}".format(number_of_loaded_vectors))
        if embedding_type == 'token':
            print("dataset.vocabulary_size: {0}".format(dataset.vocabulary_size))
        elif embedding_type == 'character':
            print("dataset.alphabet_size: {0}".format(dataset.alphabet_size))
        sess.run(embedding_weights.assign(initial_weights))

    def restore_from_pretrained_model(self, parameters, dataset, sess, token_to_vector=None):
        pretraining_dataset = pickle.load(open(os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'), 'rb'))
        pretrained_model_checkpoint_filepath = os.path.join(parameters['pretrained_model_folder'], 'model.ckpt')

        # Assert that the label sets are the same
        # Test set should have the same label set as the pretrained dataset
        assert pretraining_dataset.index_to_label == dataset.index_to_label

        # If the token and character mappings are exactly the same
        if pretraining_dataset.index_to_token == dataset.index_to_token and pretraining_dataset.index_to_character == dataset.index_to_character:
            # Restore the pretrained model
            self.saver.restore(sess, pretrained_model_checkpoint_filepath)  # Works only when the dimensions of tensor variables are matched.
        # If the token and character mappings are different between the pretrained model and the current model
        else:
            # Resize the token and character embedding weights to match them with the pretrained model (required in order to restore the pretrained model)
            utils_tf.resize_tensor_variable(sess, self.character_embedding_weights, [pretraining_dataset.alphabet_size, parameters['character_embedding_dimension']])
            utils_tf.resize_tensor_variable(sess, self.token_embedding_weights, [pretraining_dataset.vocabulary_size, parameters['token_embedding_dimension']])

            # Restore the pretrained model
            self.saver.restore(sess, pretrained_model_checkpoint_filepath)  # Works only when the dimensions of tensor variables are matched.

            # Get pretrained embeddings
            character_embedding_weights, token_embedding_weights = sess.run([self.character_embedding_weights, self.token_embedding_weights])

            # Restore the sizes of token and character embedding weights
            utils_tf.resize_tensor_variable(sess, self.character_embedding_weights, [dataset.alphabet_size, parameters['character_embedding_dimension']])
            utils_tf.resize_tensor_variable(sess, self.token_embedding_weights, [dataset.vocabulary_size, parameters['token_embedding_dimension']])

            # Re-initialize the token and character embedding weights
            sess.run(tf.variables_initializer([self.character_embedding_weights, self.token_embedding_weights]))

            # Load embedding weights from pretrained token embeddings first
            self.load_pretrained_token_embeddings(sess, dataset, parameters, token_to_vector=token_to_vector)

            # Load embedding weights from pretrained model
            self.load_embeddings_from_pretrained_model(sess, dataset, pretraining_dataset, token_embedding_weights, embedding_type='token')
            self.load_embeddings_from_pretrained_model(sess, dataset, pretraining_dataset, character_embedding_weights, embedding_type='character')

            del pretraining_dataset
            del character_embedding_weights
            del token_embedding_weights

        # Get transition parameters
        transition_params_trained = sess.run(self.transition_parameters)

        if not parameters['reload_character_embeddings']:
            sess.run(tf.variables_initializer([self.character_embedding_weights]))
        if not parameters['reload_character_lstm']:
            sess.run(tf.variables_initializer(self.character_lstm_variables))
        if not parameters['reload_token_embeddings']:
            sess.run(tf.variables_initializer([self.token_embedding_weights]))
        if not parameters['reload_token_lstm']:
            sess.run(tf.variables_initializer(self.token_lstm_variables))
        if not parameters['reload_feedforward']:
            sess.run(tf.variables_initializer(self.feedforward_variables))
        if not parameters['reload_crf']:
            sess.run(tf.variables_initializer(self.crf_variables))

        return transition_params_trained
```
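For completeness, here is a minimal sketch of how the class above could be wired into a training step. The parameter keys match the `parameters[...]` lookups in the code; the `dataset` object, the feed values, and the hyperparameter values are simplified stand-ins for whatever preprocessing pipeline produced the corpora listed earlier, so treat this as illustrative rather than as the original training script.

```python
# Illustrative only: SimpleNamespace stands in for the real dataset object and the
# index values below are dummies; a real pipeline must supply these fields.
from types import SimpleNamespace
import tensorflow as tf

parameters = {
    'use_character_lstm': True, 'character_embedding_dimension': 25,
    'character_lstm_hidden_state_dimension': 25, 'token_embedding_dimension': 100,
    'token_lstm_hidden_state_dimension': 100, 'freeze_token_embeddings': False,
    'use_crf': True, 'optimizer': 'sgd', 'learning_rate': 0.005,
    'gradient_clipping_value': 5.0, 'maximum_number_of_epochs': 100,
}
dataset = SimpleNamespace(number_of_classes=3, vocabulary_size=10000, alphabet_size=100)

model = EntityLSTM(dataset, parameters)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One training step on a single toy sentence of 4 tokens, each padded to 6 characters.
    feed_dict = {
        model.input_token_indices: [12, 7, 954, 3],
        model.input_token_character_indices: [[1, 2, 3, 0, 0, 0]] * 4,
        model.input_token_lengths: [3, 3, 3, 3],
        model.input_label_indices_flat: [0, 1, 2, 0],
        model.input_label_indices_vector: [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]],
        model.dropout_keep_prob: 0.5,
    }
    _, loss = sess.run([model.train_op, model.loss], feed_dict=feed_dict)
    print('loss after one step: {0}'.format(loss))
```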
Results analysis:
| Entity type | Precision (%), B | Precision (%), TL | Recall (%), B | Recall (%), TL | F1-score (%), B | F1-score (%), TL |
|---|---|---|---|---|---|---|
| Chemicals | 87.10 | 87.05 | 89.19 | 89.47 | 88.08 | 88.21 |
| Diseases | 80.41 | 81.41 | 81.13 | 82.46 | 80.73 | 82.09 |
| Species | 84.18 | 84.52 | 84.44 | 90.12 | 84.20 | 87.01 |
| Genes/proteins | 82.09 | 83.38 | 80.85 | 83.08 | 81.20 | 83.09 |
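The B and TL columns are not expanded in the post; given the `restore_from_pretrained_model` logic above, they presumably denote a baseline trained from scratch and a run initialized by transfer learning from a pretrained model. As a reference for reading the table, precision, recall, and F1 are related as follows (a small helper for illustration, not part of the original code):

```python
def precision_recall_f1(true_positives, false_positives, false_negatives):
    """Entity-level precision/recall/F1 from raw counts, returned in percent."""
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * precision * recall / (precision + recall)
    return 100 * precision, 100 * recall, 100 * f1

# Example: 900 correctly predicted entities, 120 spurious, 110 missed.
print(precision_recall_f1(900, 120, 110))  # approximately (88.2, 89.1, 88.7)
```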