Word Sense Disambiguation

  Word sense disambiguation (WSD) is one of the core problems in NLP. Meaning at the word, sentence, and discourse levels can all change with the surrounding context, and disambiguation is the process of determining the intended meaning of an object from that context. WSD is semantic disambiguation at the word level, and it is applied in search engines, opinion mining, text understanding and generation, and inference.

I. Common Algorithms

1. Supervised learning

  a. Determine the word list and the sense inventory. For example, the target word "bass" has two senses: a musical instrument (the bass) and a kind of fish (the sea bass);

  b. Collect a corpus, e.g. from Google or Baidu;

  c. Feature extraction: usually fix a context window and consider only the words inside it (see the small sketch after this list);

  d. Classifier selection: naive Bayes, logistic regression, SVM, KNN, neural networks.
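
  As an illustration of step c, a minimal sketch of extracting a fixed context window around the target word (the window size of 2 and the example sentence are arbitrary choices for this sketch):

def context_window(tokens, target, size=2):
    '''Return the words within `size` positions to the left and right of the target word.'''
    idx = tokens.index(target)
    return tokens[max(0, idx - size):idx] + tokens[idx + 1:idx + 1 + size]

print(context_window(['he', 'plays', 'the', 'bass', 'in', 'a', 'band'], 'bass'))
# ['plays', 'the', 'in', 'a']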

For example: disambiguation with a Bayesian classifier

  The meaning of any polysemous word depends on its context. Let the context be c and the sense be s; then by Bayes' rule

P(s|c) = P(c|s) * P(s) / P(c)

Since P(c) is the same for every candidate sense, we choose the sense that maximizes P(c|s) * P(s), and both factors can be estimated by counting over a large sense-labeled corpus.
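
  For illustration, a minimal sketch of such a naive Bayes disambiguator on a tiny hand-labeled toy corpus; the example contexts, sense labels, and add-one smoothing are all invented purely for this sketch:

# Naive Bayes word sense disambiguation on a toy corpus.
# Each training item is (context words, sense label); all data below is invented.
from collections import Counter, defaultdict
import math

train = [
    (['play', 'guitar', 'and'], 'bass_instrument'),
    (['electric', 'play', 'band'], 'bass_instrument'),
    (['caught', 'fish', 'lake'], 'bass_fish'),
    (['fresh', 'fish', 'dinner'], 'bass_fish'),
]

sense_counts = Counter(sense for _, sense in train)
word_counts = defaultdict(Counter)   # sense -> word -> count
vocab = set()
for words, sense in train:
    word_counts[sense].update(words)
    vocab.update(words)

def classify(context_words):
    '''Return the sense s maximizing log P(s) + sum log P(w|s), with add-one smoothing.'''
    best_sense, best_score = None, float('-inf')
    for sense, s_count in sense_counts.items():
        score = math.log(s_count / len(train))
        total = sum(word_counts[sense].values())
        for w in context_words:
            score += math.log((word_counts[sense][w] + 1) / (total + len(vocab)))
        if score > best_score:
            best_sense, best_score = sense, score
    return best_sense

print(classify(['play', 'the', 'band']))   # expected: bass_instrument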

2. Semi-supervised learning

  When there is not enough labeled data for the target word, start from a small amount of hand-labeled data and expand it based on how frequently different words co-occur with each sense. For example, the 'sea bass' sense of bass usually co-occurs with fish, while the 'instrument' sense usually co-occurs with play, so all sentences containing <fish, bass> or <play, bass> can be labeled automatically; a sketch of this idea follows.
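
  A rough sketch of this seed-and-expand labeling (in the spirit of the Yarowsky algorithm); the seed collocations, example sentences, and frequency threshold are all invented for illustration:

# Yarowsky-style bootstrapping: label sentences via seed collocations,
# then promote new strongly-associated words to additional labeling rules.
from collections import Counter

seeds = {'fish': 'bass_fish', 'play': 'bass_instrument'}   # seed collocations (assumed)
sentences = [                                               # unlabeled sentences (invented)
    'he likes to play the bass on stage',
    'the bass guitar player joined the band',
    'we caught a huge bass fish in the lake',
    'grilled bass fish tastes great with lemon',
]

labeled, unlabeled = {}, []
for sent in sentences:
    words = sent.split()
    hits = {seeds[w] for w in words if w in seeds}
    if len(hits) == 1:                      # label only unambiguous seed matches
        labeled[sent] = hits.pop()
    else:
        unlabeled.append(sent)

# Expand: count which other words co-occur with each sense and promote frequent
# ones to new rules (threshold of 1 here is purely illustrative; in practice one
# would filter stop words and require a much higher count).
cooc = Counter()
for sent, sense in labeled.items():
    for w in sent.split():
        if w not in seeds and w != 'bass':
            cooc[(w, sense)] += 1
new_rules = {w: sense for (w, sense), c in cooc.items() if c >= 1}
print(labeled)
print(new_rules)    # e.g. {'stage': 'bass_instrument', 'lake': 'bass_fish', ...}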

3. Unsupervised learning

  One option is a Bayesian classifier whose parameters are not estimated from labeled training data: the parameters p(v|s) are initialized randomly, the EM algorithm re-estimates the probabilities, p(c|s) is computed for each sense and context, and the procedure is iterated until a final model is obtained; the resulting sense clusters can then be compared with cosine similarity. A sketch follows.
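
  A small sketch of such an EM procedure for a naive Bayes mixture over bag-of-words contexts; the toy contexts, the number of senses K, and the smoothing constant alpha are assumptions made for illustration:

# EM for an unsupervised naive Bayes sense mixture over bag-of-words contexts.
import numpy as np

contexts = [                       # invented example contexts for "bass"
    ['play', 'guitar', 'band'],
    ['band', 'play', 'stage'],
    ['fish', 'lake', 'caught'],
    ['fish', 'dinner', 'grilled'],
]
vocab = sorted({w for c in contexts for w in c})
V, K, alpha = len(vocab), 2, 0.1
counts = np.zeros((len(contexts), V))          # bag-of-words counts per context
for i, c in enumerate(contexts):
    for w in c:
        counts[i, vocab.index(w)] += 1

rng = np.random.default_rng(0)
p_s = np.full(K, 1.0 / K)                      # P(s)
p_v_s = rng.dirichlet(np.ones(V), size=K)      # P(v|s), randomly initialized

for _ in range(50):
    # E-step: responsibilities r[i, k] proportional to P(s=k) * prod_v P(v|k)^count
    log_r = np.log(p_s) + counts @ np.log(p_v_s).T
    log_r -= log_r.max(axis=1, keepdims=True)
    r = np.exp(log_r)
    r /= r.sum(axis=1, keepdims=True)
    # M-step: re-estimate P(s) and P(v|s) from the soft assignments, with smoothing
    p_s = (r.sum(axis=0) + alpha) / (len(contexts) + K * alpha)
    p_v_s = r.T @ counts + alpha
    p_v_s /= p_v_s.sum(axis=1, keepdims=True)

print(r.argmax(axis=1))    # sense cluster per context, e.g. [0 0 1 1] (ids are arbitrary)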

4. Other methods

  Methods based on semantic role labeling or dependency parsing can also give fairly good results on some problems.

II. The Algorithm Described in This Post

  The word to be disambiguated is looked up on Baidu Baike to obtain its candidate senses; the sentence to be disambiguated is then compared for similarity with the description text of each sense, and the most similar sense is taken as the meaning of the word in that sentence. The approach has a flavor of distant supervision.


The full code is as follows:

import os
from urllib import request
from lxml import etree
from urllib import parse
import jieba.posseg as pseg
import jieba.analyse as anse
import numpy as np

embedding_size = 300
embedding_path = r"D:\workspace\project\NLPcase\mutilSenmanticWord\data\wrod_vec_300.bin"
sim_limit = 0.8

def get_html(url):
    '''Fetch the html page for the given url'''
    return request.urlopen(url).read().decode('utf-8').replace('&nbsp', '')

def collect_mutilsens(word):
    '''Query Baidu Baike for the word and collect its sense entries and their links'''
    # parse.quote encodes special characters so the word forms a legal url string
    url = "http://baike.baidu.com/item/%s?force=1" % parse.quote(word)
    html = get_html(url)
    selector = etree.HTML(html)
    sens = [''.join(i.split(':')) for i in selector.xpath('//li[@class="list-dot list-dot-paddingleft"]/div/a/text()')]
    sens_link = ['http://baike.baidu.com' + i for i in selector.xpath('//li[@class="list-dot list-dot-paddingleft"]/div/a/@href')]
    sens_dict = {sens[i]: sens_link[i] for i in range(len(sens))}
    return sens_dict

def extract_concept(desc):
    '''Extract the head concept word from a sense description'''
    desc_seg = [[i.word, i.flag] for i in pseg.cut(desc)]
    concepts_candi = [i[0] for i in desc_seg if i[1] in ['n', 'b', 'v', 'd']]
    # fall back to the raw description if no candidate word is found
    return concepts_candi[-1] if concepts_candi else desc

def entity_clusters(s):
    '''Cluster entities that share a connecting edge (non-empty intersection)'''
    clusters = []
    for i in range(len(s)):
        cluster = list(s[i])   # copy so that extending the cluster does not modify s[i]
        for j in range(len(s)):
            if set(s[i]).intersection(set(s[j])) and set(s[i]).intersection(set(cluster)) and set(
                    s[j]).intersection(set(cluster)):
                cluster += s[i]
                cluster += s[j]
        if set(cluster) not in clusters:
            clusters.append(set(cluster))

    return clusters

def similarity_cosine(vector1, vector2):
    '''Cosine similarity between two vectors; returns 0.0 for zero vectors'''
    cos1 = np.sum(vector1 * vector2)
    cos21 = np.sqrt(np.sum(vector1 ** 2))
    cos22 = np.sqrt(np.sum(vector2 ** 2))
    if cos21 * cos22 == 0:
        return 0.0
    return cos1 / (cos21 * cos22)

def get_wordvector(word):
    '''Look up the vector of a single word; unknown words map to a zero vector'''
    return np.array(embedding_dict.get(word, [0] * embedding_size))

def load_embedding(embedding_path):
    '''Load pretrained word vectors from a text-format embedding file'''
    embedding_dict = {}
    count = 0
    for line in open(embedding_path):
        line = line.strip().split(' ')
        if len(line) < 300:
            continue
        wd = line[0]
        vector = np.array([float(i) for i in line[1:]])
        embedding_dict[wd] = vector
        count += 1
        if count % 10000 == 0:
            print(count, 'loaded')
    print('loaded %s word embeddings, finished' % count)
    return embedding_dict

embedding_dict = load_embedding(embedding_path)

def concept_cluster(concept_dict):
    '''Cluster the candidate senses of a word by word-vector similarity'''
    sens_list = []
    cluster_sens_dict = {}
    for sen1 in concept_dict:
        sen1_list = [sen1]
        for sen2 in concept_dict:
            if sen1 == sen2:
                continue
            sim_score = similarity_cosine(get_wordvector(sen1), get_wordvector(sen2))
            if sim_score >= sim_limit:
                sen1_list.append(sen2)
        sens_list.append(sen1_list)
    sens_clusters = entity_clusters(sens_list)
    # keep one representative sense per cluster, together with its links
    for sens in sens_clusters:
        symbol_sen = list(sens)[0]
        cluster_sens_dict[symbol_sen] = concept_dict[symbol_sen]
    return cluster_sens_dict

def extract_desc(link):
    '''Fetch the description and keywords of a sense page, used as that sense's meaning description'''
    html = get_html(link)
    selector = etree.HTML(html)
    keywords = selector.xpath('//meta[@name="keywords"]/@content')
    desc = selector.xpath('//meta[@name="description"]/@content')
    return desc, keywords


def collect_concepts(wd):
    '''Main routine: collect the candidate senses of a polysemous word'''
    sens_dict = collect_mutilsens(wd)
    if not sens_dict:
        return {}
    concepts_dict = {}
    concept_dict = {}
    for sen, link in sens_dict.items():
        concept = extract_concept(sen)
        if concept not in concept_dict:
            concept_dict[concept] = [link]
        else:
            concept_dict[concept].append(link)
    # cluster the extracted concepts
    cluster_concept_dict = concept_cluster(concept_dict)
    for concept, links in cluster_concept_dict.items():
        # fetch the page of the corresponding sense and build its description
        link = links[0]
        desc, keywords = extract_desc(link)
        # both xpath results are lists of strings; merge them into one description text
        context = ''.join(desc + [' '] + keywords)
        concepts_dict[concept] = context
    return concepts_dict
#------------------------------------------ sentence-level semantic representation ----------------------
# Extract keywords from the sense description as a structured representation of that sense
def extract_keywords(sent):
    keywords = [i for i in anse.extract_tags(sent, topK=20, withWeight=False)]  # jieba keyword extraction
    return keywords
# Represent a sentence via word2vec by looking up and summing its keyword vectors (lookup table)
def rep_sentVector(sent):
    word_list = extract_keywords(sent)
    embedding = np.zeros(embedding_size)
    sent_len = 0
    for index, wd in enumerate(word_list):
        if wd in embedding_dict:
            embedding += embedding_dict.get(wd)  # represent the sentence by summing word vectors
            sent_len += 1
        else:
            continue
    # avoid division by zero when none of the keywords has a vector
    return embedding / sent_len if sent_len else embedding
# Sentence similarity based on pairwise word similarity
def distance_words(sent1, sent2):
    wds1 = extract_keywords(sent1)
    wds2 = extract_keywords(sent2)
    if not wds1 or not wds2:
        return 0.0
    score_wds1 = []
    score_wds2 = []
    for word1 in wds1:
        score = max([similarity_cosine(get_wordvector(word1), get_wordvector(word2)) for word2 in wds2])
        score_wds1.append(score)
    for word2 in wds2:
        score = max([similarity_cosine(get_wordvector(word2), get_wordvector(word1)) for word1 in wds1])
        score_wds2.append(score)
    sim_score = max(sum(score_wds1) / len(wds1), sum(score_wds2) / len(wds2))
    return sim_score

#----------------------- disambiguate the target word ---------------
def detect_main(sent, word):
    sent = sent.replace(word, '')
    # collect the candidate senses of the polysemous word
    concept_dict = collect_concepts(word)
    # represent the sentence to be disambiguated
    sent_vector = rep_sentVector(sent)
    concept_scores_sent = {}
    concept_scores_wds = {}
    for concept, desc in concept_dict.items():
        concept_vector = rep_sentVector(desc)
        similarity_sent = similarity_cosine(sent_vector, concept_vector)
        concept_scores_sent[concept] = similarity_sent
        similarity_wds = distance_words(desc, sent)  # an alternative similarity measure
        concept_scores_wds[concept] = similarity_wds
    concept_scores_sent = sorted(concept_scores_sent.items(), key=lambda asd: asd[1], reverse=True)
    concept_scores_wds = sorted(concept_scores_wds.items(), key=lambda asd: asd[1], reverse=True)
    return concept_scores_wds, concept_scores_sent
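
A possible way to call the code above (the example sentence and target word are invented for illustration; running it requires network access to Baidu Baike and the embedding file configured at the top):

if __name__ == '__main__':
    # invented example: disambiguate "苹果" (Apple the company vs. the fruit) in a sentence
    sent = '苹果发布了新一代手机'
    word = '苹果'
    scores_wds, scores_sent = detect_main(sent, word)
    print(scores_wds)    # senses ranked by word-level similarity
    print(scores_sent)   # senses ranked by sentence-vector similarity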

III. References

  https://blog.****.net/liguochao1001/article/details/86596183

  http://www.xjishu.com/zhuanli/55/201810179896.html

  https://blog.****.net/weixin_38776853/article/details/79522149

  https://blog.****.net/Uwr44UOuQcNsUQb60zk2/article/details/81074410

  https://github.com/liuhuanyong/WordMultiSenseDisambiguation/blob/master/wordsense_detect.py