自然语言处理与知识图谱week7 | 情感分析

题目

问题

自然语言处理与知识图谱week7 | 情感分析

答案

自然语言处理与知识图谱week7 | 情感分析

代码

#!/usr/bin/python3
#-*-coding:GBK -*-
from nltk.probability import FreqDist

# Training corpus: one short review snippet per list entry, grouped by
# polarity. Trailing spaces inside the literals are part of the data.
neg_str = [
    'just plain boring ',
    'entirely predictable and lacks energy ',
    'no surprises and very few laughs',
]
pos_str = [
    'very powerful ',
    'the most fun film of the summer',
]

# Each class flattened into a single space-separated string so it can be
# tokenized with str.split().
connect_neg_str = ' '.join(neg_str)
connect_pos_str = ' '.join(pos_str)

def count_freq(words):
    """Count how often each whitespace-separated token occurs in *words*.

    Args:
        words: Text to tokenize with ``str.split()``.

    Returns:
        A list of ``(token, count)`` tuples sorted by descending count, e.g.
        ``[('the', 2), ('very', 1), ...]``. Every distinct token is included;
        tokens with equal counts keep the order in which they first appeared.
    """
    # Function-scope import so the block is self-contained; plain token
    # counting needs nothing beyond the standard library.
    from collections import Counter

    # No limit is passed to most_common(): callers take len() of the result
    # as the vocabulary size, so truncating to a fixed number of entries
    # would undercount the vocabulary and drop words from the tables.
    return Counter(words.split()).most_common()

def count_len(words):
    """Return the number of whitespace-separated tokens in *words*.

    Repeated tokens are counted every time they occur.
    """
    return len(words.split())

def get_V_n_P():
    """Compute and print the corpus statistics needed by the classifier.

    Reads the module-level training data (``neg_str``, ``pos_str``,
    ``connect_neg_str``, ``connect_pos_str``) and prints each statistic.

    Returns:
        A tuple ``(V, n_neg, n_pos, p_neg, p_pos)`` where
        ``V`` is the number of distinct tokens reported by ``count_freq``
        over both classes combined, ``n_neg`` / ``n_pos`` are the token
        counts (duplicates included) of each class, and ``p_neg`` /
        ``p_pos`` are the class priors (sentences in the class divided by
        the total number of sentences).
    """
    # Vocabulary size over the whole training text, then per-class token totals.
    whole_corpus = connect_neg_str + ' ' + connect_pos_str
    V = len(count_freq(whole_corpus))
    n_neg = count_len(connect_neg_str)
    n_pos = count_len(connect_pos_str)

    print('|V| = %d'%V, end = ", ")
    print('n- = %d'%n_neg, end = ", ")
    print('n+ = %d'%n_pos)

    # Class priors: number of sentences in the class / total sentences.
    neg_sentences = len(neg_str)
    pos_sentences = len(pos_str)
    str_num = neg_sentences + pos_sentences
    p_neg = neg_sentences / str_num
    p_pos = pos_sentences / str_num

    print('P(-) = %d/%d'%(neg_sentences, str_num), end = ", ")
    print('P(+) = %d/%d'%(pos_sentences, str_num))

    return V, n_neg, n_pos, p_neg, p_pos

def print_p_words(p_words, label='-'):
    """Print a table of word likelihoods, three entries per row.

    Args:
        p_words: Mapping of word -> ``[numerator, denominator]`` (integers).
        label: Class symbol shown after the bar in ``P(word|label)``.
            Defaults to ``'-'``, which reproduces the previous fixed output.
    """
    for i, (key, value) in enumerate(p_words.items()):
        # Start a new row after every third entry (never before the first).
        if i % 3 == 0 and i != 0:
            print(" ")
        print('P(%s|%s) = %d/%d' % (key, label, value[0], value[1]), end=" \t")
    print("\n=================")


def get_p_words(V, n_neg, n_pos):
    """Build and print add-one smoothed word likelihoods for both classes.

    For every word w seen in either class of the training text:

        P(w | c) = (C(w, c) + 1) / (n_c + V)

    where C(w, c) is the number of times w occurs in class c.

    Args:
        V: Vocabulary size of the training text.
        n_neg: Number of tokens in the negative text (duplicates included).
        n_pos: Number of tokens in the positive text (duplicates included).

    Returns:
        ``(p_neg_words, p_pos_words)``: two dicts mapping each word to a
        ``[numerator, denominator]`` pair. Both dicts end up with an entry
        for every word that occurs in either class.
    """
    p_neg_words = {}
    p_pos_words = {}

    # e.g. [('and', 2), ('just', 1), ...]
    for word, count in count_freq(connect_neg_str):
        p_neg_words[word] = [count + 1, n_neg + V]
        # Provisional zero-count entry; replaced below if the word also
        # occurs in the positive text.
        p_pos_words[word] = [0 + 1, n_pos + V]

    # e.g. [('the', 2), ('very', 1), ...]
    for word, count in count_freq(connect_pos_str):
        p_pos_words[word] = [count + 1, n_pos + V]
        if word not in p_neg_words:
            # Word never seen in the negative text: smoothed zero count.
            p_neg_words[word] = [0 + 1, n_neg + V]

    # Label each table with its own class so the positive likelihoods are
    # not printed as P(w|-).
    print_p_words(p_neg_words, '-')
    print_p_words(p_pos_words, '+')

    return p_neg_words, p_pos_words

def test(test_str, p_neg, p_pos, p_neg_words, p_pos_words):
    """Score *test_str* under both classes and print the scores and verdict.

    Implements the naive Bayes decision rule

        c_NB = argmax_c  P(c) * prod_{i in positions} P(w_i | c)

    Args:
        test_str: Sentence to classify; tokenized with ``str.split()``.
        p_neg: Prior of the negative class.
        p_pos: Prior of the positive class.
        p_neg_words: Mapping word -> ``[numerator, denominator]`` for the
            negative class.
        p_pos_words: Same mapping for the positive class.

    Returns:
        None. The two scores and the chosen class are printed.
    """
    neg = p_neg
    pos = p_pos
    # Walk the sentence position by position so a word that occurs several
    # times contributes its likelihood once per occurrence, as the product
    # over positions requires.
    for word in test_str.split():
        # Words absent from the likelihood table are skipped for both classes.
        if word in p_neg_words:
            neg *= p_neg_words[word][0] / p_neg_words[word][1]
            pos *= p_pos_words[word][0] / p_pos_words[word][1]

    # Report the sentence that was actually scored rather than a fixed one.
    print('P(-|"%s") = %.6f' % (test_str, neg))
    print('P(+|"%s") = %.6f' % (test_str, pos))

    if neg > pos:
        print('P(-|"%s") is greater, '
              'so the test set sentence is classified as class negative.'
              % test_str)
    else:
        print('P(+|"%s") is greater, '
              'so the test set sentence is classified as class positive.'
              % test_str)

if __name__ == '__main__':
    # 1) Corpus statistics: vocabulary size, per-class token counts, priors.
    stats = get_V_n_P()
    V, n_neg, n_pos, p_neg, p_pos = stats
    print("=================")

    # 2) Smoothed per-word likelihood tables for both classes.
    likelihoods = get_p_words(V, n_neg, n_pos)
    p_neg_words, p_pos_words = likelihoods

    # 3) Classify the held-out sentence.
    test_str = 'predictable with no originality'
    test(test_str, p_neg, p_pos, p_neg_words, p_pos_words)

其他问题

自然语言处理与知识图谱week7 | 情感分析

自然语言处理与知识图谱week7 | 情感分析

题目

问题

答案

代码

其他问题

相关推荐