题目

问题

答案

代码
#!/usr/bin/python3
#-*-coding:GBK -*-
from collections import Counter

from nltk.probability import FreqDist
# Training sentences for the negative class (trailing spaces are part of
# the data and are harmless because all tokenizing uses str.split()).
neg_str = [
    'just plain boring ',
    'entirely predictable and lacks energy ',
    'no surprises and very few laughs',
]
# Training sentences for the positive class.
pos_str = [
    'very powerful ',
    'the most fun film of the summer',
]
# Each class's sentences joined into a single space-separated text.
connect_neg_str, connect_pos_str = (
    ' '.join(sentences) for sentences in (neg_str, pos_str)
)
def count_freq(words, top_n=None):
    """Count the whitespace-separated tokens of ``words``.

    Args:
        words: Text to tokenize with ``str.split()``.
        top_n: Maximum number of entries to return. ``None`` (the default)
            returns every distinct token.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least
        frequent, e.g. ``[('the', 2), ('very', 1), ...]``. Tokens with the
        same count keep the order in which they were first seen.
    """
    # Callers use len() of this list as the vocabulary size, so the default
    # must return all distinct tokens rather than a truncated "top" list.
    return Counter(words.split()).most_common(top_n)
def count_len(words):
    """Return the number of whitespace-separated tokens in ``words``.

    Repeated tokens are counted once per occurrence.
    """
    return len(words.split())
def get_V_n_P():
    """Compute and print the corpus statistics used by the classifier.

    Reads the module-level training data: the sentence lists ``neg_str`` and
    ``pos_str`` and their joined texts ``connect_neg_str`` and
    ``connect_pos_str``.

    Returns:
        A tuple ``(V, n_neg, n_pos, p_neg, p_pos)``:
        ``V`` is the number of distinct tokens in the whole training text,
        ``n_neg`` / ``n_pos`` are the token counts of each class (repeats
        included), and ``p_neg`` / ``p_pos`` are the class priors (number of
        sentences in the class divided by the total number of sentences).
    """
    # Vocabulary size of the current training text. Counting the distinct
    # tokens directly keeps V exact no matter how large the vocabulary is.
    V = len(set((connect_neg_str + ' ' + connect_pos_str).split()))
    print('|V| = %d' % V, end=", ")
    # Number of tokens in the negative sentences (repeats counted).
    n_neg = count_len(connect_neg_str)
    print('n- = %d' % n_neg, end=", ")
    # Number of tokens in the positive sentences (repeats counted).
    n_pos = count_len(connect_pos_str)
    print('n+ = %d' % n_pos)
    # Prior of each polarity: sentences of that class / all sentences.
    str_num = len(neg_str) + len(pos_str)
    p_neg = len(neg_str) / str_num
    print('P(-) = %d/%d' % (len(neg_str), str_num), end=", ")
    p_pos = len(pos_str) / str_num
    print('P(+) = %d/%d' % (len(pos_str), str_num))
    return V, n_neg, n_pos, p_neg, p_pos
def print_p_words(p_words, label='-'):
    """Print a table of smoothed word likelihoods, three entries per row.

    Args:
        p_words: Mapping ``word -> [numerator, denominator]``.
        label: Class symbol shown after the bar in ``P(word|label)``.
            Defaults to ``'-'`` so existing one-argument calls print exactly
            what they did before.
    """
    for index, (word, fraction) in enumerate(p_words.items()):
        # Break the row before every third entry (but not before the first).
        if index and index % 3 == 0:
            print(" ")
        print('P(%s|%s) = %d/%d' % (word, label, fraction[0], fraction[1]),
              end=" \t")
    print("\n=================")


def get_p_words(V, n_neg, n_pos):
    """Build and print the add-one smoothed word likelihoods of both classes.

    For each class c the likelihood of word w is
    ``(C(w, c) + 1) / (n_c + V)``, where ``C(w, c)`` is the number of times w
    occurs in the training text of class c (read from the module-level
    ``connect_neg_str`` / ``connect_pos_str``).

    Args:
        V: Vocabulary size of the training text.
        n_neg: Number of tokens in the negative training text.
        n_pos: Number of tokens in the positive training text.

    Returns:
        A tuple ``(p_neg_words, p_pos_words)``. Both are dicts mapping every
        training word to ``[numerator, denominator]``; a word that never
        occurs in a class gets numerator ``1`` there.
    """
    neg_counts = dict(count_freq(connect_neg_str))  # {'and': 2, 'just': 1, ...}
    pos_counts = dict(count_freq(connect_pos_str))  # {'the': 2, 'very': 1, ...}
    # Both tables share one key order: the negative-text words first, then
    # the words that appear only in the positive text.
    vocabulary = list(neg_counts)
    vocabulary += [word for word in pos_counts if word not in neg_counts]
    p_neg_words = {
        word: [neg_counts.get(word, 0) + 1, n_neg + V] for word in vocabulary
    }
    p_pos_words = {
        word: [pos_counts.get(word, 0) + 1, n_pos + V] for word in vocabulary
    }
    print_p_words(p_neg_words, '-')
    # Label the positive table as P(w|+) rather than reusing the '-' label.
    print_p_words(p_pos_words, '+')
    return p_neg_words, p_pos_words
def test(test_str, p_neg, p_pos, p_neg_words, p_pos_words):
    """Score ``test_str`` against both classes and print the decision.

    Each class score is ``prior * prod_i P(w_i | class)`` taken over the
    token positions of ``test_str``. The class with the larger score wins;
    a tie is reported as positive.

    Args:
        test_str: Sentence to classify; tokenized on whitespace.
        p_neg: Prior of the negative class.
        p_pos: Prior of the positive class.
        p_neg_words: Mapping ``word -> [numerator, denominator]`` of the
            negative-class likelihoods.
        p_pos_words: Same mapping for the positive class.
    """
    neg = p_neg
    pos = p_pos
    for word, count in count_freq(test_str):
        # A word missing from the likelihood tables contributes no factor.
        if word not in p_neg_words or word not in p_pos_words:
            continue
        neg_fraction = p_neg_words[word]
        pos_fraction = p_pos_words[word]
        # The product runs over token positions, so a word that occurs
        # `count` times contributes its likelihood `count` times.
        neg *= (neg_fraction[0] / neg_fraction[1]) ** count
        pos *= (pos_fraction[0] / pos_fraction[1]) ** count
    # Report the sentence that was actually scored.
    print('P(-|"%s") = %.6f' % (test_str, neg))
    print('P(+|"%s") = %.6f' % (test_str, pos))
    if neg > pos:
        print('P(-|"%s") is greater, '
              'so the test set sentence is classified as class negative.'
              % test_str)
    else:
        print('P(+|"%s") is greater, '
              'so the test set sentence is classified as class positive.'
              % test_str)
if __name__ == '__main__':
    # Sentence to classify once the model has been built.
    test_str = 'predictable with no originality'
    # Stage 1: vocabulary size, per-class token counts and class priors.
    V, n_neg, n_pos, p_neg, p_pos = get_V_n_P()
    print("=================")
    # Stage 2: smoothed word likelihoods for both classes.
    p_neg_words, p_pos_words = get_p_words(V, n_neg, n_pos)
    # Stage 3: score the sentence and print the decision.
    test(test_str, p_neg, p_pos, p_neg_words, p_pos_words)
其他问题



