NLP--day(5) (Naive Bayes)
Bayes' Theorem

$$P(Y \mid X) = \frac{P(X \mid Y)\,P(Y)}{P(X)}$$

Bayesian Model Description

Given
Suppose the samples for our classification model are

$$\left(x_1^{(1)}, x_2^{(1)}, \ldots, x_n^{(1)}, y_1\right),\ \left(x_1^{(2)}, \ldots, x_n^{(2)}, y_2\right),\ \ldots,\ \left(x_1^{(m)}, \ldots, x_n^{(m)}, y_m\right)$$

i.e. there are m samples, each sample has n features, and the output takes one of K classes, defined as $C_1, C_2, \ldots, C_K$.

Goal
Given the conditions above, we want the Bayesian model to take a new sample $x^{(\text{test})} = \left(x_1^{(\text{test})}, \ldots, x_n^{(\text{test})}\right)$, decide its class by maximizing the posterior probability, and predict $y^{(\text{test})}$.

Inference
From the requirements above, Bayes' formula gives the posterior

$$P\left(Y = C_k \mid X = x^{(\text{test})}\right) = \frac{P\left(X = x^{(\text{test})} \mid Y = C_k\right) P(Y = C_k)}{P\left(X = x^{(\text{test})}\right)}$$

The "naive" assumption is that the features are conditionally independent given the class:

$$P\left(X = x^{(\text{test})} \mid Y = C_k\right) = \prod_{j=1}^{n} P\left(X_j = x_j^{(\text{test})} \mid Y = C_k\right)$$

Since the denominator is identical for every class, $y^{(\text{test})}$ is the class that maximizes the posterior, with the mathematical expression:

$$y^{(\text{test})} = \arg\max_{C_k}\; P(Y = C_k) \prod_{j=1}^{n} P\left(X_j = x_j^{(\text{test})} \mid Y = C_k\right)$$
Algorithm
1. Estimate the prior of each class from the training set, with Laplace smoothing: $P(Y = C_k) = \dfrac{m_k + \lambda}{m + K\lambda}$, where $m_k$ is the number of training samples in class $C_k$.
2. Estimate the conditional probability of each feature value given each class, smoothed the same way.
3. For a test sample, compute $P(Y = C_k)\prod_{j=1}^{n} P\left(X_j = x_j^{(\text{test})} \mid Y = C_k\right)$ for every class and output the class with the largest value.
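To make these steps concrete, here is a minimal from-scratch sketch of multinomial naive Bayes on toy word-count data, assuming Laplace smoothing with lambda = 1 (all names and numbers here are illustrative, not from the text above):

import numpy as np

# toy data: 4 documents x 3 word-count features, 2 classes
X = np.array([[2, 1, 0],
              [3, 0, 0],
              [0, 2, 3],
              [0, 1, 4]])
y = np.array([0, 0, 1, 1])
K, lam = 2, 1.0  # number of classes, Laplace smoothing constant

log_prior = np.log(np.bincount(y) / len(y))  # log P(Y = C_k)
log_cond = np.zeros((K, X.shape[1]))
for k in range(K):
    counts = X[y == k].sum(axis=0)  # total count of each word within class k
    log_cond[k] = np.log((counts + lam) / (counts.sum() + lam * X.shape[1]))

x_new = np.array([1, 0, 3])
scores = log_prior + x_new @ log_cond.T  # log P(C_k) + sum_j x_j * log P(w_j | C_k)
print(scores.argmax())  # predicted class: 1

Working in log space avoids underflow from multiplying many small probabilities; this is also what sklearn's MultinomialNB, used below, does internally.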
Pros and Cons of Naive Bayes
Pros
- The algorithm logic is simple and easy to implement;
- Time and space overhead during classification is small (under the feature-independence assumption, only two-dimensional count tables are involved).
Cons
In theory, naive Bayes has the lowest error rate of any classifier. In practice this is often not the case, because the model assumes the features are mutually independent, and that assumption rarely holds in real applications; when there are many features, or strong correlations between them, classification quality suffers.
Applicable data type: nominal (categorical) data.
Bayesian Text Classification
import random
import pandas as pd
import numpy as np
import jieba
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
with open("./cnews.train.txt", 'r', encoding="utf-8") as file:
train = file.readlines()
with open("./cnews.test.txt", 'r', encoding="utf-8") as file:
test = file.readlines()
def exactua(data):
    random.shuffle(data)  # shuffle the dataset in place
    x_data = []
    y_data = []
    for line in data:
        label, text = line.rstrip('\n').split('\t', 1)  # each line is "label\ttext"
        x_data.append(text)
        y_data.append(label)
    return x_data, y_data
x_train_data, y_train_data = exactua(train)
x_test_data, y_test_data = exactua(test)
# keep 2,000 training samples and 200 test samples
x_train_data, y_train_data = x_train_data[:2000], y_train_data[:2000]
x_test_data, y_test_data = x_test_data[:200], y_test_data[:200]
def stopwordslist():
    # one stop word per line; read with an explicit encoding for Chinese text
    stopwords = []
    with open('stopwords.txt', 'r', encoding='utf-8') as fr:
        for line in fr:
            stopwords.append(line.rstrip('\n'))
    return stopwords
stopwords = stopwordslist()
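jieba.cut performs the Chinese word segmentation used below; it returns a generator of tokens, for example:

import jieba

print(' '.join(jieba.cut('我爱自然语言处理')))  # expected: 我 爱 自然语言 处理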
def toke_stopwords(x_train_data, x_test_data):
    # tokenize with jieba and drop stop words
    train_lists = []
    for i in x_train_data:
        word_list = [word for word in jieba.cut(i) if word not in stopwords]
        train_lists.append(' '.join(word_list))
    test_lists = []
    for i in x_test_data:
        word_list = [word for word in jieba.cut(i) if word not in stopwords]
        test_lists.append(' '.join(word_list))
    return train_lists, test_lists
x_train, x_test = toke_stopwords(x_train_data, x_test_data)
len(x_train)
# len(x_test)
2000
def data2vec(x_train, x_test):
    from sklearn.feature_extraction.text import CountVectorizer
    count_vec = CountVectorizer(stop_words=stopwords)  # build the bag-of-words vocabulary
    X_count_train = count_vec.fit_transform(x_train)  # scipy.sparse.csr_matrix
    # turn the train and test counts into dense feature vectors
    X_count_train = X_count_train.toarray()
    X_count_test = count_vec.transform(x_test).toarray()
    return X_count_train, X_count_test, dict(count_vec.vocabulary_)
x_train, x_test, wordDic = data2vec(x_train, x_test)
F:\anaconda1\envs\baseline\lib\site-packages\sklearn\feature_extraction\text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['lex', '①①', '①②', '①③', '①④', '①⑤', '①⑥', '①⑦', '①⑧', '①⑨', '①a', '①b', '①c', '①d', '①e', '①f', '①g', '①h', '①i', '①o', '②①', '②②', '②③', '②④', '②⑤', '②⑥', '②⑦', '②⑧', '②⑩', '②a', '②b', '②d', '②e', '②f', '②g', '②h', '②i', '②j', '③①', '③⑩', '③a', '③b', '③c', '③d', '③e', '③f', '③g', '③h', '④a', '④b', '④c', '④d', '④e', '⑤a', '⑤b', '⑤d', '⑤e', '⑤f', '12', 'li', 'zxfitl'] not in stop_words.
'stop_words.' % sorted(inconsistent))
x_train.shape
(2000, 63169)
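A side note on memory: the dense array above is 2000 × 63169. MultinomialNB accepts scipy sparse matrices directly, so the .toarray() calls in data2vec could be dropped; a sketch of that variant (the name data2vec_sparse is hypothetical):

def data2vec_sparse(x_train, x_test):
    # identical to data2vec, but keeps the counts as a csr_matrix;
    # MultinomialNB.fit and .predict accept sparse input directly
    from sklearn.feature_extraction.text import CountVectorizer
    count_vec = CountVectorizer(stop_words=stopwords)
    X_count_train = count_vec.fit_transform(x_train)  # scipy.sparse.csr_matrix
    X_count_test = count_vec.transform(x_test)
    return X_count_train, X_count_test, dict(count_vec.vocabulary_)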
print('vocabulary:\n\n')
count = 0
for key, value in wordDic.items():
    print(key, value)
    count += 1
    if count > 10:
        break
vocabulary:
原作者 18243
认可 54166
山楂树 27207
张伟 29182
平称 28198
影片 29708
慢热 31183
新浪 35516
娱乐 24974
开拍 28961
上映 7058
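Each vocabulary entry maps a token to its column index in the count matrix, so a single word frequency can be looked up directly; for example, using the 影片 entry printed above:

col = wordDic['影片']    # column index 29708 in the run above
print(x_train[0, col])   # how many times 影片 occurs in the first training document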
def y2label(y_train, y_test):
    # map each class name to an integer id, based on the training labels
    dic = {}
    count = 0
    for i in set(y_train):
        dic[i] = count
        count += 1
    # use the parameters, not the globals, so the function is self-contained;
    # note: a test label unseen in training would map to None
    y_train, y_test = [dic.get(i) for i in y_train], [dic.get(i) for i in y_test]
    return y_train, y_test, dic
y_train, y_test, dicLabel = y2label(y_train_data, y_test_data)
y_train[:3]
[7, 3, 7]
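The same label-to-integer mapping can also be done with scikit-learn's LabelEncoder; a minimal equivalent sketch:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train_data)  # learn the class-to-id mapping on the training labels
y_test = le.transform(y_test_data)        # reuse the same mapping on the test labels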
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.01).fit(x_train, y_train)
# accuracy on the training set
result = clf.predict(x_train)
count = 0
for i in range(len(result)):
    if result[i] == y_train[i]:
        count += 1
print(count / len(x_train))
0.9985
# accuracy on the test set
result = clf.predict(x_test)
count = 0
for i in range(len(result)):
    if result[i] == y_test[i]:
        count += 1
print(count / len(x_test))
0.95
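The manual counting loops above compute plain accuracy; scikit-learn's built-in helpers give the same number more compactly:

from sklearn.metrics import accuracy_score

print(clf.score(x_train, y_train))                   # mean accuracy on the training set
print(accuracy_score(y_test, clf.predict(x_test)))   # same metric via sklearn.metrics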
# text classification combined with the TF-IDF algorithm
import numpy as np
import random
with open("./cnews.train.txt", 'r', encoding="utf-8") as file:
    train = file.readlines()
with open("./cnews.test.txt", 'r', encoding="utf-8") as file:
    test = file.readlines()
x_train_data, y_train_data = exactua(train)
x_test_data, y_test_data = exactua(test)
# 2,000 training samples
# 200 test samples
x_train_data, y_train_data = x_train_data[:2000], y_train_data[:2000]
x_test_data, y_test_data = x_test_data[:200], y_test_data[:200]
x_train, x_test = toke_stopwords(x_train_data, x_test_data)
y_train, y_test, dicLabel = y2label(y_train_data, y_test_data)
def data2vec(x_train, x_test):
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    count_vec = CountVectorizer(stop_words=stopwords)  # build the bag-of-words vocabulary
    transformer = TfidfTransformer()
    X_count_train = count_vec.fit_transform(x_train)
    X_count_test = count_vec.transform(x_test)
    # fit the IDF weights on the training counts only, then reuse them on the
    # test counts; refitting on the test set would leak test-set statistics
    X_train_tfidf = transformer.fit_transform(X_count_train).toarray()
    X_test_tfidf = transformer.transform(X_count_test).toarray()
    return X_train_tfidf, X_test_tfidf
x_train, x_test = data2vec(x_train, x_test)
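For reference, the CountVectorizer + TfidfTransformer pair can be collapsed into a single TfidfVectorizer; a minimal equivalent sketch, where x_train_texts / x_test_texts stand for the tokenized string lists returned by toke_stopwords:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(stop_words=stopwords)        # counts and IDF weighting in one step
X_train_tfidf = tfidf_vec.fit_transform(x_train_texts)   # fit the vocabulary and IDF on train
X_test_tfidf = tfidf_vec.transform(x_test_texts)         # reuse both on test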
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1).fit(x_train, y_train)
# accuracy on the training set
result = clf.predict(x_train)
count = 0
for i in range(len(result)):
    if result[i] == y_train[i]:
        count += 1
print(count / len(x_train))
0.9805
# accuracy on the test set
result = clf.predict(x_test)
count = 0
for i in range(len(result)):
    if result[i] == y_test[i]:
        count += 1
print(count / len(x_test))
0.91
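Finally, the whole counts -> TF-IDF -> classifier chain can be packaged as a scikit-learn Pipeline, which guarantees the test set only ever goes through transform, never fit; a sketch, again with x_train_texts / x_test_texts standing for the tokenized string lists from toke_stopwords:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([
    ('counts', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB(alpha=1)),
])
pipe.fit(x_train_texts, y_train)          # fits every step on the training data only
print(pipe.score(x_test_texts, y_test))   # test-set accuracy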