NLP -- Day 5 (Naive Bayes)

Bayes' Theorem

For two events A and B with $P(B) > 0$, Bayes' theorem states
$$P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}.$$

Description of the Bayes Model

Given Conditions

Suppose the samples for our classification model are
$$(x_1^{(1)}, x_2^{(1)}, \dots, x_n^{(1)}, y_1),\ (x_1^{(2)}, x_2^{(2)}, \dots, x_n^{(2)}, y_2),\ \dots,\ (x_1^{(m)}, x_2^{(m)}, \dots, x_n^{(m)}, y_m),$$
i.e., there are m samples, each with n features, and the output takes one of K classes, denoted $C_1, C_2, \dots, C_K$.

Goal

Given the conditions above, we want the Bayes model to classify a test sample $X^{test} = (x_1^{test}, x_2^{test}, \dots, x_n^{test})$ by maximizing the posterior probability, i.e., by predicting $P(Y=C_k \mid X=X^{test})$ for each class and choosing the class for which it is largest.

Derivation

We need $P(Y=C_k \mid X=X^{test})$. By Bayes' theorem,
$$P(Y=C_k \mid X=X^{test}) = \frac{P(X=X^{test} \mid Y=C_k)\,P(Y=C_k)}{\displaystyle\sum_{k} P(X=X^{test} \mid Y=C_k)\,P(Y=C_k)}.$$
$C_{result}$ is the class that maximizes $P(Y=C_k \mid X=X^{test})$; in mathematical form:
$$C_{result} = \underset{C_k}{\arg\max}\ P(Y=C_k \mid X=X^{test}) = \underset{C_k}{\arg\max}\ \frac{P(X=X^{test} \mid Y=C_k)\,P(Y=C_k)}{\displaystyle\sum_{k} P(X=X^{test} \mid Y=C_k)\,P(Y=C_k)}.$$
Since the denominator is the same for every class it can be dropped, and under the naive assumption that the features are conditionally independent given the class,
$$C_{result} = \underset{C_k}{\arg\max}\ P(Y=C_k)\prod_{j=1}^{n} P(X_j = x_j^{test} \mid Y=C_k).$$
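
To make the decision rule concrete, here is a tiny hand-worked example in Python; the classes, features, and probability values are made up purely for illustration and are not from the original notes.

priors = {'spam': 0.4, 'ham': 0.6}                       # P(Y=C_k), assumed already estimated
likelihoods = {                                          # P(X_j = x_j | Y=C_k), also assumed
    'spam': {'contains_link': 0.8, 'all_caps': 0.5},
    'ham':  {'contains_link': 0.2, 'all_caps': 0.1},
}
test_features = ['contains_link', 'all_caps']            # feature values of the test sample

scores = {}
for c in priors:
    score = priors[c]
    for f in test_features:
        score *= likelihoods[c][f]                       # P(Y=c) * prod_j P(X_j = x_j | Y=c)
    scores[c] = score

# The denominator is identical for every class, so comparing the numerators is enough:
# spam: 0.4 * 0.8 * 0.5 = 0.16, ham: 0.6 * 0.2 * 0.1 = 0.012  ->  predict 'spam'
print(max(scores, key=scores.get))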

Algorithm

In summary, the naive Bayes procedure is:

  • estimate the class priors $P(Y=C_k)$ from the class frequencies in the training set;
  • within each class, estimate the conditional probability $P(X_j = x_j \mid Y=C_k)$ of every feature value, typically with Laplace smoothing so that unseen values do not get probability zero;
  • for a test sample $X^{test}$, compute $P(Y=C_k)\prod_{j=1}^{n} P(X_j = x_j^{test} \mid Y=C_k)$ for every class and output the class with the largest value as $C_{result}$.
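
To see these steps in code, below is a minimal from-scratch sketch of a categorical naive Bayes classifier with Laplace smoothing; the functions, variable names, and toy weather data are my own illustrations, not part of the original notes.

import numpy as np
from collections import defaultdict

def train_naive_bayes(X, y, alpha=1.0):
    # X: list of samples, each a tuple of n categorical feature values; y: list of class labels
    classes = sorted(set(y))
    n_features = len(X[0])
    # step 1: class priors with Laplace smoothing
    priors = {c: (y.count(c) + alpha) / (len(y) + alpha * len(classes)) for c in classes}
    # step 2: per-class, per-feature value counts (a simple count table per class and feature)
    counts = {c: [defaultdict(int) for _ in range(n_features)] for c in classes}
    for xi, yi in zip(X, y):
        for j, v in enumerate(xi):
            counts[yi][j][v] += 1
    values = [set(x[j] for x in X) for j in range(n_features)]
    cond = {c: [{v: (counts[c][j][v] + alpha) / (sum(counts[c][j].values()) + alpha * len(values[j]))
                 for v in values[j]} for j in range(n_features)]
            for c in classes}
    return priors, cond, classes

def predict(x, priors, cond, classes):
    # step 3: argmax of log P(Y=c) + sum_j log P(X_j = x_j | Y=c); logs avoid numerical underflow
    scores = {c: np.log(priors[c]) + sum(np.log(cond[c][j].get(v, 1e-12)) for j, v in enumerate(x))
              for c in classes}
    return max(scores, key=scores.get)

# Tiny made-up dataset: features are (weather, temperature), label is whether to play
X = [('sunny', 'hot'), ('sunny', 'mild'), ('rainy', 'mild'), ('rainy', 'cool')]
y = ['no', 'yes', 'yes', 'no']
priors, cond, classes = train_naive_bayes(X, y)
print(predict(('sunny', 'mild'), priors, cond, classes))   # -> 'yes'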

Pros and Cons of Naive Bayes

Advantages

  • The algorithm is logically simple and easy to implement;
  • Classification has low time and space overhead (thanks to the feature-independence assumption, only a two-dimensional table of per-class, per-feature statistics needs to be stored).

Disadvantages

In theory, naive Bayes has the lowest error rate compared with other classification methods. In practice this is not always the case, because the model assumes the features are mutually independent, an assumption that often fails in real applications; when the number of features is large or the features are strongly correlated, classification performance suffers.

Applicable data type: nominal (categorical) data.

Bayesian Text Classification

News Text Classification

import random
import pandas as pd
import numpy as np
import jieba
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
with open("./cnews.train.txt", 'r', encoding="utf-8") as file:
    train = file.readlines()
with open("./cnews.test.txt", 'r', encoding="utf-8") as file:
    test = file.readlines()
def exactua(data):
    # Shuffle the dataset, then split each "label\ttext" line into text and label
    random.shuffle(data)
    x_data = []
    y_data = []
    for line in data:
        fields = line.replace('\n', '').split('\t')
        x_data.append(fields[1])
        y_data.append(fields[0])
    return x_data, y_data
x_train_data, y_train_data = exactua(train)
x_test_data, y_test_data = exactua(test)
x_train_data, y_train_data = x_train_data[:2000], y_train_data[:2000]
x_test_data, y_test_data= x_test_data[:200], y_test_data[:200]

def stopwordslist():
    # Load the stop-word list, one word per line
    stopwords = []
    with open('stopwords.txt', 'r') as fr:
        for line in fr:
            stopwords.append(line.rstrip('\n'))
    return stopwords
stopwords = stopwordslist()
def toke_stopwords(x_train_data, x_test_data):
    # Tokenize with jieba and remove stop words, joining the tokens back with spaces
    train_lists = []
    for i in x_train_data:
        word_list = [word for word in jieba.cut(i) if word not in stopwords]
        train_lists.append(' '.join(word_list))
    test_lists = []
    for i in x_test_data:
        word_list = [word for word in jieba.cut(i) if word not in stopwords]
        test_lists.append(' '.join(word_list))
    return train_lists, test_lists
        
x_train, x_test = toke_stopwords(x_train_data, x_test_data)
len(x_train)
# len(x_test)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\hp\AppData\Local\Temp\jieba.cache
Loading model cost 0.589 seconds.
Prefix dict has been built succesfully.

2000

def data2vec(x_train, x_test):
    from sklearn.feature_extraction.text import CountVectorizer

    count_vec = CountVectorizer(stop_words=stopwords)  # build the bag-of-words vocabulary
    X_count_train = count_vec.fit_transform(x_train)   # scipy.sparse csr_matrix
    # convert train and test into dense feature vectors
    X_count_train = X_count_train.toarray()
    X_count_test = count_vec.transform(x_test).toarray()
    return X_count_train, X_count_test, dict(count_vec.vocabulary_)

x_train, x_test, wordDic = data2vec(x_train, x_test)

F:\anaconda1\envs\baseline\lib\site-packages\sklearn\feature_extraction\text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['lex', '①①', '①②', '①③', '①④', '①⑤', '①⑥', '①⑦', '①⑧', '①⑨', '①a', '①b', '①c', '①d', '①e', '①f', '①g', '①h', '①i', '①o', '②①', '②②', '②③', '②④', '②⑤', '②⑥', '②⑦', '②⑧', '②⑩', '②a', '②b', '②d', '②e', '②f', '②g', '②h', '②i', '②j', '③①', '③⑩', '③a', '③b', '③c', '③d', '③e', '③f', '③g', '③h', '④a', '④b', '④c', '④d', '④e', '⑤a', '⑤b', '⑤d', '⑤e', '⑤f', '12', 'li', 'zxfitl'] not in stop_words.
  'stop_words.' % sorted(inconsistent))

x_train.shape

(2000, 63169)
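
A note on the design choice above, not part of the original code: calling .toarray() materializes a 2000 x 63169 dense matrix, which is memory-hungry, while MultinomialNB can consume scipy sparse matrices directly. A sketch of an equivalent variant that keeps the data sparse (the name data2vec_sparse is my own):

def data2vec_sparse(x_train, x_test):
    from sklearn.feature_extraction.text import CountVectorizer

    count_vec = CountVectorizer(stop_words=stopwords)
    X_count_train = count_vec.fit_transform(x_train)   # keep the scipy.sparse CSR matrix
    X_count_test = count_vec.transform(x_test)         # MultinomialNB accepts sparse input as-is
    return X_count_train, X_count_test, dict(count_vec.vocabulary_)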

print ('vocabulary:\n\n')
count = 0 
for key,value in wordDic.items():
    print(key,value)
    count += 1
    if count > 10:
        break

vocabulary:

原作者 18243
认可 54166
山楂树 27207
张伟 29182
平称 28198
影片 29708
慢热 31183
新浪 35516
娱乐 24974
开拍 28961
上映 7058

def y2label(y_train, y_test):
    # Map each class name to an integer id, then encode both label lists
    dic = {}
    count = 0
    for i in set(y_train):
        dic[i] = count
        count += 1
    y_train, y_test = [dic.get(i) for i in y_train], [dic.get(i) for i in y_test]
    return y_train, y_test, dic

y_train, y_test, dicLabel = y2label(y_train_data, y_test_data)

y_train[:3]

[7, 3, 7]
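
For reference, y2label hand-rolls what sklearn.preprocessing.LabelEncoder already provides; a minimal sketch of the equivalent calls (the names y_train_enc / y_test_enc are my own):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_data)   # learn the label-to-id mapping on the training labels
y_test_enc = le.transform(y_test_data)         # reuse the same mapping for the test labels
print(dict(zip(le.classes_, le.transform(le.classes_))))   # inspect the mapping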

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha = 0.01).fit(x_train,y_train)

#  Evaluate accuracy on the training set
result = clf.predict(x_train)
count = 0
for i in range(len(result)):
    if result[i] == y_train[i]:
        count += 1
print(count/len(x_train))

0.9985

#  Evaluate accuracy on the test set
result = clf.predict(x_test)
count = 0
for i in range(len(result)):
    if result[i] == y_test[i]:
        count += 1
print(count/len(x_test))

0.95
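
The counting loops above compute plain accuracy by hand; sklearn.metrics gives the same number plus a per-class breakdown. A short sketch using the clf, x_test, and y_test already defined above:

from sklearn.metrics import accuracy_score, classification_report

pred = clf.predict(x_test)
print(accuracy_score(y_test, pred))          # same value as the manual loop
print(classification_report(y_test, pred))   # per-class precision / recall / F1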

# Text classification combined with TF-IDF weighting

import numpy as np
import random 
with open("./cnews.train.txt", 'r', encoding="utf-8") as file:
    train = file.readlines()
with open("./cnews.test.txt", 'r', encoding="utf-8") as file:
    test = file.readlines()
x_train_data, y_train_data = exactua(train)
x_test_data, y_test_data = exactua(test)
# 2000 training samples
# 200 test samples
x_train_data, y_train_data = x_train_data[:2000], y_train_data[:2000]
x_test_data, y_test_data= x_test_data[:200], y_test_data[:200]
x_train, x_test = toke_stopwords(x_train_data, x_test_data)
y_train, y_test, dicLabel = y2label(y_train_data, y_test_data)

def data2vec(x_train, x_test):
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    count_vec = CountVectorizer(stop_words=stopwords)  # build the bag-of-words vocabulary
    transformer = TfidfTransformer()
    X_count_train = count_vec.fit_transform(x_train)
    X_count_test = count_vec.transform(x_test)

    # fit the IDF weights on the training counts only, then apply the same weights to the test counts
    X_train_tfidf = transformer.fit_transform(X_count_train).toarray()
    X_test_tfidf = transformer.transform(X_count_test).toarray()
    return X_train_tfidf, X_test_tfidf

x_train, x_test = data2vec(x_train, x_test)

F:\anaconda1\envs\baseline\lib\site-packages\sklearn\feature_extraction\text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['lex', '①①', '①②', '①③', '①④', '①⑤', '①⑥', '①⑦', '①⑧', '①⑨', '①a', '①b', '①c', '①d', '①e', '①f', '①g', '①h', '①i', '①o', '②①', '②②', '②③', '②④', '②⑤', '②⑥', '②⑦', '②⑧', '②⑩', '②a', '②b', '②d', '②e', '②f', '②g', '②h', '②i', '②j', '③①', '③⑩', '③a', '③b', '③c', '③d', '③e', '③f', '③g', '③h', '④a', '④b', '④c', '④d', '④e', '⑤a', '⑤b', '⑤d', '⑤e', '⑤f', '12', 'li', 'zxfitl'] not in stop_words.
  'stop_words.' % sorted(inconsistent))
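
As a side note, the CountVectorizer + TfidfTransformer pair can be collapsed into a single TfidfVectorizer, which learns the vocabulary and the IDF weights in one step. A sketch of that alternative, assuming train_texts / test_texts hold the whitespace-joined token strings returned by toke_stopwords:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(stop_words=stopwords)
X_train_tfidf = tfidf_vec.fit_transform(train_texts)   # fit vocabulary and IDF on the training texts
X_test_tfidf = tfidf_vec.transform(test_texts)         # apply the same vocabulary/IDF to the test texts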

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha = 1).fit(x_train,y_train)

#  Evaluate accuracy on the training set
result = clf.predict(x_train)
count = 0
for i in range(len(result)):
    if result[i] == y_train[i]:
        count += 1
print(count/len(x_train))

0.9805

result = clf.predict(x_test)
count = 0
for i in range(len(result)):
    if result[i] == y_test[i]:
        count += 1
print(count/len(x_test))

0.91
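
The two runs above fixed the smoothing strength by hand (alpha = 0.01 for raw counts, alpha = 1 for TF-IDF); it is usually worth tuning instead. Below is a sketch of how that could be done with a Pipeline and cross-validated grid search; the parameter grid is only an example, and train_texts / y_train are assumed to be the tokenized texts and integer labels prepared above.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('nb', MultinomialNB()),
])
param_grid = {'nb__alpha': [0.01, 0.1, 0.5, 1.0]}        # candidate smoothing strengths
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(train_texts, y_train)                           # 5-fold CV over the training texts
print(grid.best_params_, grid.best_score_)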