NLP1(朴素贝叶斯文本分类)
朴素贝叶斯文本分类小demo(通过文本的特征预测文本所属分类)
数据集下载:链接:https://pan.baidu.com/s/1zxrKtTYli2iQgK1iNVP9PQ 提取码:la3w
目的:通过文本的特征预测文本所属分类。
1.导包
import os
import re
import jieba
import pandas as pd
2.加载数据
# Directory that holds the three high-school history question CSV files.
root = "data/百度题库/高中_历史/origin"

# Read the files in a fixed order and bind one DataFrame per history period.
ancient_his_df, contemporary_his_df, modern_his_df = (
    pd.read_csv(os.path.join(root, csv_name))
    for csv_name in ("古代史.csv", '现代史.csv', '近代史.csv')
)
3.数据预处理
def segment_line(line):
    """Remove noise characters from *line* and return its jieba tokens.

    ASCII letters and digits, whitespace, ASCII and Chinese punctuation,
    and the literal word "题目" are deleted first; the remaining text is
    segmented with jieba in accurate mode (``cut_all=False``).

    Args:
        line: Raw question text. A non-string value (for example the NaN
            that pandas yields for an empty CSV cell) is treated as empty.

    Returns:
        The tokens joined by single spaces, or "" for non-string input.
    """
    if not isinstance(line, str):
        # re.sub raises TypeError on NaN/None; an empty document is harmless.
        return ""

    # Raw strings keep the regex escapes intact (no invalid-escape warnings).
    # The last character class previously contained a mangled e-mail
    # placeholder whose "]" closed the class early; it is restored to "~@".
    # Both half-width and full-width forms of : ( ) ? ! , are listed so that
    # Chinese punctuation is stripped as well.
    noise_pattern = (
        r"[a-zA-Z0-9]"
        r"|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+"
        r"|[::+——()()??【】《》“”!!,,。、~@#¥%……&*]+"
        r"|题目"
    )
    cleaned = re.sub(noise_pattern, "", line)
    tokens = jieba.cut(cleaned, cut_all=False)
    return " ".join(tokens)
# Replace each raw question in the 'item' column with its cleaned,
# space-separated token string (segment_line is applied element-wise).
ancient_his_df['item'] = ancient_his_df['item'].apply(segment_line)
contemporary_his_df['item'] = contemporary_his_df['item'].apply(segment_line)
modern_his_df['item'] = modern_his_df['item'].apply(segment_line)
添加标签
# Integer class id per source frame:
# 0 = ancient_his_df, 1 = contemporary_his_df, 2 = modern_his_df.
(
    ancient_his_df['label'],
    contemporary_his_df['label'],
    modern_his_df['label'],
) = 0, 1, 2
4.整合数据集
# Stack the three labelled frames row-wise into a single dataset.
dataset_df = pd.concat(
    [ancient_his_df, contemporary_his_df, modern_his_df],
    axis=0,
)
5.特征提取
# The class is TfidfTransformer; the previous name "TfidfTransform" does not
# exist in sklearn.feature_extraction.text and made this import fail.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

# One space-separated token string per question (the output of segment_line).
corpus = dataset_df['item']

# max_features=2500 keeps the 2500 terms with the highest term frequency
# across the corpus; min_df=5 ignores terms that occur in fewer than 5
# documents.
vectorizer = TfidfVectorizer(max_features=2500, min_df=5)

# Learn the vocabulary and IDF weights, then build the TF-IDF matrix.
X = vectorizer.fit_transform(corpus)
6.建立贝叶斯模型
from sklearn.model_selection import train_test_split
# Fixed the misspelt "GuassianNB" and the stray double comma, which made this
# import a SyntaxError.
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB

# Hold out 20% of the samples for testing; random_state makes the split
# reproducible. The target column is 'label' (the name assigned when the
# class ids were added); 'labels' raised a KeyError. The matrix is converted
# to a dense array before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X.toarray(),
    dataset_df['label'],
    test_size=0.2,
    random_state=4,
)
# Gaussian naive Bayes (fit() returns the fitted estimator itself).
gnb = GaussianNB().fit(X_train, y_train)
gnb_y_pred = gnb.predict(X_test)

# Multinomial naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
clf_y_pred = clf.predict(X_test)

# Complement naive Bayes
cnb = ComplementNB().fit(X_train, y_train)
cnb_y_pred = cnb.predict(X_test)

# Bernoulli naive Bayes
bnb = BernoulliNB().fit(X_train, y_train)
bnb_y_pred = bnb.predict(X_test)
7.评估
from sklearn.metrics import classification_report

# Print per-class precision / recall / F1 on the held-out test set for each
# of the four fitted models. The first heading used to be misspelt
# "GuassianNB"; it now matches the estimator's real name.
print("GaussianNB classification_report:\n")
print(classification_report(y_test, gnb_y_pred))

print('MultinomialNB classification_report: \n')
print(classification_report(y_test, clf_y_pred))

print('ComplementNB classification_report: \n')
print(classification_report(y_test, cnb_y_pred))

print('BernoulliNB classification_report: \n')
print(classification_report(y_test, bnb_y_pred))
结果:
GaussianNB classification_report:

              precision    recall  f1-score   support

           0       0.68      0.83      0.75       213
           1       0.70      0.47      0.56       451
           2       0.58      0.76      0.66       330

    accuracy                           0.64       994
   macro avg       0.65      0.69      0.65       994
weighted avg       0.66      0.64      0.63       994

MultinomialNB classification_report:

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       213
           1       0.77      0.83      0.80       451
           2       0.76      0.71      0.74       330

    accuracy                           0.80       994
   macro avg       0.82      0.81      0.82       994
weighted avg       0.80      0.80      0.80       994

ComplementNB classification_report:

              precision    recall  f1-score   support

           0       0.85      0.92      0.89       213
           1       0.80      0.76      0.78       451
           2       0.74      0.75      0.74       330

    accuracy                           0.79       994
   macro avg       0.80      0.81      0.80       994
weighted avg       0.79      0.79      0.79       994

BernoulliNB classification_report:

              precision    recall  f1-score   support

           0       0.96      0.86      0.91       213
           1       0.79      0.80      0.79       451
           2       0.73      0.76      0.75       330

    accuracy                           0.80       994
   macro avg       0.83      0.81      0.82       994
weighted avg       0.81      0.80      0.80       994