See also this article, which additionally covers stop words and model evaluation:
https://www.cnblogs.com/Lin-Yi/p/8974108.html
Approach:
- Label the original texts by class.
- Extract text features with CountVectorizer and TfidfVectorizer:
  - CountVectorizer considers only how often each word occurs within a text.
  - TfidfVectorizer additionally weighs how many documents contain the word, which dampens the influence of frequent but uninformative words and surfaces more meaningful features (see the sketch after this list).
- Train and predict with a naive Bayes classifier.
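To make the difference concrete, here is a minimal sketch on a toy corpus; the two example sentences are made up for illustration, and get_feature_names_out assumes scikit-learn >= 1.0:

# Minimal sketch: raw counts vs. TF-IDF weights on a made-up two-document corpus
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ["the cat sat on the mat", "the dog sat on the log"]

cv = CountVectorizer()
counts = cv.fit_transform(docs)
print(cv.get_feature_names_out())          # learned vocabulary (scikit-learn >= 1.0)
print(counts.toarray())                    # raw term counts per document

tv = TfidfVectorizer()
print(tv.fit_transform(docs).toarray())    # counts reweighted by inverse document frequency

TfidfVectorizer is equivalent to running CountVectorizer followed by TfidfTransformer, which is exactly the two-step pipeline used in the code below.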
# -*- coding: utf-8 -*-
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer

def loaddata(path, class1):
    """Read every file under `path`, segment it with jieba,
    and return the space-joined texts plus their class labels."""
    allfile = os.listdir(path)
    textdata = []
    classall = []
    for thisfile in allfile:
        with open(path + "/" + thisfile, "r", encoding="gbk") as f:
            data = f.read()
        # jieba.cut returns a token generator; join with spaces so the
        # vectorizer can split on whitespace
        textdata.append(" ".join(jieba.cut(data)))
        classall.append(class1)
    return textdata, classall

# Label love stories as class 0 and ghost stories as class 1
text1, class1 = loaddata("F:/work/Textming/文本/爱情故事", 0)
text2, class2 = loaddata("F:/work/Textming/文本/鬼故事", 1)
train_text = text1 + text2
classall = class1 + class2
print(train_text)
print(classall)

# Bag-of-words term counts
count_vect = CountVectorizer()
train_x_counts = count_vect.fit_transform(train_text)
print('=' * 30)
print(train_x_counts)

# TF model: use_idf=False only normalizes raw counts to term frequencies;
# set use_idf=True to get true TF-IDF weights
from sklearn.feature_extraction.text import TfidfTransformer
tf_ts = TfidfTransformer(use_idf=False).fit(train_x_counts)
train_x_tf = tf_ts.transform(train_x_counts)
print('=' * 30)
print('train_x_tf', type(train_x_tf))
print(train_x_tf)

# Train a multinomial naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_x_tf, classall)

# Classify new texts (already segmented, space-separated)
new_text = ["有鬼", "爱情 等待", "我 一直 在 等待", "诡异 蜡烛", "房间 有鬼"]
new_x_counts = count_vect.transform(new_text)
new_x_tf = tf_ts.transform(new_x_counts)
print(new_x_tf)
predicted = clf.predict(new_x_tf)
print(predicted)
Output: the new_text samples are classified as follows (0 = love story, 1 = ghost story):
[1 0 0 0 1]
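The article linked at the top also adds stop-word filtering and model evaluation. Here is a minimal sketch of how that could be bolted onto the pipeline above; it reuses train_text and classall from the listing, while the stop-word list, the 25% test split, and the random seed are assumptions for illustration, not from the original:

# Hypothetical extension: stop-word filtering + held-out evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

stop_words = ["的", "了", "在", "是"]  # toy list; a real one would be loaded from a stop-word file
x_train, x_test, y_train, y_test = train_test_split(
    train_text, classall, test_size=0.25, random_state=42)

vect = CountVectorizer(stop_words=stop_words)  # drop uninformative high-frequency tokens
x_train_counts = vect.fit_transform(x_train)
x_test_counts = vect.transform(x_test)

clf = MultinomialNB().fit(x_train_counts, y_train)
print(accuracy_score(y_test, clf.predict(x_test_counts)))  # fraction of held-out texts classified correctly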