jieba (结巴分词) is a Python module for segmenting and processing Chinese text.
import jieba
luca = jieba.cut('遇见你真的是太好了')
print(list(luca))  # jieba.cut() returns a generator, not a list; wrap it with list()/set() or ''.join(), or use jieba.lcut() to get a list directly
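As a quick sketch of the alternatives mentioned in the comment above (jieba.lcut() and str.join()):
import jieba
print(jieba.lcut('遇见你真的是太好了'))           # lcut() returns a list directly
print(' '.join(jieba.cut('遇见你真的是太好了')))  # or join the generator into a space-separated string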
import jieba.analyse
# text is the document string to analyse; topK sets how many keywords are returned
for x, w in jieba.analyse.extract_tags(text, topK=20, withWeight=True):  # withWeight=True makes it yield (keyword, weight) pairs
    print(x, w)  # prints each keyword and its TF-IDF weight
The keywords this jieba function returns were not very satisfactory on my data, so I did not use it in practice; I also did not use the stop-word mechanism in jieba.analyse, and instead filtered stop words manually.
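For reference, the analyse stop-word mechanism mentioned above looks roughly like this (a sketch; the stop-word file path is assumed, one word per line, and text is the document string from the earlier snippet):
import jieba.analyse
jieba.analyse.set_stop_words(r'F:\jieba\stop_words.txt')  # words listed in this file are excluded from extract_tags
for x, w in jieba.analyse.extract_tags(text, topK=20, withWeight=True):
    print(x, w)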
import re

def rm_char1(text1):
    text1 = re.sub('\u3000', '', text1)  # remove ideographic spaces
    return text1

def rm_char2(text2):
    text2 = re.sub('\xa0', '', text2)  # remove non-breaking spaces
    return text2
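The two helpers above could also be merged into a single pass; a minimal sketch (rm_whitespace_chars is a hypothetical name, not part of the original code):
import re
def rm_whitespace_chars(text):
    return re.sub('[\u3000\xa0]', '', text)  # strip ideographic and non-breaking spaces with one regex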
def get_stop_words():
    # stop_words.txt has one stop word per line, separated by \n
    with open(r'F:\jieba\stop_words.txt', 'r', encoding='utf8') as f:
        file = f.read().split('\n')
    return set(file)
def rm_tokens(words):  # remove stop words and pure digits
    words_list = list(words)
    stop_words = get_stop_words()
    for i in range(len(words_list) - 1, -1, -1):  # iterate backwards so pop() does not shift the indices still to be visited
        if words_list[i] in stop_words:   # drop stop words
            words_list.pop(i)
        elif words_list[i].isdigit():     # drop pure digits
            words_list.pop(i)
    return words_list
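The same filtering can be written as a list comprehension; a sketch (rm_tokens_alt is a hypothetical name, and the stop-word set is passed in explicitly instead of being loaded inside the function):
def rm_tokens_alt(words, stop_words):
    return [w for w in words if w not in stop_words and not w.isdigit()]  # keep tokens that are neither stop words nor pure digits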
def convert_text_to_wordlist(str_doc):
    # main segmentation routine: split into lines, clean characters, segment, filter
    sent_list = str_doc.split('\n')
    sent_list = map(rm_char1, sent_list)  # strip \u3000
    sent_list = map(rm_char2, sent_list)  # strip \xa0
    word_2dlist = [rm_tokens(jieba.cut(part)) for part in sent_list]  # segment each line and filter its tokens
    word_list = sum(word_2dlist, [])      # flatten the list of lists
    return word_list
To use it, just call:
luca = convert_text_to_wordlist('遇见你真的是太好了')
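To get word frequencies from this manual pipeline (instead of jieba.analyse.extract_tags), collections.Counter works; a sketch, assuming the functions above are defined and the stop-word file is in place:
from collections import Counter
text = '遇见你真的是太好了\n遇见你真的是太好了'  # any multi-line document string works here
tokens = convert_text_to_wordlist(text)
print(Counter(tokens).most_common(20))          # the most frequent tokens after stop-word filtering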
import jieba
import jieba.analyse
import csv
from sklearn import feature_extraction
def get_dataset():
    data, targetdata = [], []
    with open(r'D:\datatrain.csv', 'r', encoding='gb18030') as file:
        f = csv.reader(file)
        for line in f:
            seglist = jieba.cut(line[2])   # third column holds the raw text
            words = ' '.join(seglist)      # join tokens with spaces so TfidfVectorizer can split them later
            data.append(words)
            targetdata.append(1 if 'T' in line[1] or 't' in line[1] else 0)  # second column holds the label
    return data, targetdata
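get_dataset() assumes each row of datatrain.csv has the label ('T'/'F') in the second column and the raw text in the third. A minimal usage sketch:
data, targetdata = get_dataset()
print(len(data), len(targetdata))  # number of segmented documents and labels
print(data[0])                     # first document: tokens joined by spaces
print(targetdata[:10])             # 1 for 'T'/'t' labels, 0 otherwise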
def get_testset():
    testdata, targettest = [], []
    with open(r'D:\datatest.csv', 'r', encoding='gb18030') as file:
        f = csv.reader(file)
        for line in f:
            seglist = jieba.cut(line[2])
            words = ' '.join(seglist)
            testdata.append(words)
            targettest.append(1 if 'T' in line[1] or 't' in line[1] else 0)
    return testdata, targettest
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
def data_pro():
    data_, target_train = get_dataset()
    testdata, target_test = get_testset()
    v = TfidfVectorizer()
    train_data = v.fit_transform(data_)  # learn the vocabulary and IDF weights on the training texts
    test_data = v.transform(testdata)    # reuse the same vocabulary for the test texts, so feature columns match
    return train_data, target_train, test_data, target_test
train_data, target_train, test_data, target_test = data_pro()
clf = MultinomialNB(alpha=0.01)
clf.fit(train_data, target_train)
pred = clf.predict(test_data)  # predict on the TF-IDF matrix, not the raw text list
'''# In my tests this performed worse than Naive Bayes
svc = svm.SVC(kernel='linear')
svc.fit(train_data, target_train)
pred = svc.predict(test_data)
'''
count = 0
for l, r in zip(pred, target_test):
    if l == r:
        count += 1
print(count / len(target_test))  # prints the accuracy
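The same number can be obtained with sklearn's accuracy_score; a sketch, reusing pred and target_test from above:
from sklearn.metrics import accuracy_score
print(accuracy_score(target_test, pred))  # fraction of correct predictions, identical to the manual count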