import jieba
import math
import wordcloud
import matplotlib.pyplot as plt
# Build the stop-word list.
def stopword(path1):
    """Read a stop-word file and return its entries as a list.

    Args:
        path1: Path of a UTF-8 encoded text file; every line is one entry.

    Returns:
        A list with one string per line of the file, each stripped of
        leading and trailing whitespace. Blank lines are kept as empty
        strings.
    """
    # The context manager closes the handle even if reading raises.
    with open(path1, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]
# Count word frequencies and pick the most frequent words.
def comment(path2, stopwords, top_n=6):
    """Count token frequencies in a comment file and select the top words.

    Each line of the file is segmented with ``jieba.lcut``. A token is
    counted only if it is longer than one character and is not a stop word.
    The full (word, count) list sorted by descending count and the selected
    top words are both printed.

    Args:
        path2: Path of a UTF-8 encoded text file, one comment per line.
        stopwords: Iterable of words to ignore.
        top_n: How many of the most frequent words to return (default 6).

    Returns:
        A tuple ``(lis, dic)`` where ``lis`` is the list of the ``top_n``
        most frequent words and ``dic`` maps every counted word to its
        frequency.
    """
    # Build the set once: membership is tested for every token, and a list
    # lookup would be linear in the number of stop words each time.
    stop_set = set(stopwords)
    dic = {}
    with open(path2, 'r', encoding='utf-8') as f:
        for line in f:
            for w in jieba.lcut(line.strip('\n')):
                if len(w) > 1 and w not in stop_set:
                    dic[w] = dic.get(w, 0) + 1
    # sorted() is stable, so words with equal counts keep first-seen order.
    dic_order = sorted(dic.items(), key=lambda x: x[1], reverse=True)
    print(dic_order)
    lis = [k[0] for k in dic_order[:top_n]]
    print(lis)
    return lis, dic
# Check whether each comment (document) contains the high-frequency words.
def matrics(path2, lis):
    """Turn every comment into a 0/1 vector over the feature words.

    Each line of the file is segmented with ``jieba.lcut``; position ``k``
    of the line's vector is set to 1 when the word ``lis[k]`` occurs in
    that line.

    Args:
        path2: Path of a UTF-8 encoded text file, one comment per line.
        lis: Sequence of feature words; a word's position in this sequence
            is its coordinate index.

    Returns:
        A list with one vector (list of 0/1 ints) per line of the file.
        Vectors have ``len(lis)`` entries, but never fewer than 6.
    """
    # Map each word to its first position once, instead of scanning the
    # list twice (membership test + index) for every token.
    index_of = {}
    for pos, term in enumerate(lis):
        index_of.setdefault(term, pos)
    # Size the vectors from the feature list so more than six features no
    # longer raise IndexError; keep at least 6 slots so shorter lists still
    # produce the six-dimensional vectors this function always returned.
    width = max(len(lis), 6)
    matric = []
    with open(path2, 'r', encoding='utf-8') as f:
        for line in f:
            mat = [0] * width
            for w in jieba.lcut(line.strip('\n')):
                pos = index_of.get(w)
                if pos is not None:
                    mat[pos] = 1  # change to += 1 to count occurrences
            matric.append(mat)
    return matric
# Compute the distance between different comments.
def distance(matric):
    """Return the pairwise Euclidean distances between comment vectors.

    Previously each distance was computed and immediately discarded, so the
    function had no observable result; it now returns them.

    Args:
        matric: Sequence of numeric vectors, one per comment.

    Returns:
        A square list of lists ``d`` where ``d[i][j]`` is the Euclidean
        distance between ``matric[i]`` and ``matric[j]``. The result holds
        ``len(matric) ** 2`` floats. An empty input yields ``[]``.
    """
    # zip() pairs the coordinates, so the vector length is taken from the
    # data rather than being fixed at six.
    return [
        [
            math.sqrt(sum((a - b) ** 2 for a, b in zip(row_i, row_j)))
            for row_j in matric
        ]
        for row_i in matric
    ]
# Compute the centroid of all comments and print it.
def center(matric):
    """Compute, print and return the centroid of the comment vectors.

    The centroid is the per-coordinate mean of all vectors. It is printed
    with the prefix ``'point = '`` and also returned.

    Args:
        matric: Sequence of numeric vectors, one per comment.

    Returns:
        The centroid as a list of numbers. For an empty input this is a
        list of six zeros, matching what was printed before.
    """
    if not matric:
        # No vectors to average: keep the historical all-zero 6-D output.
        point = [0] * 6
    else:
        count = len(matric)
        # Sum each coordinate first and divide once; dividing every term
        # separately accumulates floating-point rounding error.
        point = [sum(column) / count for column in zip(*matric)]
    print('point = ', point)
    return point
# Draw the word cloud.
def wcloud(dic, font_path='C:/Windows/SIMLI.TTF'):
    """Render a word cloud from a word-frequency mapping and display it.

    Args:
        dic: Mapping of word -> frequency, passed to
            ``WordCloud.generate_from_frequencies``.
        font_path: Font file used to draw the words. The default is a
            Windows-specific path; pass another font file on other systems.

    Returns:
        The generated ``wordcloud.WordCloud`` object, so the caller can,
        for example, save it with its ``to_file`` method.
    """
    wc = wordcloud.WordCloud(  # build the cloud from the frequency dict
        max_words=200,             # maximum number of words shown
        max_font_size=300,         # largest font size
        background_color="white",  # white background (default is black)
        width=1500,                # image width
        height=960,                # image height
        margin=10,                 # image margin
        font_path=font_path,
    )
    wc.generate_from_frequencies(dic)  # generate the cloud from the dict
    plt.imshow(wc)   # show the word cloud
    plt.axis('off')  # hide the axes
    plt.show()       # display the figure
    return wc
# Define the main function.
def main():
    """Run the pipeline: stop words, word counts, vectors, statistics, cloud."""
    stopword_file = 'stopwords_list.txt'
    comment_file = 'jd_comments.txt'
    # Load the stop words first, then count words in the comment file.
    top_words, frequencies = comment(comment_file, stopword(stopword_file))
    # Vectorise every comment over the selected high-frequency words.
    vectors = matrics(comment_file, top_words)
    distance(vectors)
    center(vectors)
    wcloud(frequencies)
# Call main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
部分高频词统计
[('不错', 451), ('电脑', 332), ('非常', 297), ('没有', 236), ('速度', 234), ('客服', 187), ('问题', 186), ('开机', 183), ('京东', 180), ('满意', 176), ('感觉', 166), ('很快', 149), ('收到', 147), ('东西', 135), ('系统', 133), ('真的', 128), ('使用', 126), ('键盘', 118), ('喜欢', 115), ('包装', 113), ('有点', 111), ('比较', 105), ('hellip', 105), ('外观', 104),
('游戏', 101), ('效果', 101), ('硬盘', 100), ('物流', 98), ('价格', 97), ('快递', 97), ('屏幕', 97), ('性能', 93), ('机器', 88), ('流畅', 88), ('性价比', 87), ('一下', 87), ('运行', 85), ('安装', 84), ('购买', 82), ('评价', 82), ('值得', 81), ('鼠标', 80), ('方便', 78), ('不是', 78), ('一次', 78), ('知道', 77), ('配置', 77), ('总体', 77), ('耐心', 76),
('卖家', 74), ('好评', 74), ('现在', 73), ('笔记本', 73), ('特别', 73), ('推荐', 69), ('第一次', 68), ('希望', 67),
('来说', 65), ('一点', 65), ('服务', 64), ('质量', 64), ('固态', 63), ('内存', 63), ('很多', 63), ('已经', 62), ('软件', 62), ('清晰', 61), ('以后', 60), ('机子', 60), ('办公', 59), ('声音', 58), ('购物', 58), ('需要', 58), ('几天', 58),
('之前', 57), ('发货', 53), ('朋友', 53), ('打开', 53), ('散热', 53), ('店家', 52), ('态度', 52), ('完美', 52), ('觉得', 52), ('最后', 51), ('做工', 51), ('好看', 50), ('支持', 50), ('玩游戏', 50), ...]
特征词组成的特征集
['不错', '电脑', '非常', '没有', '速度', '客服']
部分评论的“坐标”
[1, 0, 0, 0, 1, 0]
[1, 1, 1, 0, 0, 0]
[1, 0, 1, 1, 1, 0]
[0, 1, 0, 0, 0, 0]
[0, 1, 1, 0, 0, 1]
[1, 1, 0, 0, 1, 1]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 0, 0]
[1, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 1]
[1, 0, 0, 1, 0, 0]
[0, 1, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0]
[1, 0, 1, 1, 0, 0]
[0, 0, 0, 0, 1, 0]
[1, 1, 0, 0, 1, 0]
[0, 1, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 1]
[0, 1, 0, 1, 0, 1]
[0, 0, 0, 0, 0, 0]
[0, 1, 1, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 1, 1, 0, 0, 0]
[0, 0, 0, 0, 1, 0]
[0, 1, 1, 1, 0, 0]
[1, 1, 1, 0, 0, 0]
[0, 1, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0]
[1, 1, 0, 1, 0, 0]
[1, 0, 0, 0, 1, 0]
[1, 0, 0, 1, 1, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 0, 0]
[0, 0, 1, 0, 0, 0]
[0, 0, 0, 0, 1, 0]
[0, 0, 0, 1, 1, 0]
所有评论的“重心”
point = [0.3383233532934152, 0.23253493013972026, 0.19960079840319336, 0.19161676646706563, 0.19161676646706563, 0.1447105788423152]
词云可视化