Background
The main algorithm for automatic article summarization:
- Obtain the article to be summarized
- Run word-frequency statistics over the article
- Split the article into sentences
  Following Chinese punctuation, sentences are usually split on '。', '?', and similar end-of-sentence marks
- Compute the cosine similarity between each sentence and the full article (a minimal sketch follows this list)
- Take the sentence with the highest similarity as the summary of the article
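A minimal, self-contained sketch of the whole pipeline (the toy English document, its sentences, and the variable names are invented for illustration; the full sample below works on a real Chinese corpus with jieba and a stopword list):

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances

doc = "The cat sat on the mat. The dog barked. The cat chased the dog."
# Split the document into sentences on end-of-sentence punctuation
sentences = [s for s in re.split(r'[.?!]\s*', doc) if s]
# Vectorize the full document together with its sentences (row 0 = document)
vectors = CountVectorizer().fit_transform([doc] + sentences)
# Cosine distance of every row against every other row
distances = pairwise_distances(vectors, metric="cosine")
# The sentence closest to the document (smallest distance in row 0,
# skipping column 0, which is the document against itself) is the summary
best = distances[0, 1:].argmin()
print(sentences[best])  # -> The cat chased the dog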
Sample code
# -*- coding:utf-8 -*-
import re
import jieba
import codecs
import numpy
import pandas
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
contents = []
# Build the corpus from a single source document
with codecs.open('../data/input/war_cp.txt', 'r', 'utf-8') as f_in:
    contents.append(f_in.read())
corpos = pandas.DataFrame({
    'content': contents
})
# Load the stopword list
stopwords = pandas.read_csv(
    "../data/input/StopwordsCN.txt",
    encoding='utf8',
    index_col=False,
    quoting=3,
    sep="\t"
)
# Extract terms, filtering out stopwords; min_df=1 keeps every term that
# appears at least once (recent scikit-learn rejects the integer 0 here)
countVectorizer = CountVectorizer(
    stop_words=list(stopwords['stopword'].values),
    min_df=1, token_pattern=r"\b\w+\b"
)
contents = []
summarys = []
for index, row in corpos.iterrows():
    fileContent = row['content']
    # Build a sub-corpus: the full document followed by its sentences
    subCorpos = [fileContent] + re.split(
        r'[。?!\n]\s*',
        fileContent
    )
    segments = []
    suitCorpos = []
    for content in subCorpos:
        # Word-segment each candidate with jieba; keep only candidates
        # whose segmented form is longer than 10 characters
        segs = jieba.cut(content)
        segment = " ".join(segs)
        if len(segment.strip()) > 10:
            segments.append(segment)
            suitCorpos.append(content)
    # Build the term-frequency feature matrix textVector
    textVector = countVectorizer.fit_transform(segments)
    # Cosine distances between every pair of rows; row 0 holds the
    # distances between the full document and each sentence
    distance_matrix = pairwise_distances(
        textVector,
        metric="cosine"
    )
    # Sort each row of the distance matrix by ascending distance
    sort = numpy.argsort(distance_matrix, axis=1)
    # In sort[0], position 0 is the document itself (distance 0), so
    # position 1 is the sentence most similar to the document
    summary = pandas.Index(suitCorpos)[sort[0]].values[1]
    summarys.append(summary)
    contents.append(fileContent)
summaryDF = pandas.DataFrame({
    'content': contents,
    'summary': summarys
})
print(summaryDF)
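Since sort[0] puts the document itself first and the remaining entries in ascending distance, the same row can also supply a multi-sentence summary. A hedged sketch of a top-k variant (k is a made-up parameter, and the snippet assumes it replaces the single-sentence selection inside the loop above):

# Hypothetical top-k variant: skip position 0 (the document itself) and
# keep the k closest sentences; note they come out in similarity order,
# not in their original document order
k = 3
topSentences = pandas.Index(suitCorpos)[sort[0]].values[1:k + 1]
summary = '。'.join(topSentences)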
I'm 阿羽, a humble coder who is still learning. Feedback and pointers are very welcome!