1. 题目
第 0006 题:你有一个目录,放了你一个月的日记,都是 txt,为了避免分词的问题,假设内容都是英文,请统计出你认为每篇日记最重要的词。
2. 扩展
- 利用堆来实现统计Top K的重要的词
- 实现可扩展的堆(支持节点可以为任意类型)
3. 实现堆
class MinHeap:
def __init__(self):
self.heap_list = [()]
self.size = 0
def compare(self, item1, item2):
if item1[1] < item2[1]:
return 1
elif item1[1] == item2[1]:
return 0
else:
return -1
def flow_up(self, i):
while i // 2 > 0:
if self.compare(self.heap_list[i], self.heap_list[i//2]) < 0:
tmp = self.heap_list[i//2]
self.heap_list[i//2] = self.heap_list[i]
self.heap_list[i] = tmp
i //= 2
def insert(self, item):
self.heap_list.append(item)
self.size += 1
self.flow_up(self.size)
def flow_down(self, i):
while i*2 <= self.size:
min_child = self.get_min_child(i)
if self.compare(self.heap_list[i], self.heap_list[min_child]) > 0:
tmp = self.heap_list[min_child]
self.heap_list[min_child] = self.heap_list[i]
self.heap_list[i] = tmp
i = min_child
def get_min_child(self, i):
if i*2+1 > self.size:
return i*2
else:
if self.compare(self.heap_list[i*2], self.heap_list[i*2+1]) < 0:
return i*2
else:
return i*2+1
def pop_min(self):
min_item = self.heap_list[1]
self.heap_list[1] = self.heap_list[self.size]
self.size -= 1
self.heap_list.pop()
self.flow_down(1)
return min_item
def build_heap(self, word_dict):
i = len(word_dict) // 2
self.size = len(word_dict)
for item in word_dict.items():
self.heap_list.append(item)
while i > 0:
self.flow_down(i)
i -= 1
注意:当需要使用堆时,只需要继承这个类,重写compare()
和build_heap()
方法即可。
4. 实现选词
# -*- coding:utf-8 -*-
import re
import os
import os.path
from min_heap import MinHeap
def get_word_dic(file_path=None):
if file_path is None:
print("Error")
return
word_dict = {}
with open(file_path, "r", encoding="utf-8") as file:
for line in file.readlines():
words = re.findall(r"[a-z\'_-]+\b", line.lower())
for word in words:
if word not in word_dict:
word_dict[word] = 1
else:
word_dict[word] += 1
return word_dict
def get_top_k_words(word_dict, k):
result = []
dont_count = get_not_important_word_list("not-important-words.txt")
min_heap = MinHeap()
min_heap.build_heap(word_dict)
while k > 0:
item = min_heap.pop_min()
if item[0] not in dont_count:
result.append(item)
k -= 1
return result
def get_not_important_word_list(path):
with open(path, "r", encoding="utf-8") as file:
words = re.findall(r"[a-z\'_-]+", file.read().lower())
return words
def get_words_important_dict(dir_path):
if not os.path.isdir(dir_path):
print("plz input path")
return
files = [os.path.join(dir_path, x) for x in os.listdir(dir_path) if os.path.splitext(x)[1] == ".txt" and x != "not-important-words.txt"]
for file in files:
print("File: " + file)
word_dict = get_word_dic(file)
print(get_top_k_words(word_dict, 10))
if __name__ == "__main__":
get_words_important_dict(".")