#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import re
import os
import csv
# Path of the CSV index file
ALL_NEWS_CSV_PATH = "C:/Users/86177/Desktop/ALLNEWS数据/news_csv/allNews.csv" # path of the csv file
# Directory the source news .txt files are read from
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS数据/allNews/allNews'
# Save directory; one sub-directory per news category is created under it
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path):
    """Load one news file and cut its text into chunks at blank lines.

    :param news_path: path of the news text file
    :return: list of text chunks (the text split on two consecutive newlines)
    """
    with open(news_path, "r") as news_file:
        raw_text = news_file.read()
    # Other splitting rules (by date such as 2017.01.06, or by ordered
    # headings) are possible extensions; the default is a blank-line split.
    chunks = raw_text.split('\n\n')
    print(chunks)
    return chunks
def read_csv(csv_path):
    """Read the CSV index file into a list of rows.

    :param csv_path: path of the CSV index file
    :return: list of rows in file order, each row a list of string fields
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields reach the caller unchanged.
    with open(csv_path, "r", newline='') as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """Write one news item as a series of numbered text files.

    The title is stored in ``<news_name>-1.txt``; the content pieces follow
    in ``<news_name>-2.txt``, ``<news_name>-3.txt`` and so on.

    :param save_dir: directory the files are written to
    :param news_name: base name used for every output file
    :param news_title: title text of the news item
    :param content_item_list: content pieces, one output file each
    :return: None
    """
    title_path = os.path.join(save_dir, news_name + '-1.txt')
    with open(title_path, "w+") as out:
        out.write(news_title)
    for number, piece in enumerate(content_item_list, start=2):
        piece_name = news_name + '-' + str(number) + '.txt'
        with open(os.path.join(save_dir, piece_name), "w+") as out:
            out.write(piece)
def save_all_news(news_summary_list):
    """Split and store every news item listed in the CSV index rows.

    Column 0 of a row is the news name, column 1 the title and column 3 the
    category. Any error raised while handling a row is printed and the row
    is skipped.

    :param news_summary_list: rows of the CSV index
    :return: None
    """
    for row in news_summary_list:
        try:
            news_id, title, category = row[0], row[1], row[3]
            source_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_id + '.txt')
            pieces = read_news(source_path)
            target_dir = os.path.join(ALL_NEWS_SAVE_DIR, category)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            save_news(target_dir, news_id, title, pieces)
        except Exception as err:
            print(err)
def main():
    """Load the CSV index and split/save every news item it lists."""
    save_all_news(read_csv(ALL_NEWS_CSV_PATH))


if __name__ == '__main__':
    main()
# Revised version:
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import re
import os
import csv
# Path of the CSV index file
ALL_NEWS_CSV_PATH = "C:/Users/86177/Desktop/ALLNEWS数据/news_csv/allNews.csv" # path of the csv file
# Directory the source news .txt files are read from
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS数据/allNews/allNews'
# Save directory; one sub-directory per news category is created under it
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path):
    """Read a news file, trim it, and split the text at blank lines.

    :param news_path: path of the news text file
    :return: list of chunks taken from the whitespace-trimmed text
    """
    with open(news_path, "r") as news_file:
        raw_text = news_file.read()
    print(raw_text)
    # Splitting by date or by ordered headings are possible extensions; the
    # default is to trim the text and split on two consecutive newlines.
    trimmed = raw_text.strip()
    return trimmed.split('\n\n')
def read_csv(csv_path):
    """Read the CSV index file into a list of rows.

    :param csv_path: path of the CSV index file
    :return: list of rows in file order, each row a list of string fields
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields reach the caller unchanged.
    with open(csv_path, "r", newline='') as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """Store a news item: title in file 1, content pieces in files 2..n.

    :param save_dir: directory the files are written to
    :param news_name: base name used for every output file
    :param news_title: title text, written to ``<news_name>-1.txt``
    :param content_item_list: content pieces, written to
        ``<news_name>-2.txt``, ``<news_name>-3.txt``, ...
    :return: None
    """
    def _write(file_name, text):
        # Every output file is (re)created and filled with a single string.
        with open(os.path.join(save_dir, file_name), "w+") as out:
            out.write(text)

    _write(news_name + '-' + '1' + '.txt', news_title)
    for position, piece in enumerate(content_item_list):
        _write(news_name + '-' + str(position + 2) + '.txt', piece)
def save_all_news(news_summary_list):
    """Process each CSV index row: read the news file, split it, save it.

    Column 0 is the news name, column 1 the title, column 3 the category
    (used as the output sub-directory). Errors for a row are printed and
    processing moves on to the next row.

    :param news_summary_list: rows of the CSV index
    :return: None
    """
    for row in news_summary_list:
        try:
            name = row[0]
            title = row[1]
            category = row[3]
            pieces = read_news(os.path.join(ALL_NEWS_SOURCE_DIR, name + '.txt'))
            out_dir = os.path.join(ALL_NEWS_SAVE_DIR, category)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            save_news(out_dir, name, title, pieces)
        except Exception as err:
            print(err)
def main():
    """Load the CSV index and split/save every news item it lists."""
    save_all_news(read_csv(ALL_NEWS_CSV_PATH))


if __name__ == '__main__':
    main()
# Version 3: drop empty files and files shorter than 4 characters
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import os
import csv
# Path of the CSV index file
ALL_NEWS_CSV_PATH = "C:/Users/86177/Desktop/ALLNEWS数据/news_csv/allNews.csv" # path of the csv file
# Directory the source news .txt files are read from
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS数据/allNews/allNews'
# Save directory; one sub-directory per news category is created under it
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path):
    """Read a news file line by line and keep only the meaningful lines.

    Each line is stripped of surrounding whitespace; lines that are empty or
    shorter than 4 characters after stripping are discarded.

    :param news_path: path of the news text file
    :return: list of the kept, stripped lines in file order
    """
    with open(news_path, "r") as news_file:
        raw_lines = news_file.readlines()
    stripped_lines = (raw.strip() for raw in raw_lines)
    kept = [text for text in stripped_lines if len(text) >= 4]
    print(raw_lines)
    return kept
def read_csv(csv_path):
    """Read the CSV index file into a list of rows.

    :param csv_path: path of the CSV index file
    :return: list of rows in file order, each row a list of string fields
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields reach the caller unchanged.
    with open(csv_path, "r", newline='') as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """Write the title and each content piece of a news item to its own file.

    :param save_dir: directory the files are written to
    :param news_name: base name used for every output file
    :param news_title: title text, stored as ``<news_name>-1.txt``
    :param content_item_list: content pieces, stored as
        ``<news_name>-2.txt`` onwards
    :return: None
    """
    # File 1 carries the title; numbering of the content starts at 2.
    texts = [news_title]
    texts.extend(content_item_list)
    for number, text in enumerate(texts, start=1):
        target = os.path.join(save_dir, news_name + '-' + str(number) + '.txt')
        with open(target, "w+") as out:
            out.write(text)
def save_all_news(news_summary_list):
    """Walk the CSV index rows and save every referenced news item.

    For each row the news name (column 0), title (column 1) and category
    (column 3) are taken; the matching ``.txt`` file is read and its pieces
    saved under the category directory. A failing row is reported with
    ``print`` and skipped.

    :param news_summary_list: rows of the CSV index
    :return: None
    """
    for row in news_summary_list:
        try:
            news_id, title, category = row[0], row[1], row[3]
            pieces = read_news(os.path.join(ALL_NEWS_SOURCE_DIR, news_id + '.txt'))
            category_dir = os.path.join(ALL_NEWS_SAVE_DIR, category)
            if not os.path.exists(category_dir):
                os.makedirs(category_dir)
            save_news(category_dir, news_id, title, pieces)
        except Exception as err:
            print(err)
def main():
    """Load the CSV index and split/save every news item it lists."""
    save_all_news(read_csv(ALL_NEWS_CSV_PATH))


if __name__ == '__main__':
    main()
# Formatting clean-up version
#!/usr/bin/python3.6 # interpreter to use
# -*- coding: utf-8 -*- # source file encoding
import os
import csv
# Path of the CSV index file
ALL_NEWS_CSV_PATH = r"C:/Users/86177/Desktop/ALLNEWS数据/news_csv/allNews.csv" # path of the csv file
# Directory the source news .txt files are read from
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS数据/allNews/allNews'
# Save directory
ALL_NEWS_SAVE_DIR = r'D:/data'
def read_news(news_path):
    """Read a news text file and split its trimmed content at blank lines.

    :param news_path: path of the news text file
    :return: list of chunks from the whitespace-trimmed text
    """
    with open(news_path, 'r') as news_file:
        raw_text = news_file.read()
    print(raw_text)
    chunks = raw_text.strip().split('\n\n')
    return chunks
def read_csv(csv_path):
    """Read the CSV index file into a list of rows.

    :param csv_path: path of the CSV index file
    :return: list of rows in file order, each row a list of string fields
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields reach the caller unchanged.
    with open(csv_path, 'r', newline='') as fr:
        return list(csv.reader(fr))
def save_all_news(news_summary_list):
    """Split and store every news item listed in the CSV index rows.

    Column 0 of a row is the news name, column 1 the title and column 3 the
    category. The news file ``<name>.txt`` is read from
    ``ALL_NEWS_SOURCE_DIR`` and its pieces are written below
    ``ALL_NEWS_SAVE_DIR/<category>``.

    :param news_summary_list: rows of the CSV index
    :return: None
    """
    for item in news_summary_list:
        try:
            news_name = item[0]
            news_title = item[1]
            news_category = item[3]
            news_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_name + '.txt')
            content_item_list = read_news(news_path)
            # Output belongs under the save directory, not next to the
            # source files.
            dst_save_dir = os.path.join(ALL_NEWS_SAVE_DIR, news_category)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists test.
            os.makedirs(dst_save_dir, exist_ok=True)
            save_news(dst_save_dir, news_name, news_title, content_item_list)
        except Exception as err:
            # Best effort: report the failing row and carry on with the rest.
            print(err)
def save_news(save_dir, news_name, news_title, content_item_list):
    """Save one news item as numbered text files inside ``save_dir``.

    The title is written to ``<news_name>-1.txt`` and the content pieces to
    ``<news_name>-2.txt``, ``<news_name>-3.txt``, ... in list order.

    :param save_dir: existing directory the files are written to
    :param news_name: base name used for every output file
    :param news_title: title text of the news item
    :param content_item_list: content pieces, one output file each
    :return: None
    """
    save_news_path_title = os.path.join(save_dir, news_name + '-1.txt')
    # Open the joined path (not the bare file name) so the title file lands
    # in save_dir instead of the current working directory.
    with open(save_news_path_title, "w") as fw:
        fw.write(news_title)
    for number, item in enumerate(content_item_list, start=2):
        save_news_name_content = news_name + '-' + str(number) + '.txt'
        save_news_path_content = os.path.join(save_dir, save_news_name_content)
        with open(save_news_path_content, "w") as fw:
            fw.write(item)
def main():
    """Load the CSV index and split/save every news item it lists."""
    save_all_news(read_csv(ALL_NEWS_CSV_PATH))


if __name__ == '__main__':
    main()
# Version 4
# Example output file name: 358543.txt_0001
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
import os
import csv
import glob
# Path of the CSV index file
ALL_NEWS_CSV_PATH = r"C:/Users/86177/Desktop/ALLNEWS数据/news_csv/allNews.csv" # path of the csv file
# Directory the source news .txt files are read from
ALL_NEWS_SOURCE_DIR = r'C:/Users/86177/Desktop/ALLNEWS数据/allNews/allNews'
# Save directory; one sub-directory per news category is created under it
ALL_NEWS_SAVE_DIR = r'D:/data003'
def read_news(news_path, min_length=8):
    """Read a news file line by line and keep only the meaningful lines.

    Every line is stripped of surrounding whitespace. Empty lines and lines
    shorter than ``min_length`` characters after stripping are dropped.

    :param news_path: path of the news text file
    :param min_length: minimum stripped length a line needs to be kept
        (defaults to 8)
    :return: list of the kept, stripped lines in file order
    """
    with open(news_path, "r") as fr:
        content = fr.readlines()
    # Debug output of the raw lines.
    print(content)
    stripped_lines = (line.strip() for line in content)
    # Empty lines are always skipped, whatever min_length is.
    return [line for line in stripped_lines if line and len(line) >= min_length]
def read_csv(csv_path):
    """Read the CSV index file into a list of rows.

    :param csv_path: path of the CSV index file
    :return: list of rows in file order, each row a list of string fields
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields reach the caller unchanged.
    with open(csv_path, "r", newline='') as fr:
        return list(csv.reader(fr))
def save_news(save_dir, news_name, news_title, content_item_list):
    """Save one news item as files named ``<news_name>.txt_NNNN``.

    The title goes to ``<news_name>.txt_0001``; the content pieces follow as
    ``_0002``, ``_0003``, ... The number is zero-padded to at least four
    digits and simply grows wider beyond 9999.

    :param save_dir: existing directory the files are written to
    :param news_name: base name used for every output file
    :param news_title: title text of the news item
    :param content_item_list: content pieces, one output file each
    :return: None
    """
    prefix = news_name + '.txt' + '_'
    with open(os.path.join(save_dir, prefix + '0001'), "w") as fw:
        fw.write(news_title)
    for number, item in enumerate(content_item_list, start=2):
        # zfill pads to four digits and leaves longer numbers untouched,
        # which matches the previous per-length if/elif chain.
        save_news_path_content = os.path.join(save_dir, prefix + str(number).zfill(4))
        with open(save_news_path_content, "w") as fw:
            fw.write(item)
def save_all_news(news_summary_list, max_files=1000):
    """Split and store the news items listed in the first CSV index rows.

    Column 0 of a row is the news name, column 1 the title and column 3 the
    category. The news file ``<name>.txt`` is read from
    ``ALL_NEWS_SOURCE_DIR`` and its pieces are written below
    ``ALL_NEWS_SAVE_DIR/<category>``.

    :param news_summary_list: rows of the CSV index
    :param max_files: process at most this many leading rows (default 1000);
        pass ``None`` to process every row
    :return: None
    """
    # The row limit is applied once, up front. list.append returns None, so
    # a loop condition built on len(p.append(...)) can only raise TypeError.
    if max_files is None:
        rows = news_summary_list
    else:
        rows = news_summary_list[:max_files]
    for item in rows:
        try:
            news_name = item[0]
            news_title = item[1]
            news_category = item[3]
            news_path = os.path.join(ALL_NEWS_SOURCE_DIR, news_name + '.txt')
            content_item_list = read_news(news_path)
            dst_save_dir = os.path.join(ALL_NEWS_SAVE_DIR, news_category)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists test.
            os.makedirs(dst_save_dir, exist_ok=True)
            save_news(dst_save_dir, news_name, news_title, content_item_list)
        except Exception as err:
            # Best effort: report the failing row and carry on with the rest.
            print(err)
def main():
    """Load the CSV index and split/save the news items it lists."""
    save_all_news(read_csv(ALL_NEWS_CSV_PATH))


if __name__ == '__main__':
    main()