需求:将文件夹低下的excel表格导入到数据库(大概70个文件夹,每个excel有30多个sheet)
excel格式如下
1.使用xlrd模块,读取excel文件
import xlrd
import pymysql
filename = r'F:\资料\非主营业务\非主营业务73.xlsx'
rbook = xlrd.open_workbook(filename)
names = rbook.sheet_names()
2.提取需要的字段(code,name),code的第一位为6,在后面加SH(上海);code的第一位为0或者3,在后面加SZ(深圳)
#股票编码
code = cod[0:6]
if code[0] == '6':
code += '.SH'
elif code[0] in ['3', '0']:
code += '.SZ'
else:
continue
#公司姓名
name = cod[6:]
3.读取每一个表,将报错的sheet名字,类型写入txt文件
sheet = rbook.sheet_by_name(cod)
unit = sheet.row_values(0)[0]
if unit == 0:
with open('error.txt', 'a')as A:
A.write('第一个单元格为0:' + cod + '\n')
continue
if unit == '不存在' or unit == '不适用':
print(cod, '不存在####################')
continue
if unit == '单位可疑':
with open('error.txt', 'a')as A:
A.write('单位可疑:' + cod + '\n')
continue
#但第三行第一单元格不为空 和 等于项目的时候跳出
if sheet.row_values(2)[0] != '' and sheet.row_values(2)[0] != '项目':
with open('error.txt', 'a')as A:
A.write('错位:' + cod + '\n')
continue
4.获取每个表格行数sheet.nrows,并对表格的内容进行清洗
for i in range(sheet.nrows):
data = sheet.row_values(i)
#因为存在空值,把第四行第二单元格为金额拿出来
if i > 2 and sheet.row_values(3)[1] == '金额':
with open('error.txt', 'a')as A:
A.write('第四行异常:' + cod + '\n')
continue
for aa in range(len(data)):
data[aa] = str(data[aa])
data[aa] = data[aa].replace('\n', '')
data[aa] = data[aa].replace(' ', '')
data[aa] = data[aa].replace(',', '')
data[aa] = data[aa].replace('--', '')
data[aa] = data[aa].replace('否否', '否')
data[aa] = data[aa].replace('——', '')
data[aa] = data[aa].replace('—', '')
data[aa] = data[aa].replace('"', '')
data[aa] = data[aa].replace('不可持续不可持续', '不可持续')
data[aa] = ''.join(data[aa].split())
#data[aa] = data[aa][:data[aa].find('.', data[aa].find('.') + 1)]
lst.append(data)
5.设置每个表格的表头,并于需要的内容关联起来,使用的是zip,以dict字典的形式。
keyTitle = ['Keyword', 'Amount', 'total_profit', 'reason', 'Persistent']
lsts = []
for da in lst[2::]:
data_dic = zip(keyTitle, da)
lsts.append(dict(data_dic))
print(keyTitle)
6.最后插入数据库
sql = 'insert into business (date,code,name,Keyword,Amount,total_profit,reason,Persistent,unit) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(date, code, name, Keyword, Amount, total_profit, reason, Persistent, unit)
cursor.execute(sql)
db.commit()
完整代码:
#excel表格导入数据库
import xlrd
import pymysql
import time
from xlrd import xldate_as_tuple
from datetime import datetime
from time import sleep
import os
import re
str_dic = {
"Keyword": '关键字',
"Amount": '金额',
"total_profit": '占利润总额比例',
'reason': '形成原因说明',
'Persistent': '是否具有可持续性',
}
keyTitle = ['Keyword', 'Amount', 'total_profit', 'reason', 'Persistent']
# 总净资产利润营业收入
if __name__ == '__main__':
date = '2019-12-31'
db = pymysql.connect(host='192.168.0.147', port=3306, user='root', password='123', db='test')
cursor = db.cursor()
filename = r'F:\资料\非主营业务\非主营业务73.xlsx'
rbook = xlrd.open_workbook(filename)
names = rbook.sheet_names()
titles = []
for cod in rbook.sheet_names():
#在编号名称后面加SH,SZ
code = cod[0:6]
if code[0] == '6':
code += '.SH'
elif code[0] in ['3', '0']:
code += '.SZ'
else:
continue
#公司姓名
name = cod[6:]
sheet = rbook.sheet_by_name(cod)
unit = sheet.row_values(0)[0]
if unit == 0:
with open('error.txt', 'a')as A:
A.write('第一个单元格为0:' + cod + '\n')
continue
if unit == '不存在' or unit == '不适用':
print(cod, '不存在####################')
continue
if unit == '单位可疑':
with open('error.txt', 'a')as A:
A.write('单位可疑:' + cod + '\n')
continue
#但第三行第一单元格不为空 和 等于项目的时候跳出
if sheet.row_values(2)[0] != '' and sheet.row_values(2)[0] != '项目':
with open('error.txt', 'a')as A:
A.write('错位:' + cod + '\n')
continue
lst = []
for i in range(sheet.nrows):
data = sheet.row_values(i)
#因为存在空值,把第四行第二单元格为金额拿出来
if i > 2 and sheet.row_values(3)[1] == '金额':
with open('error.txt', 'a')as A:
A.write('第四行异常:' + cod + '\n')
continue
for aa in range(len(data)):
data[aa] = str(data[aa])
data[aa] = data[aa].replace('\n', '')
data[aa] = data[aa].replace(' ', '')
data[aa] = data[aa].replace(',', '')
data[aa] = data[aa].replace('--', '')
data[aa] = data[aa].replace('否否', '否')
data[aa] = data[aa].replace('——', '')
data[aa] = data[aa].replace('—', '')
data[aa] = data[aa].replace('"', '')
data[aa] = data[aa].replace('不可持续不可持续', '不可持续')
data[aa] = ''.join(data[aa].split())
#data[aa] = data[aa][:data[aa].find('.', data[aa].find('.') + 1)]
lst.append(data)
lsts = []
for da in lst[2::]:
data_dic = zip(keyTitle, da)
lsts.append(dict(data_dic))
print(keyTitle)
for i in lsts[1::]:
# print(i)
Keyword = i.get('Keyword')
if not Keyword:
Keyword = ''
Amount = i.get('Amount')
if Keyword[-11:] == '(损失以“-”号填列)':
Keyword = Keyword[:len(Keyword) - 11]
if Keyword[-5:] == '益资产减值':
Keyword = Keyword[:len(Keyword) - 4]
if Keyword[-3:] == '(注)':
Keyword = Keyword[:len(Keyword) - 3]
if not Amount:
Amount = ''
if Amount.count('.') > 1:
with open('error.txt', 'a')as A:
A.write('多个小数点:' + cod + '\n')
continue
if Amount == '-':
Amount = ''
#才用正则,把数字后面文字去掉
#Amount=''.join(re.findall(r"\d+\.?\d*",Amount))
total_profit = i.get('total_profit')
if not total_profit:
total_profit = ''
if total_profit.count('%') > 1:
with open('error.txt', 'a')as A:
A.write('多个百分号:' + cod + '\n')
continue
if total_profit == '-':
total_profit = ''
# 才用正则,把数字后面文字去掉,再加上%,把等于%的符号替换为空
#total_profit = ''.join(re.findall(r"\d+\.?\d*", total_profit))+'%'
#if total_profit == '%':
# total_profit = ''
reason = i.get('reason')
if not reason:
reason = ''
if reason == '-':
reason = ''
Persistent = i.get('Persistent')
if not Persistent:
Persistent = ''
if Persistent == '-':
Persistent = ''
print(Keyword, Amount, total_profit, reason, Persistent)
# db.ping()
sql = 'insert into business (date,code,name,Keyword,Amount,total_profit,reason,Persistent,unit) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(
date, code, name, Keyword, Amount, total_profit, reason, Persistent, unit
)
cursor.execute(sql)
db.commit()
db.close()