将excel数据导入数据库

需求:将文件夹低下的excel表格导入到数据库（大概70个文件夹，每个excel有30多个sheet）
excel格式如下

非主营业务.png

1.使用xlrd模块，读取excel文件

import xlrd
import pymysql
filename = r'F:\资料\非主营业务\非主营业务73.xlsx'
rbook = xlrd.open_workbook(filename)
names = rbook.sheet_names()

2.提取需要的字段(code,name)，code的第一位为6，在后面加SH(上海);code的第一位为0或者3，在后面加SZ(深圳)

#股票编码
code = cod[0:6]
 if code[0] == '6':
      code += '.SH'
elif code[0] in ['3', '0']:
        code += '.SZ'
else:
        continue
#公司姓名
name = cod[6:]

3.读取每一个表，将报错的sheet名字，类型写入txt文件

sheet = rbook.sheet_by_name(cod)
unit = sheet.row_values(0)[0]
        if unit == 0:
            with open('error.txt', 'a')as A:
                A.write('第一个单元格为0:' + cod + '\n')
            continue
        if unit == '不存在' or unit == '不适用':
            print(cod, '不存在####################')
            continue
        if unit == '单位可疑':
            with open('error.txt', 'a')as A:
                A.write('单位可疑:' + cod + '\n')
            continue
        #但第三行第一单元格不为空 和 等于项目的时候跳出
        if sheet.row_values(2)[0] != '' and sheet.row_values(2)[0] != '项目':
            with open('error.txt', 'a')as A:
                A.write('错位:' + cod + '\n')
            continue

4.获取每个表格行数sheet.nrows，并对表格的内容进行清洗

for i in range(sheet.nrows):
     data = sheet.row_values(i)
     #因为存在空值，把第四行第二单元格为金额拿出来
     if i > 2 and sheet.row_values(3)[1] == '金额':
          with open('error.txt', 'a')as A:
                A.write('第四行异常:' + cod + '\n')
                continue
      for aa in range(len(data)):
                data[aa] = str(data[aa])
                data[aa] = data[aa].replace('\n', '')
                data[aa] = data[aa].replace(' ', '')
                data[aa] = data[aa].replace(',', '')
                data[aa] = data[aa].replace('--', '')
                data[aa] = data[aa].replace('否否', '否')
                data[aa] = data[aa].replace('——', '')
                data[aa] = data[aa].replace('—', '')
                data[aa] = data[aa].replace('"', '')
                data[aa] = data[aa].replace('不可持续不可持续', '不可持续')
                data[aa] = ''.join(data[aa].split())
                #data[aa] = data[aa][:data[aa].find('.', data[aa].find('.') + 1)]
        lst.append(data)

5.设置每个表格的表头，并于需要的内容关联起来，使用的是zip，以dict字典的形式。

keyTitle = ['Keyword', 'Amount', 'total_profit', 'reason', 'Persistent']
lsts = []
for da in lst[2::]:
       data_dic = zip(keyTitle, da)
        lsts.append(dict(data_dic))
print(keyTitle)

6.最后插入数据库

sql = 'insert into business (date,code,name,Keyword,Amount,total_profit,reason,Persistent,unit) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(date, code, name, Keyword, Amount, total_profit, reason, Persistent, unit)
cursor.execute(sql)
db.commit()

完整代码：


#excel表格导入数据库
import xlrd
import pymysql
import time
from xlrd import xldate_as_tuple
from datetime import datetime
from time import sleep
import os
import re


str_dic = {
    "Keyword": '关键字',
    "Amount": '金额',
    "total_profit": '占利润总额比例',
    'reason': '形成原因说明',
    'Persistent': '是否具有可持续性',
}
keyTitle = ['Keyword', 'Amount', 'total_profit', 'reason', 'Persistent']
# 总净资产利润营业收入
if __name__ == '__main__':
    date = '2019-12-31'
    db = pymysql.connect(host='192.168.0.147', port=3306, user='root', password='123', db='test')
    cursor = db.cursor()
    filename = r'F:\资料\非主营业务\非主营业务73.xlsx'
    rbook = xlrd.open_workbook(filename)
    names = rbook.sheet_names()
    titles = []
    for cod in rbook.sheet_names():
        #在编号名称后面加SH,SZ
        code = cod[0:6]
        if code[0] == '6':
            code += '.SH'
        elif code[0] in ['3', '0']:
            code += '.SZ'
        else:
            continue
        #公司姓名
        name = cod[6:]

        sheet = rbook.sheet_by_name(cod)
        unit = sheet.row_values(0)[0]
        if unit == 0:
            with open('error.txt', 'a')as A:
                A.write('第一个单元格为0:' + cod + '\n')
            continue
        if unit == '不存在' or unit == '不适用':
            print(cod, '不存在####################')
            continue
        if unit == '单位可疑':
            with open('error.txt', 'a')as A:
                A.write('单位可疑:' + cod + '\n')
            continue
        #但第三行第一单元格不为空 和 等于项目的时候跳出
        if sheet.row_values(2)[0] != '' and sheet.row_values(2)[0] != '项目':
            with open('error.txt', 'a')as A:
                A.write('错位:' + cod + '\n')
            continue

        lst = []
        for i in range(sheet.nrows):
            data = sheet.row_values(i)
            #因为存在空值，把第四行第二单元格为金额拿出来
            if i > 2 and sheet.row_values(3)[1] == '金额':
                with open('error.txt', 'a')as A:
                    A.write('第四行异常:' + cod + '\n')
                continue
            for aa in range(len(data)):
                data[aa] = str(data[aa])
                data[aa] = data[aa].replace('\n', '')
                data[aa] = data[aa].replace(' ', '')
                data[aa] = data[aa].replace(',', '')
                data[aa] = data[aa].replace('--', '')
                data[aa] = data[aa].replace('否否', '否')
                data[aa] = data[aa].replace('——', '')
                data[aa] = data[aa].replace('—', '')
                data[aa] = data[aa].replace('"', '')
                data[aa] = data[aa].replace('不可持续不可持续', '不可持续')
                data[aa] = ''.join(data[aa].split())
                #data[aa] = data[aa][:data[aa].find('.', data[aa].find('.') + 1)]


            lst.append(data)


        lsts = []
        for da in lst[2::]:
            data_dic = zip(keyTitle, da)
            lsts.append(dict(data_dic))
        print(keyTitle)

        for i in lsts[1::]:
            # print(i)
            Keyword = i.get('Keyword')
            if not Keyword:
                Keyword = ''
            Amount = i.get('Amount')
            if Keyword[-11:] == '（损失以“-”号填列）':
                Keyword = Keyword[:len(Keyword) - 11]
            if Keyword[-5:] == '益资产减值':
                Keyword = Keyword[:len(Keyword) - 4]
            if Keyword[-3:] == '（注）':
                Keyword = Keyword[:len(Keyword) - 3]
            if not Amount:
                Amount = ''
            if Amount.count('.') > 1:
                with open('error.txt', 'a')as A:
                    A.write('多个小数点:' + cod + '\n')
                continue
            if  Amount == '-':
                Amount = ''
            #才用正则，把数字后面文字去掉
            #Amount=''.join(re.findall(r"\d+\.?\d*",Amount))
            total_profit = i.get('total_profit')
            if not total_profit:
                total_profit = ''
            if total_profit.count('%') > 1:
                with open('error.txt', 'a')as A:
                    A.write('多个百分号:' + cod + '\n')
                continue
            if  total_profit == '-':
                total_profit = ''
            # 才用正则，把数字后面文字去掉，再加上%，把等于%的符号替换为空
            #total_profit = ''.join(re.findall(r"\d+\.?\d*", total_profit))+'%'
            #if total_profit == '%':
            #    total_profit = ''
            reason = i.get('reason')
            if not reason:
                reason = ''
            if  reason == '-':
                reason = ''
            Persistent = i.get('Persistent')
            if not Persistent:
                Persistent = ''
            if  Persistent == '-':
                Persistent = ''
            print(Keyword, Amount, total_profit, reason, Persistent)
            # db.ping()
            sql = 'insert into business (date,code,name,Keyword,Amount,total_profit,reason,Persistent,unit) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(
                date, code, name, Keyword, Amount, total_profit, reason, Persistent, unit
            )
            cursor.execute(sql)
            db.commit()
    db.close()