一、原理
1、将PDF文档每页转换为图像
2、调用百度通用文本识别页面接口,对图像进行内容识别
3、对图像内容进行对比,并将对比不一致的内容在文档图像上进行标记(红框)
4、将对比结果以表格形式输出为HTML文件,以便查看差异
二、范围和限制
1、目前仅支持PDF文档之间的对比
2、无法识别图形(盖章和logo)、不清晰字迹
3、需要联网使用(OCR使用的是百度通用文本识别接口,仅限测试使用,暂不限次数)
4、对比存在误差(原因为百度OCR识别无法达到100%准确)
三、安装库
pip install pymupdf
pip install requests
pip install pillow
四、参数
originPDF: PDF文档原件路径
contrastPDF: PDF文档扫描件路径
resultRoot: 输出结果路径(提示:程序运行时会先删除并重建该目录,其中原有文件将全部丢失,请不要设置为桌面等存有其他文件的目录)
输出 : 标注差异的文档图像、Html文档
五、源码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = '孙思锴'
import os
import shutil
import fitz
import difflib
from datetime import datetime
import base64
from PIL import Image
from PIL import ImageDraw
import requests
from concurrent.futures import ThreadPoolExecutor
# One HTTP session shared by every OCR request made by this script.
session = requests.session()

# Per-page text that did not match, keyed by 1-based page number.
originDic = dict()  # text left over from the original document
contrastDic = dict()  # text left over from the scanned document

# Endpoint of the Baidu text-recognition demo and the headers sent with it.
url = 'https://ai.baidu.com/aidemo'
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.66 Safari/537.36"
    ),
}
def initRoot(rootPath):
    """
    Reset a directory so that it exists and is empty.

    :param rootPath: directory to reset; may be relative
    :return: absolute path of the freshly created directory
    """
    absoluteRoot = os.path.abspath(rootPath)
    alreadyPresent = os.path.exists(absoluteRoot)
    if alreadyPresent:
        # Drop the previous run's output, including everything below it.
        shutil.rmtree(absoluteRoot)
    os.makedirs(absoluteRoot)
    return absoluteRoot
def conver_img(pdfFilepath, outputPath):
    """
    Render every page of a PDF document to a PNG image.

    The images are written to ``<outputPath>/<PDF file name without
    extension>/`` and named ``0.png``, ``1.png``, ... after the zero-based
    page index. A directory of that name left over from an earlier run is
    deleted first.

    :param pdfFilepath: path of the PDF document
    :param outputPath: directory under which the image directory is created
    :return: (page count, image directory) tuple
    :raises SystemExit: with status 1 when the PDF file does not exist
    """
    pdfFilepath = os.path.abspath(pdfFilepath)
    if not os.path.exists(pdfFilepath):
        print('文件不存在:', pdfFilepath)
        # A missing input is a failure, so do not report success (status 0).
        raise SystemExit(1)

    # The image directory is named after the PDF file, extension dropped.
    pdfNamePath, _extension = os.path.splitext(os.path.basename(pdfFilepath))
    ImagePath = os.path.join(outputPath, pdfNamePath)
    if os.path.exists(ImagePath):
        shutil.rmtree(ImagePath)
    os.makedirs(ImagePath)

    doc = fitz.open(pdfFilepath)
    try:
        # Newer PyMuPDF releases expose snake_case names and no longer ship
        # the camelCase ones used originally; prefer the new name and fall
        # back to the old one so both generations of the library work.
        pageCount = doc.page_count if hasattr(doc, 'page_count') else doc.pageCount
        # Zoom factor 2 on both axes yields four times as many pixels.
        trans = fitz.Matrix(2.0, 2.0)
        for page_index in range(pageCount):
            page = doc[page_index]
            render = getattr(page, 'get_pixmap', None) or page.getPixmap
            pm = render(matrix=trans, alpha=False)
            save = getattr(pm, 'save', None) or pm.writePNG
            save(os.path.join(ImagePath, str(page_index) + '.png'))
    finally:
        # Release the file handle even when rendering fails.
        doc.close()
    return pageCount, ImagePath
def getImageInfo(filename):
    """
    Recognise the text of an image through the Baidu OCR demo endpoint
    (the request asks for the "accurate" recognition type).

    1. Encode the image file as a base64 data URI.
    2. Post the form to the endpoint, retrying while it answers with
       errno 102 ("requests too frequent").
    3. Return the decoded JSON response.

    :param filename: path of the image file
    :return: decoded JSON response. When every attempt was answered with
        errno 102, the last such response is returned so that the caller can
        inspect ``errno``/``msg``. Example responses:
        {'errno': 102, 'msg': '请求Demo过于频繁', 'data': ''}
        {'errno': 106, 'msg': '文件类型错误', 'data': ''}
        {'errno': 0, 'msg': 'success', 'data': {'log_id': '9163508383702196122', 'words_result_num': 30, 'words_result': [{'location': {'width': 142, 'top': 87, 'left': 202, 'height': 41}, 'words': '发银行'}, {'location': {'width': 86, 'top': 106, 'left': 909, 'height': 28}, 'words': '保密协议'}]}}
    """
    import time  # local import keeps this function self-contained

    with open(filename, 'rb') as f:
        base64image = 'data:image/png;base64,' + base64.b64encode(f.read()).decode()
    dic = {
        "image": base64image,
        "image_url": "",
        "type": "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate",
        "detect_direction": "false",
    }
    result = None
    for attempt in range(5):
        if attempt:
            # Pause before retrying; hammering a rate-limited endpoint
            # only prolongs the "too frequent" answers.
            time.sleep(1)
        # The timeout keeps a stalled connection from blocking forever.
        result = session.post(url=url, headers=headers, data=dic, timeout=60).json()
        if result['errno'] != 102:
            break
    return result
def _ocr_words(filename):
    """Run OCR on one page image and return its list of word blocks."""
    result = getImageInfo(filename=filename)
    data = result.get('data') if isinstance(result, dict) else None
    if not isinstance(data, dict) or 'words_result' not in data:
        # Fail with the service's answer instead of an opaque TypeError.
        raise RuntimeError('OCR识别失败: {} -> {!r}'.format(filename, result))
    return data['words_result']


def _drop_matched_words(originWords, contrastWords, offset):
    """Remove, in place, the word blocks that both pages have in common."""
    # Iterate over copies because matched blocks are removed from the lists.
    for origin_words in originWords[:]:
        top = origin_words['location']['top']
        for contrast_words in contrastWords[:]:
            # Only the vertical position is compared: blocks whose tops are
            # less than ``offset`` pixels apart count as being in the same place.
            if abs(top - contrast_words['location']['top']) >= offset:
                continue
            if origin_words['words'] == contrast_words['words']:
                # Identical text: drop the block from both sides.
                contrastWords.remove(contrast_words)
                originWords.remove(origin_words)
                break
            if origin_words['words'] in contrast_words['words']:
                # The scanned block contains the original text plus more:
                # drop the original block and keep only the surplus text.
                originWords.remove(origin_words)
                contrast_words['words'] = contrast_words['words'].replace(
                    origin_words['words'], '', 1)
                break


def _mark_words(image, words):
    """Box every word block in red on the image; return their text, one per line."""
    draw = ImageDraw.ImageDraw(image)
    for item in words:
        location = item['location']
        left, top = location['left'], location['top']
        right, bottom = left + location['width'], top + location['height']
        draw.rectangle(((left, top), (right, bottom)), outline='red', width=2)
    return ''.join(item['words'] + '\n' for item in words)


def imageDiff(resultRoot, originFile, contrastFile, page=1):
    """
    Compare the recognised text of two page images and mark the differences.

    Word blocks found on both pages are discarded. The remaining blocks are
    boxed in red on their page image, their text is stored in
    ``originDic[page]`` / ``contrastDic[page]``, and the two annotated images
    are saved side by side as ``第<page>页文档.png`` in ``resultRoot``.

    :param resultRoot: output directory
    :param originFile: page image of the original document
    :param contrastFile: page image of the scanned document
    :param page: page number used as dictionary key and in the file name
    :return: None
    :raises RuntimeError: when the OCR response carries no word list
    """
    originWords = _ocr_words(originFile)
    contrastWords = _ocr_words(contrastFile)
    # Tolerance for position drift between the original and the scan.
    _drop_matched_words(originWords, contrastWords, offset=40)

    # The context managers close both image files once the result is saved.
    with Image.open(originFile) as originImage, Image.open(contrastFile) as contrastImage:
        originDic[page] = _mark_words(originImage, originWords)
        contrastDic[page] = _mark_words(contrastImage, contrastWords)

        # Paste both pages next to each other on a black canvas.
        originWidth, originHeight = originImage.size
        contrastWidth, contrastHeight = contrastImage.size
        new_Image = Image.new(
            'RGB',
            (originWidth + contrastWidth, max(originHeight, contrastHeight)),
            "#000000")
        new_Image.paste(originImage, (0, 0))
        new_Image.paste(contrastImage, (originWidth, 0))
        new_Image.save(os.path.join(resultRoot, "第" + str(page) + '页文档.png'))
if __name__ == '__main__':
    # Script entry point: convert both PDFs to page images, compare them
    # page by page in a thread pool, then write the textual diff as HTML.
    startTime = datetime.now()
    # Files to compare
    originPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-扫描件.pdf'  # original document
    contrastPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-改字.pdf'  # scanned document
    resultRoot = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\对比结果'  # output directory
    resultRoot = initRoot(resultRoot)  # empties the output directory
    originImageNum, originImagePath = conver_img(originPDF, resultRoot)
    contrastImageNum, contrastImagePath = conver_img(contrastPDF, resultRoot)
    if originImageNum != contrastImageNum:
        print('文档页数不一致!请查看', resultRoot)
        # A page-count mismatch is a failure, so exit with a non-zero status.
        raise SystemExit(1)
    resultRoot = os.path.join(resultRoot, '对比结果')  # directory for the comparison results
    os.makedirs(resultRoot)
    try:
        # Leaving the ``with`` block waits for every submitted comparison.
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(
                    imageDiff,
                    resultRoot,
                    os.path.join(originImagePath, str(i) + '.png'),
                    os.path.join(contrastImagePath, str(i) + '.png'),
                    i + 1,
                )
                for i in range(originImageNum)
            ]
        # Re-raise the first worker exception here; unchecked futures would
        # hide it and the run would fail later with a misleading KeyError.
        for future in futures:
            future.result()
    finally:
        session.close()
    # Write the textual differences to an HTML file; each page is appended
    # as the output of its own ``make_file`` call.
    diff = difflib.HtmlDiff()
    with open(os.path.join(resultRoot, '结果.html'), 'w', encoding="utf-8") as f:
        for i in range(originImageNum):
            make_content = diff.make_file(fromlines=originDic[i + 1].splitlines(),
                                          tolines=contrastDic[i + 1].splitlines(),
                                          fromdesc='原件第' + str(i + 1) + '页',
                                          todesc='扫描件第' + str(i + 1) + '页')
            f.write(make_content)
    endTime = datetime.now()
    print('文档共', originImageNum, '页,执行总时间:', endTime - startTime)
    print('执行成功,请查看输出目录:', resultRoot)
六、执行结果示例: