Python/普康健康发票识别/main.py

306 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
普康健康_发票识别
"""
# 加载模块
import json
from pathlib import Path
import shutil
import pandas
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from 普康健康发票查验.main import image_compression
from utils.pandas_extension import traverse_directory, save_as_workbook
from utils.client import restrict, HTTPClient, RequestException, Authenticator
if __name__ == "__main__":
print("正在基于深圳快瞳的增值税发票识别接口批量识别")
match input(
"请选择获取待识别发票影像件方式(1:遍历目录通过影像件路径,其它任意字符:退出脚本):"
):
case "1":
print("正在遍历目录...", end="")
dataframe = traverse_directory(
directory_path="待识别发票", suffixes=[".jpg", ".jpeg", ".png"]
)
# 修改列名相对路径为影像件地址
dataframe.rename(columns={"相对路径": "影像件地址"}, inplace=True)
# 添加索引
dataframe["索引"] = dataframe.apply(
lambda x: uuid.uuid4().hex, axis="columns"
)
case _:
print("选择退出脚本!")
exit(0)
# 统计待识别发票张数
rows = dataframe.shape[0]
# 若待识别发票张数为0则退出脚本
if rows == 0:
print("待识别发票张数为0退出脚本")
exit(0)
print(f"已完成,待识别发票张数为 {rows}")
match input(
"请选择是否就上一次的响应报文进行归档(0:不归档1:归档,其它任意字符:退出脚本):"
):
# 若不归档则不请求深圳快瞳的增值税发票识别接口
case "0":
pass
case "1":
print("正在归档响应报文...", end="")
# 创建响应报文目录路径对象
responses_path = Path("temporary/responses")
# 若响应报文目录路径不存在则创建
if not responses_path.exists():
responses_path.mkdir(parents=True, exist_ok=True)
# 创建归档响应报文目录路径对象
archives_path = Path("temporary/archives")
# 若归档响应报文目录路径不存在则创建
if not archives_path.exists():
archives_path.mkdir(parents=True, exist_ok=True)
# 遍历响应报文目录下所有文件名后缀为JSON的文件路径
for file_path in Path(responses_path).glob("*.json"):
# 若文件路径为文件
if file_path.is_file():
# 移动响应报文由响应报文目录至归档响应报文目录
shutil.move(str(file_path), str(archives_path / file_path.name))
print("已完成")
# 创建深圳快瞳获取访问令牌方法
authenticator = Authenticator(servicer="szkt")
# 初始化请求客户端
http_client = HTTPClient()
@restrict(refill_rate=5, max_tokens=5)
def szkt_request(
image_index,
image_path=None,
):
try:
# 创建影像件路径对象
image_path = Path(image_path)
# 影像件文件名称后缀
image_format = image_path.suffix.strip().lower()
# 根据影像件路径读取图像数据(二进制)
with open(image_path, "rb") as image:
image_data = image.read()
# 标准化影像件格式
# noinspection PyUnboundLocalVariable
image_format = image_format if image_format == ".png" else ".jpeg"
# noinspection PyUnboundLocalVariable
image_data_base64 = image_compression(image_format, image_data)
response = http_client.post(
# 增值税发票识别
url="https://ai.inspirvision.cn/s/api/ocr/vatInvoice",
# 用于和深圳快瞳联查时定位请求
headers={"X-RequestId-Header": image_index},
data={
"token": authenticator.get_token(),
"imgBase64": image_data_base64,
},
)
except RequestException as request_exception:
response = {
"status": request_exception.status_code,
"message": request_exception.message,
}
except Exception as exception:
response = {
"status": "90000",
"message": f"发生其它异常{exception}",
}
return image_index, response
# 用于记录已完成任务数
completed_futures = 0
# 创建线程池
with ThreadPoolExecutor(max_workers=5) as executor:
futures = [
executor.submit(
szkt_request,
image_index=row.索引,
image_path=row.影像件地址,
)
for row in dataframe[["索引", "影像件地址"]].itertuples(
index=False, name="row"
)
]
for future in as_completed(futures):
index, response = future.result()
# 保存报文
with open(
"temporary/responses/{}.json".format(index),
"w",
encoding="utf-8",
) as file:
json.dump(response, file, ensure_ascii=False)
completed_futures += 1
print(f"已完成 {completed_futures / rows * 100:.2f} %")
case _:
print("选择退出脚本!")
exit(0)
print("正在解析报文...", end="")
# 解析后数据体
dataframe_parsed = []
# 遍历报文所在目录
for path_object in list(Path("temporary/responses").glob("*.json")):
# 解析报文结构
parse = {
"索引": "",
"发票号码": "",
"小写金额": "",
"合计金额": "",
"合计税额": "",
"购买方名称": "",
"销售方名称": "",
}
try:
# 打开报文并JSON逆序列化
with open(path_object, "r", encoding="utf-8") as file:
response = json.load(file)
# 索引
parse["索引"] = path_object.stem
# 响应状态码
status_code = response.get("status", "")
# 错误码
code = response.get("code", "")
# 流水号
serial = response.get("serialNo", "")
# 若响应状态码为200且错误码为0则定义为响应成功
if status_code == 200 and code == 0:
for item in response.get("data", []):
# 根据DESC匹配字段
match item.get("desc"):
case "发票号码":
parse["发票号码"] = item.get("value", "")
case "小写金额":
parse["小写金额"] = item.get("value").replace("¥", "")
case "合计金额":
parse["合计金额"] = item.get("value").replace("¥", "")
case "合计税额":
parse["合计税额"] = item.get("value").replace("¥", "")
case "购买方名称":
parse["购买方名称"] = item.get("value", "")
case "销售方名称":
parse["销售方名称"] = item.get("value", "")
else:
raise Exception("解析报文发生其它异常")
except Exception as exception:
parse["发票号码"] = "{}".format(response.get("message"))
finally:
dataframe_parsed.append(parse)
dataframe_parsed = pandas.DataFrame(data=dataframe_parsed, dtype=str)
# 将解析数据集拼接至数据集
dataframe = dataframe.merge(right=dataframe_parsed, how="left", on=["索引"])
# 填补缺失值
dataframe = dataframe.fillna(value="")
print("已完成")
print("正在保存为工作簿...", end="")
save_as_workbook(worksheets=[("Sheet1", dataframe)], workbook_name="results.xlsx")
print("已完成")