306 lines
8.8 KiB
Python
306 lines
8.8 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
普康健康_发票识别
|
||
"""
|
||
|
||
# 加载模块
|
||
|
||
import json
|
||
|
||
from pathlib import Path
|
||
|
||
import shutil
|
||
|
||
import pandas
|
||
|
||
import uuid
|
||
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
||
from 普康健康发票查验.main import image_compression
|
||
|
||
from utils.pandas_extension import traverse_directory, save_as_workbook
|
||
|
||
from utils.client import restrict, HTTPClient, RequestException, Authenticator
|
||
|
||
|
||
if __name__ == "__main__":
|
||
|
||
print("正在基于深圳快瞳的增值税发票识别接口批量识别")
|
||
|
||
match input(
|
||
"请选择获取待识别发票影像件方式(1:遍历目录通过影像件路径,其它任意字符:退出脚本):"
|
||
):
|
||
|
||
case "1":
|
||
|
||
print("正在遍历目录...", end="")
|
||
|
||
dataframe = traverse_directory(
|
||
directory_path="待识别发票", suffixes=[".jpg", ".jpeg", ".png"]
|
||
)
|
||
|
||
# 修改列名相对路径为影像件地址
|
||
dataframe.rename(columns={"相对路径": "影像件地址"}, inplace=True)
|
||
|
||
# 添加索引
|
||
dataframe["索引"] = dataframe.apply(
|
||
lambda x: uuid.uuid4().hex, axis="columns"
|
||
)
|
||
|
||
case _:
|
||
|
||
print("选择退出脚本!")
|
||
|
||
exit(0)
|
||
|
||
# 统计待识别发票张数
|
||
rows = dataframe.shape[0]
|
||
|
||
# 若待识别发票张数为0则退出脚本
|
||
if rows == 0:
|
||
|
||
print("待识别发票张数为0,退出脚本")
|
||
|
||
exit(0)
|
||
|
||
print(f"已完成,待识别发票张数为 {rows}")
|
||
|
||
match input(
|
||
"请选择是否就上一次的响应报文进行归档(0:不归档,1:归档,其它任意字符:退出脚本):"
|
||
):
|
||
|
||
# 若不归档则不请求深圳快瞳的增值税发票识别接口
|
||
case "0":
|
||
|
||
pass
|
||
|
||
case "1":
|
||
|
||
print("正在归档响应报文...", end="")
|
||
|
||
# 创建响应报文目录路径对象
|
||
responses_path = Path("temporary/responses")
|
||
|
||
# 若响应报文目录路径不存在则创建
|
||
if not responses_path.exists():
|
||
|
||
responses_path.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 创建归档响应报文目录路径对象
|
||
archives_path = Path("temporary/archives")
|
||
|
||
# 若归档响应报文目录路径不存在则创建
|
||
if not archives_path.exists():
|
||
|
||
archives_path.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 遍历响应报文目录下所有文件名后缀为JSON的文件路径
|
||
for file_path in Path(responses_path).glob("*.json"):
|
||
|
||
# 若文件路径为文件
|
||
if file_path.is_file():
|
||
|
||
# 移动响应报文由响应报文目录至归档响应报文目录
|
||
shutil.move(str(file_path), str(archives_path / file_path.name))
|
||
|
||
print("已完成")
|
||
|
||
# 创建深圳快瞳获取访问令牌方法
|
||
authenticator = Authenticator(servicer="szkt")
|
||
|
||
# 初始化请求客户端
|
||
http_client = HTTPClient()
|
||
|
||
@restrict(refill_rate=5, max_tokens=5)
|
||
def szkt_request(
|
||
image_index,
|
||
image_path=None,
|
||
):
|
||
|
||
try:
|
||
|
||
# 创建影像件路径对象
|
||
image_path = Path(image_path)
|
||
|
||
# 影像件文件名称后缀
|
||
image_format = image_path.suffix.strip().lower()
|
||
|
||
# 根据影像件路径读取图像数据(二进制)
|
||
with open(image_path, "rb") as image:
|
||
|
||
image_data = image.read()
|
||
|
||
# 标准化影像件格式
|
||
# noinspection PyUnboundLocalVariable
|
||
image_format = image_format if image_format == ".png" else ".jpeg"
|
||
|
||
# noinspection PyUnboundLocalVariable
|
||
image_data_base64 = image_compression(image_format, image_data)
|
||
|
||
response = http_client.post(
|
||
# 增值税发票识别
|
||
url="https://ai.inspirvision.cn/s/api/ocr/vatInvoice",
|
||
# 用于和深圳快瞳联查时定位请求
|
||
headers={"X-RequestId-Header": image_index},
|
||
data={
|
||
"token": authenticator.get_token(),
|
||
"imgBase64": image_data_base64,
|
||
},
|
||
)
|
||
|
||
except RequestException as request_exception:
|
||
|
||
response = {
|
||
"status": request_exception.status_code,
|
||
"message": request_exception.message,
|
||
}
|
||
|
||
except Exception as exception:
|
||
|
||
response = {
|
||
"status": "90000",
|
||
"message": f"发生其它异常{exception}",
|
||
}
|
||
|
||
return image_index, response
|
||
|
||
# 用于记录已完成任务数
|
||
completed_futures = 0
|
||
|
||
# 创建线程池
|
||
with ThreadPoolExecutor(max_workers=5) as executor:
|
||
|
||
futures = [
|
||
executor.submit(
|
||
szkt_request,
|
||
image_index=row.索引,
|
||
image_path=row.影像件地址,
|
||
)
|
||
for row in dataframe[["索引", "影像件地址"]].itertuples(
|
||
index=False, name="row"
|
||
)
|
||
]
|
||
|
||
for future in as_completed(futures):
|
||
|
||
index, response = future.result()
|
||
|
||
# 保存报文
|
||
with open(
|
||
"temporary/responses/{}.json".format(index),
|
||
"w",
|
||
encoding="utf-8",
|
||
) as file:
|
||
|
||
json.dump(response, file, ensure_ascii=False)
|
||
|
||
completed_futures += 1
|
||
|
||
print(f"已完成 {completed_futures / rows * 100:.2f} %")
|
||
|
||
case _:
|
||
|
||
print("选择退出脚本!")
|
||
|
||
exit(0)
|
||
|
||
print("正在解析报文...", end="")
|
||
|
||
# 解析后数据体
|
||
dataframe_parsed = []
|
||
|
||
# 遍历报文所在目录
|
||
for path_object in list(Path("temporary/responses").glob("*.json")):
|
||
|
||
# 解析报文结构
|
||
parse = {
|
||
"索引": "",
|
||
"发票号码": "",
|
||
"小写金额": "",
|
||
"合计金额": "",
|
||
"合计税额": "",
|
||
"购买方名称": "",
|
||
"销售方名称": "",
|
||
}
|
||
|
||
try:
|
||
|
||
# 打开报文并JSON逆序列化
|
||
with open(path_object, "r", encoding="utf-8") as file:
|
||
|
||
response = json.load(file)
|
||
|
||
# 索引
|
||
parse["索引"] = path_object.stem
|
||
|
||
# 响应状态码
|
||
status_code = response.get("status", "")
|
||
|
||
# 错误码
|
||
code = response.get("code", "")
|
||
|
||
# 流水号
|
||
serial = response.get("serialNo", "")
|
||
|
||
# 若响应状态码为200且错误码为0,则定义为响应成功
|
||
if status_code == 200 and code == 0:
|
||
|
||
for item in response.get("data", []):
|
||
|
||
# 根据DESC匹配字段
|
||
match item.get("desc"):
|
||
|
||
case "发票号码":
|
||
|
||
parse["发票号码"] = item.get("value", "")
|
||
|
||
case "小写金额":
|
||
|
||
parse["小写金额"] = item.get("value").replace("¥", "")
|
||
|
||
case "合计金额":
|
||
|
||
parse["合计金额"] = item.get("value").replace("¥", "")
|
||
|
||
case "合计税额":
|
||
|
||
parse["合计税额"] = item.get("value").replace("¥", "")
|
||
|
||
case "购买方名称":
|
||
|
||
parse["购买方名称"] = item.get("value", "")
|
||
|
||
case "销售方名称":
|
||
|
||
parse["销售方名称"] = item.get("value", "")
|
||
|
||
else:
|
||
|
||
raise Exception("解析报文发生其它异常")
|
||
|
||
except Exception as exception:
|
||
|
||
parse["发票号码"] = "{}".format(response.get("message"))
|
||
|
||
finally:
|
||
|
||
dataframe_parsed.append(parse)
|
||
|
||
dataframe_parsed = pandas.DataFrame(data=dataframe_parsed, dtype=str)
|
||
|
||
# 将解析数据集拼接至数据集
|
||
dataframe = dataframe.merge(right=dataframe_parsed, how="left", on=["索引"])
|
||
|
||
# 填补缺失值
|
||
dataframe = dataframe.fillna(value="")
|
||
|
||
print("已完成")
|
||
|
||
print("正在保存为工作簿...", end="")
|
||
|
||
save_as_workbook(worksheets=[("Sheet1", dataframe)], workbook_name="results.xlsx")
|
||
|
||
print("已完成")
|