# -*- coding: utf-8 -*- """ 普康健康_发票识别 """ # 加载模块 import json from pathlib import Path import shutil import pandas import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from 普康健康发票查验.main import image_compression from utils.pandas_extension import traverse_directory, save_as_workbook from utils.client import restrict, HTTPClient, RequestException, Authenticator if __name__ == "__main__": print("正在基于深圳快瞳的增值税发票识别接口批量识别") match input( "请选择获取待识别发票影像件方式(1:遍历目录通过影像件路径,其它任意字符:退出脚本):" ): case "1": print("正在遍历目录...", end="") dataframe = traverse_directory( directory_path="待识别发票", suffixes=[".jpg", ".jpeg", ".png"] ) # 修改列名相对路径为影像件地址 dataframe.rename(columns={"相对路径": "影像件地址"}, inplace=True) # 添加索引 dataframe["索引"] = dataframe.apply( lambda x: uuid.uuid4().hex, axis="columns" ) case _: print("选择退出脚本!") exit(0) # 统计待识别发票张数 rows = dataframe.shape[0] # 若待识别发票张数为0则退出脚本 if rows == 0: print("待识别发票张数为0,退出脚本") exit(0) print(f"已完成,待识别发票张数为 {rows}") match input( "请选择是否就上一次的响应报文进行归档(0:不归档,1:归档,其它任意字符:退出脚本):" ): # 若不归档则不请求深圳快瞳的增值税发票识别接口 case "0": pass case "1": print("正在归档响应报文...", end="") # 创建响应报文目录路径对象 responses_path = Path("temporary/responses") # 若响应报文目录路径不存在则创建 if not responses_path.exists(): responses_path.mkdir(parents=True, exist_ok=True) # 创建归档响应报文目录路径对象 archives_path = Path("temporary/archives") # 若归档响应报文目录路径不存在则创建 if not archives_path.exists(): archives_path.mkdir(parents=True, exist_ok=True) # 遍历响应报文目录下所有文件名后缀为JSON的文件路径 for file_path in Path(responses_path).glob("*.json"): # 若文件路径为文件 if file_path.is_file(): # 移动响应报文由响应报文目录至归档响应报文目录 shutil.move(str(file_path), str(archives_path / file_path.name)) print("已完成") # 创建深圳快瞳获取访问令牌方法 authenticator = Authenticator(servicer="szkt") # 初始化请求客户端 http_client = HTTPClient() @restrict(refill_rate=5, max_tokens=5) def szkt_request( image_index, image_path=None, ): try: # 创建影像件路径对象 image_path = Path(image_path) # 影像件文件名称后缀 image_format = image_path.suffix.strip().lower() # 根据影像件路径读取图像数据(二进制) with open(image_path, "rb") as image: image_data = image.read() # 标准化影像件格式 # noinspection PyUnboundLocalVariable image_format = image_format if image_format == ".png" else ".jpeg" # noinspection PyUnboundLocalVariable image_data_base64 = image_compression(image_format, image_data) response = http_client.post( # 增值税发票识别 url="https://ai.inspirvision.cn/s/api/ocr/vatInvoice", # 用于和深圳快瞳联查时定位请求 headers={"X-RequestId-Header": image_index}, data={ "token": authenticator.get_token(), "imgBase64": image_data_base64, }, ) except RequestException as request_exception: response = { "status": request_exception.status_code, "message": request_exception.message, } except Exception as exception: response = { "status": "90000", "message": f"发生其它异常{exception}", } return image_index, response # 用于记录已完成任务数 completed_futures = 0 # 创建线程池 with ThreadPoolExecutor(max_workers=5) as executor: futures = [ executor.submit( szkt_request, image_index=row.索引, image_path=row.影像件地址, ) for row in dataframe[["索引", "影像件地址"]].itertuples( index=False, name="row" ) ] for future in as_completed(futures): index, response = future.result() # 保存报文 with open( "temporary/responses/{}.json".format(index), "w", encoding="utf-8", ) as file: json.dump(response, file, ensure_ascii=False) completed_futures += 1 print(f"已完成 {completed_futures / rows * 100:.2f} %") case _: print("选择退出脚本!") exit(0) print("正在解析报文...", end="") # 解析后数据体 dataframe_parsed = [] # 遍历报文所在目录 for path_object in list(Path("temporary/responses").glob("*.json")): # 解析报文结构 parse = { "索引": "", "发票号码": "", "小写金额": "", "合计金额": "", "合计税额": "", "购买方名称": "", "销售方名称": "", } try: # 打开报文并JSON逆序列化 with open(path_object, "r", encoding="utf-8") as file: response = json.load(file) # 索引 parse["索引"] = path_object.stem # 响应状态码 status_code = response.get("status", "") # 错误码 code = response.get("code", "") # 流水号 serial = response.get("serialNo", "") # 若响应状态码为200且错误码为0,则定义为响应成功 if status_code == 200 and code == 0: for item in response.get("data", []): # 根据DESC匹配字段 match item.get("desc"): case "发票号码": parse["发票号码"] = item.get("value", "") case "小写金额": parse["小写金额"] = item.get("value").replace("¥", "") case "合计金额": parse["合计金额"] = item.get("value").replace("¥", "") case "合计税额": parse["合计税额"] = item.get("value").replace("¥", "") case "购买方名称": parse["购买方名称"] = item.get("value", "") case "销售方名称": parse["销售方名称"] = item.get("value", "") else: raise Exception("解析报文发生其它异常") except Exception as exception: parse["发票号码"] = "{}".format(response.get("message")) finally: dataframe_parsed.append(parse) dataframe_parsed = pandas.DataFrame(data=dataframe_parsed, dtype=str) # 将解析数据集拼接至数据集 dataframe = dataframe.merge(right=dataframe_parsed, how="left", on=["索引"]) # 填补缺失值 dataframe = dataframe.fillna(value="") print("已完成") print("正在保存为工作簿...", end="") save_as_workbook(worksheets=[("Sheet1", dataframe)], workbook_name="results.xlsx") print("已完成")