306 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			306 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			Python
		
	
	
	
| # -*- coding: utf-8 -*-
 | ||
| 
 | ||
| """
 | ||
| 普康健康_发票识别
 | ||
| """
 | ||
| 
 | ||
| # 加载模块
 | ||
| 
 | ||
| import json
 | ||
| 
 | ||
| from pathlib import Path
 | ||
| 
 | ||
| import shutil
 | ||
| 
 | ||
| import pandas
 | ||
| 
 | ||
| import uuid
 | ||
| 
 | ||
| from concurrent.futures import ThreadPoolExecutor, as_completed
 | ||
| 
 | ||
| from 普康健康发票查验.main import image_compression
 | ||
| 
 | ||
| from utils.pandas_extension import traverse_directory, save_as_workbook
 | ||
| 
 | ||
| from utils.client import restrict, HTTPClient, RequestException, Authenticator
 | ||
| 
 | ||
| 
 | ||
# Directory holding the responses of the current run, one "<index>.json" per invoice.
RESPONSES_DIRECTORY = Path("temporary/responses")

# Directory that responses of earlier runs are moved into before a new run.
ARCHIVES_DIRECTORY = Path("temporary/archives")

# Column order of one parsed record; "索引" is the key used to merge back onto the image list.
PARSED_COLUMNS = (
    "索引",
    "发票号码",
    "小写金额",
    "合计金额",
    "合计税额",
    "购买方名称",
    "销售方名称",
)

# Fields whose value is an amount: the currency sign is stripped from them.
AMOUNT_FIELDS = ("小写金额", "合计金额", "合计税额")

# Fields copied from the response as they are.
TEXT_FIELDS = ("发票号码", "购买方名称", "销售方名称")

# Text reported in the invoice-number column when a response cannot be parsed.
PARSE_FAILURE_MESSAGE = "解析报文发生其它异常"


def parse_response(index, response):
    """Turn one decoded response into a flat record.

    Args:
        index: Index of the invoice image; stored under "索引".
        response: Decoded JSON response. A response counts as successful when
            its "status" is 200 and its "code" is 0.

    Returns:
        A dict with exactly the keys of ``PARSED_COLUMNS``. For a successful
        response the fields are filled from the entries of "data", matched by
        their "desc". Otherwise every field stays empty except "发票号码",
        which carries the response "message" (or a generic failure text when
        there is none) so the failure is visible in the workbook.
    """
    parsed = dict.fromkeys(PARSED_COLUMNS, "")
    parsed["索引"] = index

    if not isinstance(response, dict):
        parsed["发票号码"] = PARSE_FAILURE_MESSAGE
        return parsed

    if not (response.get("status") == 200 and response.get("code") == 0):
        parsed["发票号码"] = str(response.get("message", PARSE_FAILURE_MESSAGE))
        return parsed

    # "data" may be missing or null; treat both as "no fields recognised".
    for item in response.get("data") or []:
        if not isinstance(item, dict):
            continue

        field = item.get("desc")

        if field in AMOUNT_FIELDS:
            # A null amount must not discard the other fields of the invoice.
            value = item.get("value")
            parsed[field] = ("" if value is None else str(value)).replace("¥", "")
        elif field in TEXT_FIELDS:
            parsed[field] = item.get("value", "")

    return parsed


def load_parsed_response(path_object):
    """Read one response file and parse it.

    Args:
        path_object: Path of the response file; its stem is the invoice index.

    Returns:
        The record built by ``parse_response``. When the file cannot be read
        or is not valid JSON, the record carries the error text in "发票号码"
        instead of raising, so one broken file does not abort the whole run.
    """
    path_object = Path(path_object)

    try:
        with open(path_object, "r", encoding="utf-8") as file:
            response = json.load(file)
    except (OSError, ValueError) as exception:
        # ValueError covers both invalid JSON and invalid UTF-8.
        parsed = dict.fromkeys(PARSED_COLUMNS, "")
        parsed["索引"] = path_object.stem
        parsed["发票号码"] = f"{PARSE_FAILURE_MESSAGE}{exception}"
        return parsed

    return parse_response(path_object.stem, response)


def archive_responses(
    responses_path=RESPONSES_DIRECTORY, archives_path=ARCHIVES_DIRECTORY
):
    """Move every "*.json" file from the responses directory to the archive.

    Both directories are created when they do not exist yet.
    """
    responses_path.mkdir(parents=True, exist_ok=True)
    archives_path.mkdir(parents=True, exist_ok=True)

    for file_path in responses_path.glob("*.json"):
        if file_path.is_file():
            shutil.move(str(file_path), str(archives_path / file_path.name))


def request_recognition(dataframe, responses_path=RESPONSES_DIRECTORY):
    """Request recognition for every image and save each response as JSON.

    Args:
        dataframe: Must provide the columns "索引" (index) and "影像件地址"
            (image path).
        responses_path: Directory the "<index>.json" files are written to.
    """
    responses_path.mkdir(parents=True, exist_ok=True)

    total = dataframe.shape[0]

    # Provides the access token of the "szkt" servicer (Shenzhen KuaiTong).
    authenticator = Authenticator(servicer="szkt")

    http_client = HTTPClient()

    # NOTE(review): presumably a token-bucket rate limit of 5 calls — confirm
    # against utils.client.restrict.
    @restrict(refill_rate=5, max_tokens=5)
    def szkt_request(
        image_index,
        image_path=None,
    ):
        """Recognise one image; return ``(image_index, response)``.

        Never raises for ordinary errors: failures are returned as a dict with
        "status" and "message" so they end up in the saved response.
        """
        try:
            image_path = Path(image_path)

            # Only ".png" is passed through; every other suffix is treated as JPEG.
            image_format = image_path.suffix.strip().lower()
            image_format = image_format if image_format == ".png" else ".jpeg"

            with open(image_path, "rb") as image:
                image_data = image.read()

            image_data_base64 = image_compression(image_format, image_data)

            response = http_client.post(
                # VAT invoice recognition endpoint
                url="https://ai.inspirvision.cn/s/api/ocr/vatInvoice",
                # Lets a request be located when investigating together with the vendor
                headers={"X-RequestId-Header": image_index},
                data={
                    "token": authenticator.get_token(),
                    "imgBase64": image_data_base64,
                },
            )

        except RequestException as request_exception:
            response = {
                "status": request_exception.status_code,
                "message": request_exception.message,
            }

        except Exception as exception:
            # Boundary of the worker thread: report instead of losing the image.
            response = {
                "status": "90000",
                "message": f"发生其它异常{exception}",
            }

        return image_index, response

    completed_futures = 0

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(
                szkt_request,
                image_index=row.索引,
                image_path=row.影像件地址,
            )
            for row in dataframe[["索引", "影像件地址"]].itertuples(
                index=False, name="row"
            )
        ]

        for future in as_completed(futures):
            index, response = future.result()

            with open(
                responses_path / "{}.json".format(index),
                "w",
                encoding="utf-8",
            ) as file:
                json.dump(response, file, ensure_ascii=False)

            completed_futures += 1

            print(f"已完成 {completed_futures / total * 100:.2f} %")


def main():
    """Interactive entry point: collect images, recognise, parse, save workbook."""
    print("正在基于深圳快瞳的增值税发票识别接口批量识别")

    choice = input(
        "请选择获取待识别发票影像件方式(1:遍历目录通过影像件路径,其它任意字符:退出脚本):"
    )

    if choice != "1":
        print("选择退出脚本!")
        # SystemExit instead of exit(): exit() is only injected by the site module.
        raise SystemExit(0)

    print("正在遍历目录...", end="")

    dataframe = traverse_directory(
        directory_path="待识别发票", suffixes=[".jpg", ".jpeg", ".png"]
    )

    # Rename the relative-path column to the image-path column
    dataframe.rename(columns={"相对路径": "影像件地址"}, inplace=True)

    # One random index per image.
    # NOTE(review): indexes are regenerated on every run, so with option "0"
    # below the response files left by a previous run will not match these
    # indexes — confirm whether a stable index is intended.
    dataframe["索引"] = [uuid.uuid4().hex for _ in range(dataframe.shape[0])]

    rows = dataframe.shape[0]

    if rows == 0:
        print("待识别发票张数为0,退出脚本")
        raise SystemExit(0)

    print(f"已完成,待识别发票张数为 {rows}")

    choice = input(
        "请选择是否就上一次的响应报文进行归档(0:不归档,1:归档,其它任意字符:退出脚本):"
    )

    if choice == "1":
        print("正在归档响应报文...", end="")
        archive_responses()
        print("已完成")

        request_recognition(dataframe)

    elif choice != "0":
        # "0" means: do not archive and do not request, only parse what is there.
        print("选择退出脚本!")
        raise SystemExit(0)

    print("正在解析报文...", end="")

    parsed_records = [
        load_parsed_response(path_object)
        for path_object in RESPONSES_DIRECTORY.glob("*.json")
    ]

    # Explicit columns keep "索引" present even when there is no response file,
    # otherwise the merge below fails on the missing key.
    dataframe_parsed = pandas.DataFrame(
        data=parsed_records, columns=list(PARSED_COLUMNS), dtype=str
    )

    # Attach the parsed fields to the image list
    dataframe = dataframe.merge(right=dataframe_parsed, how="left", on=["索引"])

    # Images without a response get empty fields
    dataframe = dataframe.fillna(value="")

    print("已完成")

    print("正在保存为工作簿...", end="")

    save_as_workbook(worksheets=[("Sheet1", dataframe)], workbook_name="results.xlsx")

    print("已完成")


if __name__ == "__main__":
    main()
 |