# Source path: Python/普康健康发票识别/main.py

# -*- coding: utf-8 -*-

"""
普康健康_发票识别
"""

# 加载模块

import json

from pathlib import Path

import shutil

import pandas

import uuid

from concurrent.futures import ThreadPoolExecutor, as_completed

from 普康健康发票查验.main import image_compression

from utils.pandas_extension import traverse_directory, save_as_workbook

from utils.client import restrict, HTTPClient, RequestException, Authenticator


if __name__ == "__main__":
    # Script entry point: announce the job and ask how the invoice images
    # should be collected.
    print("正在基于深圳快瞳的增值税发票识别接口批量识别")

    source_choice = input(
        "请选择获取待识别发票影像件方式(1:遍历目录通过影像件路径，其它任意字符:退出脚本):"
    )

    # Any answer other than "1" means the operator wants to leave the script.
    if source_choice != "1":
        print("选择退出脚本!")
        exit(0)

    print("正在遍历目录...", end="")

    # traverse_directory is a project helper; its result is expected to carry
    # a "相对路径" column (it is renamed right below).
    dataframe = traverse_directory(
        directory_path="待识别发票", suffixes=[".jpg", ".jpeg", ".png"]
    )

    # Rename the relative-path column to the image-address column.
    dataframe.rename(columns={"相对路径": "影像件地址"}, inplace=True)

    # Give every row a random hexadecimal index.
    # NOTE(review): the index is a fresh uuid4 on every run, while the parsing
    # step later joins on the response file name. Response files kept from an
    # earlier run therefore cannot match these rows - confirm this is intended.
    dataframe["索引"] = dataframe.apply(
        lambda _row: uuid.uuid4().hex, axis="columns"
    )

    # Number of invoice images queued for recognition.
    rows = len(dataframe.index)

    # Nothing to recognise: leave before any further prompt or request.
    if not rows:
        print("待识别发票张数为0，退出脚本")
        exit(0)

    print(f"已完成，待识别发票张数为 {rows}")

    # Ask whether the previous run's responses should be archived.
    # "0": keep them in place and send no recognition requests.
    # "1": archive them, then request recognition for every collected image.
    # Anything else: leave the script.
    archive_choice = input(
        "请选择是否就上一次的响应报文进行归档(0:不归档，1:归档，其它任意字符:退出脚本):"
    )

    if archive_choice == "1":
        print("正在归档响应报文...", end="")

        # Directory of current responses and directory of archived responses.
        responses_path = Path("temporary/responses")
        archives_path = Path("temporary/archives")

        # Create whichever of the two directories is missing.
        for directory in (responses_path, archives_path):
            if not directory.exists():
                directory.mkdir(parents=True, exist_ok=True)

        # Move every JSON response file from the responses directory into
        # the archive directory, keeping its file name.
        for file_path in responses_path.glob("*.json"):
            if not file_path.is_file():
                continue
            shutil.move(str(file_path), str(archives_path / file_path.name))

        print("已完成")

        # Token provider for the "szkt" servicer and the shared HTTP client.
        authenticator = Authenticator(servicer="szkt")
        http_client = HTTPClient()

        # NOTE(review): restrict looks like a rate limiter (5 tokens,
        # refill rate 5) - confirm against utils.client.
        @restrict(refill_rate=5, max_tokens=5)
        def szkt_request(
            image_index,
            image_path=None,
        ):
            """Request VAT invoice recognition for one image.

            Args:
                image_index: Index of the image; also sent as the
                    X-RequestId-Header so the request can be located later.
                image_path: Path of the image file to read.

            Returns:
                A tuple (image_index, response). On failure the response is a
                dict with "status" and "message" instead of the client result.
            """
            try:
                image_file = Path(image_path)

                # Raw bytes of the image file.
                raw_bytes = image_file.read_bytes()

                # Normalise the format: ".png" stays, everything else is
                # treated as ".jpeg".
                suffix = image_file.suffix.strip().lower()
                normalized_format = ".png" if suffix == ".png" else ".jpeg"

                # image_compression is imported from another project module;
                # its result is sent as the "imgBase64" field.
                encoded_image = image_compression(normalized_format, raw_bytes)

                return image_index, http_client.post(
                    # VAT invoice recognition endpoint
                    url="https://ai.inspirvision.cn/s/api/ocr/vatInvoice",
                    # Used to locate the request when troubleshooting with the vendor
                    headers={"X-RequestId-Header": image_index},
                    data={
                        "token": authenticator.get_token(),
                        "imgBase64": encoded_image,
                    },
                )

            except RequestException as request_exception:
                return image_index, {
                    "status": request_exception.status_code,
                    "message": request_exception.message,
                }

            except Exception as exception:
                return image_index, {
                    "status": "90000",
                    "message": f"发生其它异常{exception}",
                }

        # Number of finished tasks, used for the progress output.
        completed_futures = 0

        with ThreadPoolExecutor(max_workers=5) as executor:
            # Submit one recognition task per row.
            futures = []
            for image_index, image_path in dataframe[["索引", "影像件地址"]].itertuples(
                index=False, name=None
            ):
                futures.append(
                    executor.submit(
                        szkt_request,
                        image_index=image_index,
                        image_path=image_path,
                    )
                )

            for future in as_completed(futures):
                index, response = future.result()

                # Save the response as <index>.json in the responses directory.
                with open(
                    f"temporary/responses/{index}.json", "w", encoding="utf-8"
                ) as file:
                    json.dump(response, file, ensure_ascii=False)

                completed_futures += 1

                print(f"已完成 {completed_futures / rows * 100:.2f} %")

    elif archive_choice != "0":
        print("选择退出脚本!")
        exit(0)

    print("正在解析报文...", end="")

    # Parsed rows, one dict per response file.
    dataframe_parsed = []

    # "desc" labels whose value is copied as text, and labels whose value is
    # an amount from which the yuan sign is stripped.
    text_fields = ("发票号码", "购买方名称", "销售方名称")
    amount_fields = ("小写金额", "合计金额", "合计税额")

    # Walk every JSON response file in the responses directory.
    for path_object in Path("temporary/responses").glob("*.json"):

        # Result row. The index comes from the file name and is set up front
        # so that a file which cannot be read still carries its join key.
        parse = {
            "索引": path_object.stem,
            "发票号码": "",
            "小写金额": "",
            "合计金额": "",
            "合计税额": "",
            "购买方名称": "",
            "销售方名称": "",
        }

        # Reset per file: the error branch below must never see the payload
        # of a previously processed file (or an unbound name).
        response = {}

        try:
            with open(path_object, "r", encoding="utf-8") as file:
                loaded = json.load(file)

            # Only a JSON object can carry status/code/data.
            if not isinstance(loaded, dict):
                raise Exception("解析报文发生其它异常")

            response = loaded

            # Response status code and error code.
            status_code = response.get("status", "")
            code = response.get("code", "")

            # A response counts as successful only with status 200 and code 0.
            if not (status_code == 200 and code == 0):
                raise Exception("解析报文发生其它异常")

            # "data" may be missing or null; treat both as no items.
            for item in response.get("data") or []:
                desc = item.get("desc")

                # A missing or null value becomes an empty string instead of
                # failing the whole file.
                value = item.get("value")
                value = "" if value is None else str(value)

                if desc in text_fields:
                    parse[desc] = value
                elif desc in amount_fields:
                    parse[desc] = value.replace("¥", "")

        except Exception as exception:
            # Errors are reported through the invoice-number column: prefer
            # the message carried by the response, else the exception text.
            message = response.get("message")
            parse["发票号码"] = "{}".format(
                exception if message is None else message
            )

        dataframe_parsed.append(parse)

    # Build the parsed table with an explicit column list. Without it, an
    # empty list of parsed rows (no response files) yields a frame with no
    # "索引" column and the merge below raises KeyError.
    dataframe_parsed = pandas.DataFrame(
        data=dataframe_parsed,
        columns=[
            "索引",
            "发票号码",
            "小写金额",
            "合计金额",
            "合计税额",
            "购买方名称",
            "销售方名称",
        ],
        dtype=str,
    )

    # Left-join the parsed fields onto the image list by index.
    dataframe = dataframe.merge(right=dataframe_parsed, how="left", on=["索引"])

    # Rows without a parsed response get empty strings instead of NaN.
    dataframe = dataframe.fillna(value="")

    print("已完成")

    print("正在保存为工作簿...", end="")

    # save_as_workbook is a project helper; it receives one (sheet name, frame) pair.
    save_as_workbook(worksheets=[("Sheet1", dataframe)], workbook_name="results.xlsx")

    print("已完成")