# Python/普康健康发票查验/main.py

# -*- coding: utf-8 -*-

"""
普康健康_发票查验
"""

# 加载模块

import hashlib
import json
import shutil
import uuid
from base64 import b64decode, b64encode
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import cv2
import numpy
import pandas

from utils.client import Authenticator, HTTPClient, RequestException, restrict
from utils.pandas_extension import open_csv, save_as_workbook, traverse_directory


# 影像件压缩
def image_compression(
    image_path: str | None = None,
    image_format: str | None = None,
    image_data: bytes | None = None,  # 数据类型为包含图像文件的二进制数据的字节串
    image_size_specified: int = 2,  # 指定影像件大小
    raw: bool = False,  # 250804新增返回是否为完整URI数据格式
) -> str | None:

    try:

        # 若影像件路径数据类型为STR则创建路径对象
        if isinstance(image_path, str):
            image_path = Path(image_path)
            # 影像件文件名称后缀
            image_format = image_path.suffix.strip().lstrip(".").lower()

            # 读取影像件数据
            with open(image_path, "rb") as image:
                image_data = image.read()

        # 影像件数据BASE64编码
        image_data_base64 = b64encode(image_data).decode("utf-8")

        # 指定影像件大小的单位由MB转为KB
        image_size_specified = image_size_specified * 1024 * 1024

        # 若影像件大小小于指定影像件大小则返回BASE64编码后影像件数据
        if len(image_data_base64) < image_size_specified:
            if raw:
                # 返回非完整URI数据格式
                return image_data_base64
            else:
                # 返回完整URI数据格式
                return f"data:image/{image_format};base64,{image_data_base64}"

        # OPENCV解码（数据类型为NUMPY-UINT8）
        image_data_cv2 = cv2.imdecode(
            numpy.frombuffer(image_data, numpy.uint8), cv2.IMREAD_COLOR
        )

        # 若OPENCV解码失败则抛出异常
        if image_data_cv2 is None:
            raise RuntimeError(f"OPENCV解码发生异常")

        # 初始化近似BASE64编码后影像件数据
        proximate_image_data_base64 = None

        # 初始化最小压缩前后影像件大小差值
        min_image_size_difference = float("inf")

        # 基于双层压缩方法：先外层降低图像质量，再内层缩小图像尺寸
        for quality in range(90, 0, -10):

            image_data_cv2_ = image_data_cv2.copy()

            # 根据影像件格式匹配图片质量配置
            # noinspection PyUnreachableCode
            match image_format:
                case "png":
                    encoding_params = [cv2.IMWRITE_PNG_COMPRESSION, 10 - quality // 10]
                case _:
                    encoding_params = [cv2.IMWRITE_JPEG_QUALITY, quality]

            for i in range(25):
                # 降低图像质量
                # noinspection PyTypeChecker
                success, image_data_encoded = cv2.imencode(
                    image_format, image_data_cv2_, encoding_params
                )  # 图像编码
                # 若图像编码失败则退出
                if not success:
                    break
                image_data_base64 = b64encode(image_data_encoded.tobytes()).decode(
                    "utf-8"
                )

                # 压缩前后影像件大小差值
                image_size_difference = len(image_data_base64) - image_size_specified

                if image_size_difference <= 0:
                    if raw:
                        return image_data_base64
                    else:
                        return f"data:image/{image_format};base64,{image_data_base64}"

                if image_size_difference < min_image_size_difference:
                    min_image_size_difference = image_size_difference
                    proximate_image_data_base64 = image_data_base64

                # 影像件高度和宽度
                image_height, image_weight = image_data_cv2_.shape[:2]

                # 若仍超过影像件指定大小则调整图像尺寸
                image_data_cv2_ = cv2.resize(
                    image_data_cv2_,
                    dsize=(int(image_weight * 0.9), int(image_height * 0.9)),
                    interpolation=cv2.INTER_AREA,
                )

        if proximate_image_data_base64:
            if raw:
                return proximate_image_data_base64
            else:
                return f"data:image/{image_format};base64,{image_data_base64}"
        else:
            raise RuntimeError("影像件压缩失败")

    except:
        return None


# 票据查验接口（需要）
@restrict(refill_rate=5, max_tokens=5)  # 限速至5QPS
def invoices_verification(
    image_index,
    image_path=None,
    invoice_number=None,
    invoice_code=None,
    invoice_check_code=None,
    invoice_date=None,
    invoice_amount=None,
    id_number=None,
    process_mode=None,
    supplier=None,
):

    try:

        # 若影像件地址非空则imgBASE64请求，否则根据发票五要素请求
        if image_path:

            match process_mode:
                case "通过影像件本地地址":

                    # 创建路径对象
                    image_path = Path(image_path)

                    # 影像件文件名称后缀
                    image_format = image_path.suffix.strip().lstrip(".").lower()

                    # 读取影像件数据
                    with open(image_path, "rb") as image:
                        image_data = image.read()

                case "通过影像件对象服务器地址":
                    image_format, image_data = http_client.download(url=image_path)

            # 断定影像件格式为JGP、JPEG或者PNG
            # noinspection PyUnboundLocalVariable
            assert image_format in [
                "jpg",
                "jpeg",
                "png",
            ], f"影像件格式({image_format})不支持"

            match supplier:
                case "szkt":

                    image_data_base64 = image_compression(
                        image_format=image_format, image_data=image_data
                    )

                    # noinspection PyUnusedLocal
                    response = http_client.post(
                        # 深圳快瞳增值税发票、医疗发票查验兼容版
                        url="https://ai.inspirvision.cn/s/api/ocr/invoiceCheckAll",
                        # 用于和深圳快瞳联查时定位请求
                        headers={"X-RequestId-Header": image_index},
                        data={
                            "token": authenticator.get_token(servicer="szkt"),
                            "imgBase64": image_data_base64,
                        },
                    )
                case "bjfd":

                    image_data_base64 = image_compression(
                        image_format=image_format, image_data=image_data, raw=True
                    )  # 北京分单不支持完整URI数据格式

                    # 业务入参，序列化并BASE64编码
                    data = b64encode(
                        json.dumps(
                            {
                                "fileByte": image_data_base64,
                                "fileType": (
                                    "png" if image_format == "png" else "jpg"
                                ),  # 北京分单影像件格式支持JPG、PNG或PDF（本脚本暂不支持PDF）
                            }
                        ).encode("utf-8")
                    ).decode("utf-8")

                    # 应用账号
                    appid = "mbYr11Rc_42"

                    # 随机标识
                    noise = image_index

                    # 版本号
                    version = "1.0"

                    # 装配签名
                    sign = (
                        hashlib.md5(
                            f"appid={appid}&data={data}&noise={noise}&key=80357535c95333c3b133dfe5533f6334fe5e9321&version={version}".encode(
                                "utf-8"
                            )
                        )
                        .hexdigest()
                        .upper()
                    )

                    # noinspection PyUnusedLocal
                    response = http_client.post(
                        # 北京分单增值税发票、医疗票据二维码查验接口
                        url="https://api.fendanyun.com/rsx/api/checkByQRCode",
                        headers={"Content-Type": "application/json; charset=utf-8"},
                        json={
                            "appid": appid,
                            "data": data,
                            "noise": noise,
                            "version": version,
                            "sign": sign,
                        },
                    )

        else:

            response = http_client.post(
                # 深圳快瞳增值税发票、医疗发票查验兼容版
                url="https://ai.inspirvision.cn/s/api/ocr/invoiceCheckAll",
                data={
                    "token": authenticator.get_token(servicer="szkt"),
                    "invoiceNumber": invoice_number,
                    "invoiceCode": invoice_code,
                    "checkCode": invoice_check_code,
                    "invoicingDate": invoice_date,
                    "pretaxAmount": invoice_amount,
                    "idCardNo": id_number,
                },
            )

    except RequestException as request_exception:
        response = {
            "status": request_exception.status,
            "code": request_exception.code,
            "message": request_exception.message,
        }

    except Exception as exception:
        response = {
            "code": "40000",
            "message": f"发生其它异常{exception}",
        }

    return image_index, response


if __name__ == "__main__":

    print("已启动批量票据查验")

    match input("请选择票据查验供应商(1:深圳快瞳，2:北京分单，其它任意字符:退出脚本):"):
        case "1":
            supplier = "szkt"
        case "2":
            supplier = "bjfd"
        case _:
            print("选择退出脚本!")
            exit(0)

    match input(
        "请选择处理流程(1:批量解析已归档响应报文，2:根据影像件地址或票据五要素批量查验，其它任意字符:退出脚本):"
    ):
        case "1":
            # 打开前置影像件索引CSV文件
            dataframe = open_csv(file_name="dataframe_indexed.csv")
        case "2":
            print("正在归档响应报文...", end="")

            # 创建响应报文目录路径对象
            responses_path = Path("temporary/responses")

            # 若响应报文目录路径不存在则创建
            if not responses_path.exists():
                responses_path.mkdir(parents=True, exist_ok=True)

            # 创建归档响应报文目录路径对象
            archives_path = Path("temporary/archives")

            # 若归档响应报文目录路径不存在则创建
            if not archives_path.exists():
                archives_path.mkdir(parents=True, exist_ok=True)

            # 遍历响应报文目录下所有文件名后缀为JSON的文件路径
            for file_path in Path(responses_path).glob("*.json"):
                # 若文件路径为文件
                if file_path.is_file():
                    # 移动响应报文由响应报文目录至归档响应报文目录
                    shutil.move(str(file_path), str(archives_path / file_path.name))

            print("已完成")

            match input(
                "请选择批量查验方法(1:通过影像件本地地址，2:通过影像件对象服务器地址，3:通过增值税发票和医疗票据的五要素，其它任意字符:退出脚本):"
            ):
                case "1":
                    print("正在读取影像件本地地址...", end="")

                    dataframe = traverse_directory(
                        directory_path="待查验发票", suffixes=[".jpg", ".jpeg", ".png"]
                    )

                    # 修改列名相对路径为影像件地址
                    dataframe.rename(columns={"相对路径": "影像件地址"}, inplace=True)

                    process_mode = "通过影像件本地地址"

                case "2":
                    print("正在读取影像件对象服务器地址...", end="")

                    dataframe = open_csv(file_name="dataframe.csv")

                    # 断定列名包括赔案编号、发票编号和影像件地址
                    assert all(
                        [
                            column_name in dataframe.columns
                            for column_name in ["赔案编号", "发票编号", "影像件地址"]
                        ]
                    ), "CSV文件中列名必须包括赔案编号、发票编号和影像件地址"

                    # 根据赔案编号和发票编号去重
                    dataframe.drop_duplicates(
                        subset=["赔案编号", "发票编号"], keep="first", inplace=True
                    )

                    # 处理方式
                    process_mode = "通过影像件对象服务器地址"

                case "3":
                    print("正在读取增值税发票和医疗票据的五要素...", end="")

                    dataframe = open_csv(file_name="dataframe.csv")

                    # 断定列名包括身份证号码后六位、发票编号、发票代码、校验号码后六位、开票日期和发票金额
                    assert all(
                        [
                            column_name in dataframe.columns
                            for column_name in [
                                "身份证号码后六位",
                                "发票编号",
                                "发票代码",
                                "校验号码后六位",
                                "开票日期",
                                "发票金额",
                            ]
                        ]
                    ), "CSV文件中列名必须包括身份证号码后六位、发票编号、发票代码、校验号码后六位、开票日期和发票金额"

                    # 根据身份证号码后六位、发票编号、发票代码、校验号码后六位、开票日期和发票金额去重
                    dataframe.drop_duplicates(
                        subset=[
                            "身份证号码后六位",
                            "发票编号",
                            "发票代码",
                            "校验号码后六位",
                            "开票日期",
                            "发票金额",
                        ],
                        keep="first",
                        inplace=True,
                    )

                    # 格式化开票日期
                    dataframe["开票日期"] = dataframe["开票日期"].str.replace(
                        "-", "", regex=False
                    )

                    # 处理方式
                    process_mode = "通过增值税发票和医疗票据的五要素"

                case _:
                    print("选择退出脚本!")
                    exit(0)

            # 统计待查验发票张数
            rows = dataframe.shape[0]

            # 若待查验发票张数为0则退出脚本
            if rows == 0:
                print("待查验发票张数为0，退出脚本")
                exit(0)

            print(f"已完成，待查验发票张数为 {rows}")

            # 添加索引
            dataframe["索引"] = dataframe.apply(
                lambda x: uuid.uuid4().hex, axis="columns"
            )

            dataframe.to_csv("dataframe_indexed.csv", index=False)

            # 创建深圳快瞳获取访问令牌方法
            authenticator = Authenticator()

            # 初始化请求客户端
            http_client = HTTPClient()

            # 用于记录已完成任务数
            completed_futures = 0

            # 创建线程池
            with ThreadPoolExecutor(max_workers=5) as executor:

                # noinspection PyUnreachableCode
                # noinspection PyUnboundLocalVariable
                match process_mode:

                    case "通过影像件本地地址" | "通过影像件对象服务器地址":

                        futures = [
                            executor.submit(
                                invoices_verification,
                                image_index=row.索引,
                                image_path=row.影像件地址,
                                process_mode=process_mode,
                                supplier=supplier,
                            )
                            for row in dataframe[["索引", "影像件地址"]].itertuples(
                                index=False, name="row"
                            )
                        ]

                    case "通过增值税发票和医疗票据的五要素":

                        # 提交任务
                        futures = [
                            executor.submit(
                                invoices_verification,
                                image_index=row.索引,
                                invoice_number=row.发票编号,
                                invoice_code=row.发票代码,
                                invoice_check_code=row.校验号码后六位,
                                invoice_date=row.开票日期,
                                invoice_amount=row.发票金额,
                                id_number=row.身份证号码后六位,
                                process_mode=process_mode,
                                supplier=supplier,
                            )
                            for row in dataframe[
                                [
                                    "索引",
                                    "发票编号",
                                    "发票代码",
                                    "校验号码后六位",
                                    "开票日期",
                                    "发票金额",
                                    "身份证号码后六位",
                                ]
                            ].itertuples(index=False, name="row")
                        ]

                for future in as_completed(futures):
                    index, response = future.result()

                    # 保存报文
                    with open(
                        "temporary/responses/{}.json".format(index),
                        "w",
                        encoding="utf-8",
                    ) as file:
                        json.dump(response, file, ensure_ascii=False)

                    completed_futures += 1

                    print(f"已完成 {completed_futures / rows * 100:.2f} %")

        case _:
            print("选择退出脚本!")
            exit(0)

    print("正在解析报文...", end="")

    # 解析后数据体
    dataframe_parsed = []

    # 遍历报文所在目录
    for path_object in list(Path("temporary/responses").glob("*.json")):

        # 解析报文结构
        parse = {
            "索引": "",
            "机打发票号码": "",
            "发票金额": "",
            "购买方": "",
            "销售方": "",
            "发票状态": "",
            "最大销售项目名称": "",
            "最大销售项目数量": "",
            "XML版式文件": "",
        }

        # 若路径对象包含下划线则在解析报文结构添加赔案编号和发票编号
        if "_" in path_object.stem:

            parse["赔案编号"] = path_object.stem.split("_")[0]

            parse["发票编号"] = path_object.stem.split("_")[1]

        # 打开报文并JSON逆序列化
        with open(path_object, "r", encoding="utf-8") as file:
            response = json.load(file)

        # 索引
        parse["索引"] = path_object.stem

        match supplier:
            case "szkt":
                try:

                    # 响应状态码
                    status_code = response.get("status", "")

                    # 错误码
                    code = response.get("code", "")

                    # 流水号
                    serial = response.get("serialNo", "")

                    # 若响应状态码为200且错误码为10000，则定义为响应成功
                    if status_code == 200 and code == 10000:

                        # 查验类型，若查验类型为003081则为医疗票据查验，003082则为增值税发票查验，两者报文结构不一致
                        match response.get("data").get(
                            "productCode"
                        ):  # 若响应成功则必定存在键DATA和PRODUCTCODE
                            # 解析医疗票据查验结果
                            case "003081":

                                parse["机打发票号码"] = response.get("data").get(
                                    "billNumber"
                                )

                                parse["校验码"] = response.get("data").get("checkCode")

                                parse["发票金额"] = response.get("data").get("amount")

                                parse["购买方"] = response.get("data").get("payer")

                                parse["销售方"] = response.get("data").get("payeeName")

                                # 发票状态
                                match response.get("data").get("flushedRed"):
                                    case "true":
                                        parse["发票状态"] = "正常"
                                    case "false":
                                        parse["发票状态"] = "已红冲"

                                # 最大销售项目数量
                                max_item_quantity = 0

                                # 遍历销售项目列表
                                for item in response.get("data").get("feeitems", []):
                                    # 销售项目数量
                                    item_quantity = item.get("number")

                                    # 若销售项目数量非空，进一步判断是否包含斜杠，若包含斜杠则分割并取第一部分，最后转为浮点
                                    if item_quantity:
                                        if "/" in item_quantity:
                                            item_quantity = item_quantity.split("/")[0]

                                        item_quantity = float(item_quantity)
                                    else:
                                        item_quantity = 1

                                    if item_quantity > max_item_quantity:
                                        parse["最大销售项目名称"] = item.get(
                                            "itemName", ""
                                        )

                                        parse["最大销售项目数量"] = str(item_quantity)

                                parse["XML版式文件"] = response.get("PDFInfo", {}).get(
                                    "fileUrl"
                                )

                            # 解析增值税发票查验结果
                            case "003082":

                                parse["机打发票号码"] = (
                                    response.get("data").get("details").get("number")
                                )

                                parse["校验码"] = (
                                    response.get("data")
                                    .get("details")
                                    .get("check_code")
                                )

                                parse["发票金额"] = (
                                    response.get("data").get("details").get("total")
                                )

                                parse["购买方"] = (
                                    response.get("data").get("details").get("buyer")
                                )

                                parse["销售方"] = (
                                    response.get("data").get("details").get("seller")
                                )

                                # 发票状态
                                match response.get("data").get("details").get(
                                    "invoiceTypeNo"
                                ):
                                    case "0":
                                        parse["发票状态"] = "正常"
                                    case "1":
                                        parse["发票状态"] = "无法查验"
                                    case "2" | "3" | "7" | "8":
                                        parse["发票状态"] = "已红冲"

                                max_item_quantity = 0

                                items = (
                                    response.get("data").get("details").get("items", [])
                                )

                                for item in items:
                                    item_quantity = (
                                        float(item.get("quantity", 1))
                                        if item.get("quantity")
                                        else 1
                                    )

                                    if item_quantity > max_item_quantity:
                                        parse["最大销售项目名称"] = item.get("name")

                                        parse["最大销售项目数量"] = str(item_quantity)

                                # XML版式文件（25-06-11本接口不在提供版式文件，通过另一接口可获取数电增值税发票版式文件）
                                parse["XML版式文件"] = "本接口不再提供版式文件"

                    # 若响应状态码为400且错误码为10001或10100，则定义为假票
                    elif status_code == 400 and (code == 10001 or code == 10100):
                        parse["发票状态"] = "假票"

                    else:
                        raise Exception("解析报文发生其它异常")

                except Exception as exception:

                    parse["发票状态"] = "{}".format(response.get("message"))

            case "bjfd":
                try:

                    # 不验签，业务出参BASE64解码并反序列化
                    response = json.loads(
                        b64decode(response.get("data")).decode("utf-8")
                    )

                    # 增值税发票、医疗票据查验结果BASE64解码并反序列化
                    response["message"] = json.loads(
                        b64decode(response.get("message")).decode("utf-8")
                    )

                    # 错误码
                    code = response.get("result")

                    # 流水号
                    serial = response.get("message").get("checkId")

                    # 核验结果代码
                    result_code = response.get("message").get("resultCode")

                    # 若错误码为S0000则定义为响应成功
                    if code == "S0000":
                        # noinspection PyUnreachableCode
                        match result_code:
                            # 若查验成功则根据增值税发票、医疗票据状态匹配发票状态
                            case "200":

                                parse["机打发票号码"] = (
                                    response.get("message")
                                    .get("tickMainInfo")
                                    .get("invoiceNo")
                                )

                                parse["发票金额"] = (
                                    response.get("message")
                                    .get("tickMainInfo")
                                    .get("invoiceTotalPrice")
                                )

                                parse["购买方"] = (
                                    response.get("message")
                                    .get("tickMainInfo")
                                    .get("payerPartyName")
                                )

                                parse["销售方"] = (
                                    response.get("message")
                                    .get("tickMainInfo")
                                    .get("invoicingPartyName")
                                )

                                max_item_quantity = 0

                                for item in (
                                    response.get("message")
                                    .get("tickMainInfo")
                                    .get("chargeItems", [])
                                ):
                                    item_quantity = (
                                        float(item.get("num", 1))
                                        if item.get("num")
                                        else 1
                                    )

                                    if item_quantity > max_item_quantity:
                                        parse["最大销售项目名称"] = item.get(
                                            "chargeName"
                                        )
                                        parse["最大销售项目数量"] = str(item_quantity)

                                match response.get("message").get("invoiceStatus"):
                                    case "0":
                                        parse["发票状态"] = "正常"
                                    case (
                                        "1" | "2"
                                    ):  # 沿用深圳快瞳解析规则，北京分单已开红票和已作废映射为已红冲
                                        parse["发票状态"] = "已红冲"

                            case "E20003" | "E20007	":
                                parse["发票状态"] = "假票"

                            # 其它情况发票状态根据核验结果描述
                            case _:
                                parse["发票状态"] = response.get("message").get(
                                    "resultMsg"
                                )

                except Exception as exception:
                    parse["发票状态"] = str(exception)

        dataframe_parsed.append(parse)

    dataframe_parsed = pandas.DataFrame(data=dataframe_parsed, dtype=str)

    # 将解析数据集拼接至数据集
    dataframe = dataframe.merge(right=dataframe_parsed, how="left", on=["索引"])

    # 填补缺失值
    dataframe = dataframe.fillna(value="")

    print("已完成")

    print("正在保存为工作簿...", end="")

    save_as_workbook(worksheets=[("Sheet1", dataframe)], workbook_name="results.xlsx")

    print("已完成")