# Source: Python/票据理赔自动化/main.py (1754 lines, 84 KiB, Python)
#
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If that is intentional, the warning can be safely ignored.

# -*- coding: utf-8 -*-
"""
基于普康票据理赔自动化最小化实现
功能清单
https://liubiren.feishu.cn/docx/WFjTdBpzroUjQvxxrNIcKvGnneh?from=from_copylink
"""
import json
import re
from base64 import b64encode
from datetime import datetime
from decimal import Decimal, ROUND_HALF_UP
from hashlib import md5
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy
from fuzzywuzzy import fuzz
from jinja2 import Environment, FileSystemLoader
from jionlp import parse_location
from zen import ZenDecision, ZenEngine
from utils.client import Authenticator, HTTPClient, SQLiteClient
# -------------------------
# 主逻辑
# -------------------------
if __name__ == "__main__":
    # Create the authenticator (its get_token method supplies service access tokens)
    authenticator = Authenticator()
    # Create the HTTP request client
    http_client = HTTPClient(timeout=300, cache_enabled=True)  # caching enabled
    # Path object of the working directory (one sub-directory per claim case)
    directory_path = Path("directory")
    # Create the working directory if it does not exist yet
    directory_path.mkdir(parents=True, exist_ok=True)
def rule_engine(rule_path: Path) -> ZenDecision:
    """
    Build a decision object from a rule file stored on the local disk.

    :param rule_path: path object of the rule file
    :return: decision returned by a ZenEngine whose loader reads files as UTF-8 text
    """

    def read_rule(path):
        # Loader handed to ZenEngine: return the text content of the file at `path`
        return Path(path).read_text(encoding="utf-8")

    engine = ZenEngine({"loader": read_rule})
    return engine.get_decision(rule_path.as_posix())


# Decision used to check whether a given image should be recognized
recognize_enable = rule_engine(Path("rules/影像件识别使能.json"))
class MasterData(SQLiteClient):
    """Master data stored in the local SQLite database ``database.db``."""

    def __init__(self):
        """
        Open the SQLite client and create the master data tables if they are missing.

        :raises RuntimeError: if creating the tables fails
        """
        # Initialize the SQLite client
        super().__init__(database="database.db")
        try:
            with self:
                # Table of pharmacies and medical institutions
                self._execute(
                    sql="""
                        CREATE TABLE IF NOT EXISTS institutions
                        (
                            --购药及就医机构
                            institution TEXT PRIMARY KEY,
                            --购药及就医机构类型
                            institution_type TEXT NOT NULL,
                            --所在省
                            province TEXT NOT NULL,
                            --所在市
                            city TEXT NOT NULL
                        )
                        """
                )
                # Table of in-force insured persons. The TPA operating system keeps
                # group policy, individual policy and insured person tables; here
                # they are merged into a single wide table.
                # NOTE(review): the primary key below allows only one row per
                # (company, insurer, person, identity), while the docstring of
                # query_insured_person_records expects several rows per person
                # (self + spouse) — confirm whether person_policy belongs in the key.
                self._execute(
                    sql="""
                        CREATE TABLE IF NOT EXISTS insured_person_policies
                        (
                            --被保人
                            insured_person TEXT NOT NULL,
                            --被保人的证件类型
                            identity_type TEXT NOT NULL,
                            --被保人的证件号码
                            identity_number TEXT NOT NULL,
                            --与主被保人关系,包括本人和附属(配偶、父母和子女)
                            relationship TEXT NOT NULL,
                            --个单号
                            person_policy TEXT NOT NULL,
                            --主被保人
                            master_insured_person TEXT NOT NULL,
                            --保险起期(取个单和团单起期最大值)
                            commencement_date REAL NOT NULL,
                            --保险止期(取个单和团单止期最小值)
                            termination_date REAL NOT NULL,
                            --团单号
                            group_policy TEXT NOT NULL,
                            --投保公司
                            insurance_company TEXT NOT NULL,
                            --保险分公司
                            insurer_company TEXT NOT NULL,
                            --联合主键(投保公司+保险分公司+被保人+被保人的证件类型+被保人的证件号码)
                            PRIMARY KEY (insurance_company, insurer_company, insured_person, identity_type,
                            identity_number)
                        )
                        """
                )
        except Exception as exception:
            raise RuntimeError(
                f"初始化数据库发生异常:{str(exception)}"
            ) from exception

    # noinspection PyShadowingNames
    def query_institution_type(self, institution: str) -> Optional[str]:
        """
        Look up the institution type of a pharmacy or medical institution.

        :param institution: name of the pharmacy or medical institution
        :return: institution type
        :raises RuntimeError: if the query fails or the institution is unknown
        """
        # noinspection PyBroadException
        try:
            with self:
                # noinspection SqlResolve
                result = self._query_one(
                    sql="""
                        SELECT institution_type
                        FROM institutions
                        WHERE institution = ?
                        """,
                    parameters=(institution,),
                )
            # TODO: route to manual master data handling when the type is missing
            if result is None:
                raise RuntimeError("购药及就医机构类型为空值")
            return result["institution_type"]
        except Exception as exception:
            raise RuntimeError(
                "查询并获取单条购药及就医机构类型发生异常"
            ) from exception

    # noinspection PyShadowingNames
    def query_insured_person_records(
        self,
        insurance_company: str,
        insurer_company: str,
        insured_person: str,
        identity_type: str,
        identity_number: str,
    ) -> Optional[List[Dict[str, Any]]]:
        """
        Fetch the insured person records matching the given identity.

        More than one record is expected for one person, e.g. when a married couple
        both work for the insuring company and each is an attached insured of the other.

        :param insurance_company: insuring company
        :param insurer_company: insurer branch company
        :param insured_person: insured person
        :param identity_type: identity document type of the insured person
        :param identity_number: identity document number of the insured person
        :return: list of records, or None when nothing matches
        :raises RuntimeError: if the query fails
        """
        # noinspection PyBroadException
        try:
            with self:
                # noinspection SqlResolve
                result = self._query_all(
                    sql="""
                        SELECT insured_person AS "被保人",
                        relationship AS "与主被保人关系",
                        person_policy AS "个单号",
                        master_insured_person AS "主被保人",
                        commencement_date AS "保险起期",
                        termination_date AS "保险止期"
                        FROM insured_person_policies
                        WHERE insurance_company = ?
                        AND insurer_company = ?
                        AND insured_person = ?
                        AND identity_type = ?
                        AND identity_number = ?
                        """,
                    parameters=(
                        insurance_company,
                        insurer_company,
                        insured_person,
                        identity_type,
                        identity_number,
                    ),
                )
            # Any empty result (empty list, None, ...) means "no record"
            if not result:
                return None
            # Convert the policy period columns from timestamps to datetime objects
            return [
                {
                    key: (
                        datetime.fromtimestamp(value)
                        if key in ("保险起期", "保险止期")
                        else value
                    )
                    for key, value in record.items()
                }
                for record in result
            ]
        except Exception as exception:
            raise RuntimeError(
                "查询并获取多条个单和被保人记录发生异常"
            ) from exception
# Instantiate the master data access object
master_data = MasterData()


def _format_date(date):
    """Jinja2 "date" filter: render as YYYY-MM-DD, or "长期" for a falsy value."""
    if date:
        return date.strftime("%Y-%m-%d")
    return "长期"


# Jinja2 environment loading templates from the current directory
environment = Environment(loader=FileSystemLoader("."))
# Register the date filter
environment.filters["date"] = _format_date
# Load the claim dossier template
template = environment.get_template("template.html")
# -------------------------
# 自定义方法
# -------------------------
# noinspection PyShadowingNames
def image_read(
    image_path: Path,
) -> numpy.ndarray:
    """
    Open and read an image from the local disk as a single-channel grayscale array.

    In production the image would be downloaded from the image server, where it is
    always available because it is stored at sign-off; locally a failure is an error.

    :param image_path: path object of the image
    :return: image array
    :raises RuntimeError: if the image cannot be opened or decoded
    """
    try:
        # Read the image (converted to single-channel grayscale)
        image_ndarray = cv2.imread(image_path.as_posix(), cv2.IMREAD_GRAYSCALE)
    except Exception as exception:
        # Keep the original cause attached for troubleshooting
        raise RuntimeError("影像件打开并读取发生异常") from exception
    # cv2.imread signals an unreadable file by returning None instead of raising
    if image_ndarray is None:
        raise RuntimeError("影像件打开并读取发生异常")
    return image_ndarray
# noinspection PyShadowingNames
def image_serialize(image_format: str, image_ndarray: numpy.ndarray) -> str:
    """
    Derive the unique identifier of an image from its encoded bytes.

    :param image_format: image format passed to the encoder
    :param image_ndarray: image array
    :return: upper-case MD5 hex digest of the encoded image bytes
    :raises RuntimeError: if encoding fails
    """
    # Encode the array according to the image format
    encoded_ok, buffer = cv2.imencode(image_format, image_ndarray)
    if not encoded_ok or buffer is None:
        raise RuntimeError("编码为图像字节数组发生异常")
    # Hash the encoded byte stream to obtain the identifier
    return md5(buffer.tobytes()).hexdigest().upper()
# noinspection PyShadowingNames
def image_classify(
image_guid: str, image_format: str, image_ndarray: numpy.ndarray
) -> Optional[Tuple[str, str]]:
"""
影像件分类并旋正
:param image_guid: 影像件唯一标识
:param image_format: 影像件格式
:param image_ndarray: 影像件数据
:return: 压缩后影像件BASE64编码和影像件类型
"""
# noinspection PyShadowingNames
def image_compress(
image_format, image_ndarray, image_size_specified=2
) -> Optional[str]:
"""
影像件压缩
:param image_ndarray: 影像件数组
:param image_format: 影像件格式
:param image_size_specified: 指定影像件大小单位为兆字节MB
:return: 压缩后影像件BASE64编码
"""
# 将指定影像件大小单位由兆字节转为字节
image_size_specified = image_size_specified * 1024 * 1024
# 通过调整影像件质量和尺寸达到压缩影像件目的
# 外循环压缩:通过调整影像件质量实现压缩影像件大小
for quality in range(100, 50, -10):
image_ndarray_copy = image_ndarray.copy()
# 内循环压缩:通过调整影像件尺寸实现压缩影像件大小
for i in range(10):
# 按照影像件格式和影像件质量将影像件数组编码
success, image_ndarray_encoded = cv2.imencode(
image_format,
image_ndarray_copy,
params=(
[cv2.IMWRITE_PNG_COMPRESSION, 10 - quality // 10]
if image_format == "png"
else [cv2.IMWRITE_JPEG_QUALITY, quality]
),
)
# 若编码发生异常则停止循环
if not success or image_ndarray_encoded is None:
break
# 影像件BASE64编码
image_base64 = b64encode(image_ndarray_encoded.tobytes()).decode(
"utf-8"
)
if len(image_base64) <= image_size_specified:
return image_base64
# 调整影像件尺寸
image_ndarray_copy = cv2.resize(
image_ndarray_copy,
(
int(image_ndarray_copy.shape[0] * 0.95),
int(image_ndarray_copy.shape[1] * 0.95),
),
interpolation=cv2.INTER_AREA,
)
# 若调整后影像件尺寸中长或宽小于350像素则停止调整影像件尺寸
if min(image_ndarray_copy.shape[:2]) < 350:
break
return None
# 影像件压缩
image_base64 = image_compress(
image_format, image_ndarray, image_size_specified=2
) # 深圳快瞳要求为2兆字节
# TODO: 若影像件压缩发生异常则流转至人工处理
if image_base64 is None:
raise RuntimeError("影像件压缩发生异常")
# 请求深圳快瞳影像件分类接口
response = http_client.post(
url=(url := "https://ai.inspirvision.cn/s/api/ocr/genalClassify"),
headers={
"X-RequestId-Header": image_guid
}, # 以影像件唯一标识作为请求唯一标识,用于双方联查
data={
"token": authenticator.get_token(
servicer="szkt"
), # 获取深圳快瞳访问令牌
"imgBase64": f"data:image/{image_format.lstrip(".")};base64,{image_base64}", # 影像件BASE64编码嵌入数据统一资源标识符
},
guid=md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
)
# 若响应非成功则抛出异常
# TODO: 若响应非成功则流转至人工处理
if not (response.get("status") == 200 and response.get("code") == 0):
raise RuntimeError("请求深圳快瞳影像件分类接口发生异常")
# 解析影像件类型
# noinspection PyTypeChecker
match (response["data"]["flag"], response["data"]["type"]):
case (14, _):
image_type = "居民户口簿"
case (7, "idcard-front-back"):
image_type = "居民身份证(国徽、头像面)"
case (7, "idcard-front"):
image_type = "居民身份证(国徽面)"
case (7, "idcard-back"):
image_type = "居民身份证(头像面)"
case (11, _):
image_type = "中国港澳台地区及境外护照"
case (8, _):
image_type = "银行卡"
case (4, _):
image_type = "增值税发票"
case (1, _):
image_type = "医疗费用清单"
case (5, _):
image_type = "医疗门诊收费票据"
case (3, _):
image_type = "医疗住院收费票据"
case (18, _):
image_type = "理赔申请书"
case _:
image_type = "其它"
# 解析影像件方向
# noinspection PyTypeChecker
image_orientation = {
"0": "0度",
"90": "顺时针90度",
"180": "180度",
"270": "逆时针90度",
}.get(response["data"]["angle"], "0度")
# 若影像件方向非0度则旋正
if image_orientation != "0度":
image_ndarray = cv2.rotate(
image_ndarray,
{
"顺时针90度": cv2.ROTATE_90_COUNTERCLOCKWISE, # 逆时针旋转90度
"180度": cv2.ROTATE_180, # 旋转180度
"逆时针90度": cv2.ROTATE_90_CLOCKWISE, # 顺时针旋转90度
}[image_orientation],
)
# 旋正后影像件再次压缩
image_base64 = image_compress(
image_format, image_ndarray, image_size_specified=2
)
# TODO: 若旋正后影像件再次压缩发生异常则流转至人工处理
if image_base64 is None:
raise RuntimeError("旋正后影像件再次压缩发生异常")
return image_base64, image_type
# noinspection PyShadowingNames
def image_recognize(
    image,
    insurance_company,
    insurer_company,
) -> None:
    """
    Recognize an image and merge the result into the claim dossier.

    :param image: image record
    :param insurance_company: insuring company
    :param insurer_company: insurer branch company
    :return: None
    """
    # TODO: later, merge the ID card emblem side and the ID card portrait side
# noinspection PyShadowingNames
def identity_card_recognize(image, insurance_company, insurer_company) -> None:
    """
    Recognize a resident ID card and merge the result into the claim dossier.

    The emblem side contributes the validity period; the portrait side contributes
    the personal data and triggers the insured person lookup.

    :param image: image record
    :param insurance_company: insuring company
    :param insurer_company: insurer branch company
    :return: None
    :raises RuntimeError: if the API response is not successful or no insured person record is found
    """

    # noinspection PyShadowingNames
    def calculate_age(report_time: datetime, birthday: datetime) -> int:
        """
        Compute the age in full years at the time the claim was reported.

        :param report_time: report time
        :param birthday: date of birth
        :return: age in full years
        """
        age = report_time.year - birthday.year
        # One year less when the birthday has not been reached yet in the report year
        if (report_time.month, report_time.day) < (birthday.month, birthday.day):
            age -= 1
        return age

    # Request the Shenzhen Kuaitong ID card recognition API
    # (it can recognize the emblem side and the portrait side in one call)
    url = "https://ai.inspirvision.cn/s/api/ocr/identityCard"
    image_extension = image["影像件格式"].lstrip(".")
    response = http_client.post(
        url=url,
        # The image identifier doubles as request identifier for joint troubleshooting
        headers={"X-RequestId-Header": image["影像件唯一标识"]},
        data={
            # Access token of Shenzhen Kuaitong
            "token": authenticator.get_token(servicer="szkt"),
            "imgBase64": f"data:image/{image_extension};base64,{image['影像件BASE64编码']}",
        },
        guid=md5((url + image["影像件唯一标识"]).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route to manual handling when the response is not successful
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳居民身份证识别接口发生异常")

    if image["影像件类型"] in [
        "居民身份证(国徽、头像面)",
        "居民身份证(国徽面)",
    ]:
        # Split the validity period into start and end; "长期" (permanent)
        # is represented as 9999-12-31
        period = response["data"]["validDate"].split("-")
        # noinspection PyTypeChecker
        dossier["出险人层"].update(
            {
                "有效起期": datetime.strptime(period[0], "%Y.%m.%d"),
                "有效止期": (
                    datetime(9999, 12, 31)
                    if period[1] == "长期"
                    else datetime.strptime(period[1], "%Y.%m.%d")
                ),
            }
        )

    # Only the portrait side carries name and identity number. Without it there is
    # nothing to look up, so stop here instead of reading unbound variables.
    if image["影像件类型"] not in [
        "居民身份证(国徽、头像面)",
        "居民身份证(头像面)",
    ]:
        return

    insured_person = response["data"]["name"]
    identity_type = "居民身份证"
    identity_number = response["data"]["idNo"]
    # The API returns the date of birth as a string formatted %Y-%m-%d
    birthday = datetime.strptime(response["data"]["birthday"], "%Y-%m-%d")
    # Split the address into province, city, county and detail
    address = parse_location(response["data"]["address"])
    # noinspection PyTypeChecker
    dossier["出险人层"].update(
        {
            "姓名": insured_person,
            "证件类型": identity_type,
            "证件号码": identity_number,
            "性别": response["data"]["sex"],
            "出生日期": birthday,
            # Age in full years at report time
            "年龄": calculate_age(dossier["报案层"]["报案时间"], birthday),
            "所在省": address.get("province"),
            "所在市": address.get("city"),
            "所在区": address.get("county"),
            "详细地址": address.get("detail"),
        }
    )

    # Fetch the insured person records. The person of the claim and the insured
    # person are the same individual seen from the claim / underwriting side.
    insured_person_records = master_data.query_insured_person_records(
        insurance_company,
        insurer_company,
        insured_person,
        identity_type,
        identity_number,
    )
    # TODO: route to the project operations desk when no record is found
    if insured_person_records is None:
        raise RuntimeError("查询并获取多条被保人记录发生异常")
    dossier["被保人层"] = insured_person_records
# noinspection PyShadowingNames
def application_recognize(image, insurer_company) -> None:
    """
    Recognize a claim application form and merge the result into the claim dossier.

    :param image: image record
    :param insurer_company: insurer branch company
    :return: None
    """
# noinspection PyShadowingNames
def mlm_recognize(image, schema) -> Optional[Dict[str, Any]]:
    """
    Run OCR on a claim application form with a multimodal large model and
    return the structured result.

    :param image: image record
    :param schema: JSON schema the recognition result must follow
    :return: recognition result, or None when the response cannot be parsed
    """
    import os  # local import keeps this block self-contained

    # SECURITY: an API key is committed in source. ARK_API_KEY overrides it when
    # set; the literal is kept only as a backward-compatible fallback and should
    # be rotated and removed.
    api_key = os.environ.get("ARK_API_KEY", "2c28ab07-888c-45be-84a2-fc4b2cb5f3f2")
    image_extension = image["影像件格式"].lstrip(".")
    payload = {
        "model": "doubao-seed-1-6-251015",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{image_extension};base64,{image['影像件BASE64编码']}"
                        },
                    },
                    {
                        "type": "text",
                        "text": "请就理赔申请书进行光学字符识别、结构化识别结果并返回符合Schema的JSON数据",
                    },
                ],
            }
        ],
        # Sampling temperature: balances stability and tolerance under strict mode
        "temperature": 0.2,
        # Deep thinking is switched off
        "thinking": {
            "type": "disabled",
        },
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "就理赔申请书进行光学字符识别并结构化识别结果",
                "schema": schema,
                "strict": True,  # strict schema adherence
            },
        },
    }
    # Request the Volcengine multimodal large model API
    response = http_client.post(
        url="https://ark.cn-beijing.volces.com/api/v3/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json; charset=utf-8",
        },
        json=payload,
        # The request identifier is the hash of the canonical payload
        guid=md5(
            json.dumps(
                payload,
                sort_keys=True,
                ensure_ascii=False,
            ).encode("utf-8")
        )
        .hexdigest()
        .upper(),
    )
    # Deserialize the message content; only malformed-response errors are treated
    # as "no result" (json.JSONDecodeError is a ValueError)
    try:
        # noinspection PyTypeChecker
        return json.loads(response["choices"][0]["message"]["content"])
    except (KeyError, IndexError, TypeError, ValueError):
        return None
# noinspection PyShadowingNames
def boc_application_recognize(image: Dict[str, Any]) -> None:
    """
    Recognize a claim application form of 中银保险有限公司 and merge the
    beneficiary data into the claim dossier.

    :param image: image record (a dict, not a string)
    :return: None
    :raises RuntimeError: if the recognition yields no result
    """

    def text_field(description: str) -> Dict[str, str]:
        # Every leaf of the schema is a described string property
        return {"type": "string", "description": description}

    # JSON schema of the recognition result
    schema = {
        "type": "object",
        "description": "识别结果对象",
        "properties": {
            "申请人": text_field("申请人,若无数据则为空字符串"),
            "性别": text_field("性别,若无数据则为空字符串"),
            "年龄": text_field("年龄,若无数据则为空字符串"),
            "手机": text_field("手机,若无数据则为空字符串"),
            "所属分支行及部门": text_field("所属分支行及部门,若无数据则为空字符串"),
            "身份证号码": text_field("身份证号码,若无数据则为空字符串"),
            "就诊记录": {
                "type": "array",
                "description": "所有就诊记录数组",
                "items": {
                    "type": "object",
                    "description": "每条就诊记录对象",
                    "properties": {
                        "就诊序号": text_field("就诊序号,若无数据则为空字符串"),
                        "发票日期": text_field(
                            "发票日期若无数据则为空字符串若有数据则格式为YYYY/MM/DD"
                        ),
                        "发票上的就诊医院/药店": text_field(
                            "发票上的就诊医院/药店,若无数据则为空字符串"
                        ),
                        "票据张数": text_field("票据张数,若无数据则为空字符串"),
                        "票据金额": text_field(
                            "票据金额,若无数据则为空字符串,若有数据则保留两位小数"
                        ),
                        "诊断": text_field("诊断,若无数据则为空字符串"),
                    },
                    # Mandatory fields of a visit record
                    "required": [
                        "发票日期",
                        "发票上的就诊医院/药店",
                        "诊断",
                    ],
                    # No extra properties on a visit record
                    "additionalProperties": False,
                },
            },
            "票据金额合计": text_field(
                "票据金额合计,若无数据则为空字符串,若有数据则保留两位小数"
            ),
            "开户银行": text_field(
                "开户银行,若无数据则为空字符串,请注意开户银行可能为多行"
            ),
            "户名": text_field("户名,若无数据则为空字符串"),
            "账号": text_field("账号,若无数据则为空字符串"),
        },
        # Mandatory fields of the recognition result
        "required": [
            "申请人",
            "手机",
            "身份证号码",
            "就诊记录",
            "开户银行",
            "户名",
            "账号",
        ],
        # No extra properties on the recognition result
        "additionalProperties": False,
    }
    # Run OCR with the multimodal large model and structure the result
    recognition = mlm_recognize(image, schema)
    # TODO: route to manual handling when the recognition fails
    if recognition is None:
        raise RuntimeError(
            "就中银保险有限公司的理赔申请书识别并整合至赔案档案发生异常"
        )
    dossier["受益人层"].update(
        {
            "开户行": recognition["开户银行"],
            "户名": recognition["户名"],
            "户号": recognition["账号"],
            "手机号": recognition["手机"],
        }
    )
# Pick the recognizer that matches the insurer branch company
# 中银保险有限公司 and its branches
if insurer_company.startswith("中银保险有限公司"):
    boc_application_recognize(image)
# noinspection PyShadowingNames
def receipt_recognize(image) -> None:
    """
    Recognize a receipt and merge the result into the claim dossier.

    :param image: image record
    :return: None
    """
# noinspection PyShadowingNames
def query_value(contents: list, key: str) -> Optional[str]:
    """
    Look up the value stored under a given label in a recognition result.

    Two entry shapes are supported: ``{"desc": ..., "value": ...}`` (VAT invoice
    recognition) and ``{"name": ..., "word": {"value": ...}}`` (medical receipt
    recognition). An exact label match wins; otherwise the entry with the highest
    fuzz.WRatio similarity is used when its score is at least 80.

    :param contents: recognition result (list of entries)
    :param key: label to look for
    :return: matching value, or None when nothing matches or the value is empty
    """
    # An empty recognition result has no value to offer
    if not contents:
        return None

    first = contents[0]
    if "desc" in first:
        # Shape of the VAT invoice recognition result
        def label_of(content):
            return content["desc"]

        def value_of(content):
            return content["value"]

    elif "name" in first:
        # Shape of the medical receipt recognition result
        def label_of(content):
            return content["name"]

        def value_of(content):
            # Tolerate entries whose "word" is absent or not a dict
            word = content.get("word")
            return word.get("value") if isinstance(word, dict) else None

    else:
        return None

    # Exact label match first
    for content in contents:
        if label_of(content) == key:
            return value_of(content) or None

    # Otherwise take the entry whose label is most similar to the key
    best_value, best_score = max(
        (
            (value_of(content), fuzz.WRatio(label_of(content), key, force_ascii=False))
            for content in contents
        ),
        key=lambda candidate: candidate[1],
    )
    if best_score < 80:
        return None
    return best_value or None
def parse_items(contents):
    """
    Extract the line items from a VAT invoice recognition result.

    Entries are paired positionally: the n-th "项目名称" entry goes with the n-th
    "数量", "金额" and "税额" entries (labels may carry a numeric suffix).

    :param contents: recognition result, a list of ``{"desc": ..., "value": ...}`` entries
    :return: list of items with name, quantity and amount (amount = amount + tax),
        both rounded half-up to two decimals; an empty list when there is nothing to parse
    """
    # Nothing to parse for an empty result or an unsupported entry shape
    if not contents or "desc" not in contents[0]:
        return []

    two_places = Decimal("0.00")

    def column(label):
        # Values of all entries labelled `label` with an optional numeric suffix
        pattern = re.compile(rf"^{label}(\d+)?$")
        return [x["value"] for x in contents if pattern.match(x["desc"])]

    def number(text):
        # A blank field counts as zero instead of crashing Decimal("")
        return Decimal(text if text else "0.00")

    # NOTE(review): positional pairing assumes every item carries all four
    # fields; a missing field shifts the following rows — confirm against the API.
    return [
        {
            "名称": name,
            "数量": number(quantity).quantize(two_places, rounding=ROUND_HALF_UP),
            # Amount and tax are added first, then rounded
            "金额": (number(amount) + number(tax)).quantize(
                two_places, rounding=ROUND_HALF_UP
            ),
        }
        for name, quantity, amount, tax in zip(
            column("项目名称"),
            column("数量"),
            column("金额"),
            column("税额"),
        )
    ]
# 初始化票据数据
receipt = {"影像件编号": image["影像件编号"]}
# 请求深圳快瞳票据查验接口(兼容增值税发票、医疗门诊/住院收费票据)
response = http_client.post(
url=(url := "https://ai.inspirvision.cn/s/api/ocr/invoiceCheckAll"),
headers={
"X-RequestId-Header": image["影像件唯一标识"]
}, # 以影像件唯一标识作为请求唯一标识,用于双方联查
data={
"token": authenticator.get_token(
servicer="szkt"
), # 获取深圳快瞳访问令牌
"imgBase64": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}",
},
guid=md5((url + image["影像件唯一标识"]).encode("utf-8"))
.hexdigest()
.upper(),
)
# 若查验结果为真票或红票则直接整合至赔案档案
if response.get("status") == 200 and response.get("code") == 10000:
# noinspection PyTypeChecker
match response["data"]["productCode"]:
# 增值税发票,目前深圳快瞳支持全电发票和全电纸质发票、区块链发票和增值税发票查验
case "003082":
# noinspection PyTypeChecker
receipt.update(
{
"查验状态": (
"真票"
if response["data"]["details"]["invoiceTypeNo"]
== "0"
else "红票"
), # 红票为状态为失控、作废、已红冲、部分红冲和全额红冲的票据
"票据号": response["data"]["details"]["number"],
"票据代码": (
response["data"]["details"]["code"]
if response["data"]["details"]["code"]
else None
),
"开票日期": datetime.strptime(
response["data"]["details"]["date"], "%Y年%m月%d"
), # 深圳快瞳票据查验接口中开票日期由字符串转为datetime对象
"校验码": response["data"]["details"]["check_code"],
"开票金额": Decimal(
response["data"]["details"]["total"]
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
), # 深圳快瞳票据查验接口中开票金额由字符串转为Decimal保留两位小数
"姓名": response["data"]["details"]["buyer"],
"购药及就医机构": response["data"]["details"]["seller"],
"明细项": [
{
"名称": item["name"],
"数量": (
Decimal(item["quantity"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
)
if item["quantity"]
else Decimal("0.00")
), # 深圳快瞳票据查验接口中明细单位由空字符转为None若非空字符由字符串转为Decimal保留两位小数
"金额": (
Decimal(item["total"])
+ Decimal(item["tax"])
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
), # 深圳快瞳票据查验接口中明细的金额和税额由字符串转为Decimal保留两位小数并求和
}
for item in response["data"]["details"]["items"]
],
"备注": (
response["data"]["details"]["remark"]
if response["data"]["details"]["remark"]
else None
),
}
)
# 医疗门诊、住院收费票据
case "003081":
# noinspection PyTypeChecker
receipt.update(
{
"查验状态": (
"真票"
if response["data"]["flushedRed"] == "true"
else "红票"
),
"票据号": response["data"]["billNumber"],
"票据代码": response["data"]["billCode"],
"开票日期": datetime.strptime(
response["data"]["invoiceDate"], "%Y-%m-%d %H:%M:%S"
), # 深圳快瞳票据查验接口中开票日期由字符串转为datetime对象
"入院日期": (
datetime.strptime(
response["data"]["hospitalizationDate"].split(
"-"
)[0],
"%Y%m%d",
)
if response["data"]["hospitalizationDate"]
else None
), # 深圳快瞳票据查验接口中住院日期解析为入院日期和出院日期
"出院日期": (
datetime.strptime(
response["data"]["hospitalizationDate"].split(
"-"
)[1],
"%Y%m%d",
)
if response["data"]["hospitalizationDate"]
else None
),
"校验码": response["data"]["checkCode"],
"开票金额": Decimal(
response["data"]["amount"]
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
"姓名": response["data"]["payer"],
"购药及就医机构": response["data"][
"receivablesInstitution"
],
"明细项": [
{
"名称": item["itemName"],
"数量": Decimal(item["number"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
"金额": Decimal(item["totalAmount"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
}
for item in response["data"]["feeitems"]
],
"个人自费": Decimal(
response["data"]["personalExpense"]
if response["data"]["personalExpense"]
else Decimal("0.00")
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
"个人自付": Decimal(
response["data"]["personalPay"]
if response["data"]["personalPay"]
else Decimal("0.00")
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
"医保支付": (
Decimal(response["data"]["medicarePay"])
if response["data"]["medicarePay"]
else Decimal("0.00")
+ Decimal(
response["data"]["otherPayment"]
if response["data"]["otherPayment"]
else Decimal("0.00")
)
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
), # 包括医保统筹基金支付和其它支付(例如,退休补充支付)
}
)
# 若查验结果为假票或无法查验则再请求深圳快瞳票据识别接口接整合至赔案档案
else:
receipt["查验结果"] = (
"假票"
if response.get("status") == 400
and (response.get("code") == 10100 or response.get("code") == 10001)
else "无法查验"
) # 假票:查无此票或查验成功五要素不一致
match image["影像件类型"]:
case "增值税发票":
# 请求深圳快瞳增值税发票识别接口
response = http_client.post(
url=(
url := "https://ai.inspirvision.cn/s/api/ocr/vatInvoice"
),
headers={
"X-RequestId-Header": image["影像件唯一标识"]
}, # 以影像件唯一标识作为请求唯一标识,用于双方联查
data={
"token": authenticator.get_token(
servicer="szkt"
), # 获取深圳快瞳访问令牌
"imgBase64": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}",
},
guid=md5((url + image["影像件唯一标识"]).encode("utf-8"))
.hexdigest()
.upper(),
)
# TODO: 若响应非成功则流转至人工处理
if not (
response.get("status") == 200 and response.get("code") == 0
):
raise RuntimeError("请求深圳快瞳增值税发票识别接口发生异常")
# noinspection PyTypeChecker
receipt.update(
{
"票据号": query_value(response["data"], "发票号码"),
"票据代码": query_value(response["data"], "发票代码"),
"开票日期": datetime.strptime(
query_value(response["data"], "开票日期"),
"%Y年%m月%d",
),
"校验码": query_value(response["data"], "校验码"),
"开票金额": Decimal(
query_value(response["data"], "小写金额").replace(
"¥", ""
)
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
"姓名": query_value(response["data"], "购买方名称"),
"购药及就医机构": query_value(
response["data"], "销售方名称"
),
"明细项": parse_items(response["data"]),
"备注": query_value(response["data"], "备注"),
}
)
case "医疗门诊收费票据" | "医疗住院收费票据":
# 请求深圳快瞳医疗收费票据识别接口
response = http_client.post(
url=(url := "https://ai.inspirvision.cn/s/api/ocr/medical"),
headers={
"X-RequestId-Header": image["影像件唯一标识"]
}, # 以影像件唯一标识作为请求唯一标识,用于双方联查
data={
"token": authenticator.get_token(
servicer="szkt"
), # 获取深圳快瞳访问令牌
"imgBase64": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}",
},
guid=md5((url + image["影像件唯一标识"]).encode("utf-8"))
.hexdigest()
.upper(),
)
# TODO: 若响应非成功则流转至人工处理
if not (
response.get("status") == 200 and response.get("code") == 0
):
raise RuntimeError(
"请求深圳快瞳医疗收费票据识别接口发生异常"
)
# noinspection PyTypeChecker
receipt.update(
{
"票据号": (
receipt := (
response["data"]["insured"][
(
"receipt_hospitalization"
if image["影像件类型"]
== "医疗门诊收费票据"
else "receipt_outpatient"
)
]
)["receipts"][0]
)["receipt_no"][
"value"
], # 默认为第一张票据
"票据代码": receipt["global_detail"]["invoice_code"][
"value"
],
"开票日期": datetime.strptime(
receipt["global_detail"]["invoice_date"]["value"],
"%Y-%m-%d",
),
"校验码": query_value(
receipt["global_detail"]["region_specific"],
"校验码",
),
"开票金额": Decimal(
receipt["total_amount"]["value"]
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
"姓名": receipt["name"]["value"],
"购药及就医机构": receipt["hospital_name"]["value"],
"明细项": [
{
"名称": field["value"],
"数量": Decimal(field["value"]).quantize(
Decimal("0.00"),
rounding=(
ROUND_HALF_UP
if isinstance(
(field := item["number"]), dict
)
else None
),
),
"金额": (
format(
Decimal(field["value"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
".2f",
)
if isinstance(
(field := item["total_amount"]), dict
)
else None
),
}
for item in receipt["feeitems"]
],
"医保支付": (
format(
Decimal(field["value"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
".2f",
)
if isinstance(
(field := receipt.get("medicare_pay")), dict
)
else None
),
"其它支付": format(
(
Decimal(value).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
)
if (
value := fuzzy_match(
target="其它支付",
components=receipt.get(
"global_detail", {}
).get("pay_list", []),
specify_key="name",
return_key="word.value",
)
)
else None
),
".2f",
),
"个人自付": (
format(
Decimal(field["value"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
".2f",
)
if isinstance(
(field := receipt.get("self_pay")), dict
)
else None
),
"自付一": (
format(
Decimal(field["value"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
".2f",
)
if isinstance(
field := (receipt.get("self_pay_one")), dict
)
else None
),
"自付二": (
format(
Decimal(field["value"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
".2f",
)
if isinstance(
field := (receipt.get("self_pay_two")), dict
)
else None
),
"个人自费": (
format(
Decimal(field["value"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
),
".2f",
)
if isinstance(
field := (receipt.get("self_cost")), dict
)
else None
),
"住院日期": (
datetime.strptime(
field["value"], "%Y%m%d"
).strftime("%Y-%m-%d")
if isinstance(
field := (receipt.get("starttime")), dict
)
else None
),
"出院日期": (
datetime.strptime(
field["value"], "%Y%m%d"
).strftime("%Y-%m-%d")
if isinstance(
field := (receipt.get("endtime")), dict
)
else None
),
"医疗机构类型": receipt["others"][
"medical_institution_type"
]["value"],
}
)
# noinspection PyShadowingNames
def bank_card_recognize(image) -> None:
    """
    Recognize a bank card and merge the result into the claim dossier.

    :param image: image record
    :return: None
    :raises RuntimeError: if the API response is not successful or the card is not a debit card
    """
    # Request the Shenzhen Kuaitong bank card recognition API
    url = "https://ai.inspirvision.cn/s/api/ocr/bankCard"
    image_extension = image["影像件格式"].lstrip(".")
    response = http_client.post(
        url=url,
        headers={"X-RequestId-Header": image["影像件唯一标识"]},
        data={
            # Access token of Shenzhen Kuaitong
            "token": authenticator.get_token(servicer="szkt"),
            "imgBase64": f"data:image/{image_extension};base64,{image['影像件BASE64编码']}",
        },
        guid=md5((url + image["影像件唯一标识"]).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route to manual handling when the response is not successful
    if not (
        response.get("status") == 200
        and response.get("code") == 0
        # Only debit cards are accepted, as in production
        and response.get("data", {}).get("bankCardType") == 1
    ):
        raise RuntimeError("请求深圳快瞳银行卡识别接口发生异常或非借记卡")

    beneficiary = dossier["受益人层"]
    # noinspection PyTypeChecker
    beneficiary.update(
        {
            "开户行": response["data"]["bankInfo"],
            "户号": response["data"]["cardNo"].replace(" ", ""),
        }
    )
    # The card carries neither account holder nor phone number. Application forms
    # are processed before bank cards, so default these only when still absent
    # instead of wiping values the application form supplied.
    beneficiary.setdefault("户名", "")
    beneficiary.setdefault("手机号", "")
# 影像件识别使能检查,若影像件不识别则跳过
if not recognize_enable.evaluate(
{
"insurer_company": insurer_company,
"image_type": image["影像件类型"],
}
)["result"]["recognize_enable"]:
return
# 根据影像件类型匹配影像件识别方法
match image["影像件类型"]:
# TODO: 后续添加居民户口簿识别和整合方法
case "居民户口簿":
raise RuntimeError("暂不支持居民户口簿")
case (
"居民身份证(国徽、头像面)" | "居民身份证(国徽面)" | "居民身份证(头像面)"
):
# 居民身份证识别并整合至赔案档案
identity_card_recognize(image, insurance_company, insurer_company)
# TODO: 后续添加居民户口簿识别和整合方法
case "中国港澳台地区及境外护照":
raise RuntimeError("暂不支持中国港澳台地区及境外护照")
# TODO: 暂仅支持增值税发票识别且购药及就医类型为药店购药整合至赔案档案,后续逐步添加
case "理赔申请书":
application_recognize(image, insurer_company)
case "增值税发票" | "医疗门诊收费票据" | "医疗住院收费票据":
# 票据识别并整合至赔案档案
receipt_recognize(image)
case "银行卡":
# 银行卡识别并整合至赔案档案
bank_card_recognize(image)
# 遍历工作目录中赔案目录并创建赔案档案(模拟自动化域就待自动化任务创建理赔档案)
# Processing order of the image types: household register, ID cards, passport and
# application form come first because they provide the insured person data.
# Built once instead of on every sort-key call.
image_type_order = [
    "居民户口簿",
    "居民身份证(国徽面)",
    "居民身份证(头像面)",
    "居民身份证(国徽、头像面)",
    "中国港澳台地区及境外护照",
    "理赔申请书",
    "增值税发票",
    "医疗门诊收费票据",
    "医疗住院收费票据",
    "医疗费用清单",
    "银行卡",
    "其它",
]
# Walk the case directories of the working directory and build one claim dossier
# per case
for case_path in [x for x in directory_path.iterdir() if x.is_dir()]:
    # The insurer would provide these values; they are fixed here
    insurance_company = "中国银行股份有限公司昆山分行"  # insuring company
    insurer_company = "中银保险有限公司苏州分公司"  # insurer branch company
    # The case directory name is the claim number; `.name` keeps names that
    # contain a dot intact, which `.stem` would truncate
    case_number = case_path.name
    # Initialize the claim dossier
    dossier = {
        "报案层": {
            "投保公司": insurance_company,
            "保险分公司": insurer_company,
            "报案时间": datetime(2025, 7, 25, 12, 0, 0),  # fixed report time
            "赔案号": case_number,
        },
        "影像件层": [],
        "出险人层": {},
        "被保人层": [],
        "受益人层": {},
    }
    # Images of the case: only JPG, JPEG and PNG are supported, ordered by st_ctime
    # NOTE(review): on Unix st_ctime is the last metadata change, not the
    # creation time — confirm the intended ordering
    image_paths = sorted(
        [
            x
            for x in case_path.glob(pattern="*")
            if x.is_file() and x.suffix.lower() in [".jpg", ".jpeg", ".png"]
        ],
        key=lambda x: x.stat().st_ctime,
    )
    for image_index, image_path in enumerate(image_paths, 1):
        image_format = image_path.suffix.lower()
        # Initialize the image record
        image = {
            "影像件编号": image_index,
            "影像件地址": image_path.as_posix(),  # path as string
            "影像件名称": image_path.stem,
            "影像件格式": image_format,
        }
        # Open and read the image from the local disk
        image_ndarray = image_read(image_path)
        # Derive the unique identifier of the image
        # noinspection PyTypeChecker
        image_guid = image_serialize(image_format, image_ndarray)
        image["影像件唯一标识"] = image_guid
        # Classify the image and rotate it upright
        image_base64, image_type = image_classify(
            image_guid, image_format, image_ndarray
        )
        image["影像件BASE64编码"] = image_base64
        image["影像件类型"] = image_type
        dossier["影像件层"].append(image)
    # Order the images by image type
    dossier["影像件层"].sort(
        key=lambda x: image_type_order.index(x["影像件类型"])
    )
    # Recognize every image and merge the result into the claim dossier
    for image in dossier["影像件层"]:
        image_recognize(
            image,
            insurance_company,
            insurer_company,
        )
"""
case "增值税发票" | "门诊收费票据" | "住院收费票据":
extraction = invoice_extraction()
# 若发生异常则跳过该影像件
if extraction is None:
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
continue
dossier["发票层"].append(
{
"关联影像件序号": image_index,
"票据类型": extraction["票据类型"],
"票据号码": extraction["票据号码"],
"票据代码": (
extraction["票据代码"]
if extraction["票据代码"]
else "--"
), # 数电票无票据代码,校验码同票据号码
"开票日期": datetime.strptime(
extraction["开票日期"], "%Y-%m-%d"
),
"校验码后六位": (
check_code[-6:]
if (check_code := extraction["校验码"])
else "--"
),
"医药机构": extraction["收款方"],
"就诊人": (
match.group("name")
if (
match := re.search(
r"^(?P<name>[^(]+)", extraction["付款方"]
)
)
else extraction["付款方"]
),
"票据金额": Decimal(extraction["票据金额"]).quantize(
Decimal("0.00"), rounding=ROUND_HALF_UP
), # 默认金额转为小数,保留两位小数
"查验状态": extraction["查验状态"],
"项目": (
pandas.DataFrame(extraction["项目"])
.assign(
数量=lambda dataframe: dataframe["数量"].apply(
lambda row: (
Decimal(row).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
)
if row
else Decimal("0.00")
)
),
金额=lambda dataframe: dataframe["金额"].apply(
lambda row: (
Decimal(row).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
)
if row
else Decimal("0.00")
)
),
)
.groupby(by="名称", as_index=False)
.agg(数量=("数量", "sum"), 金额=("金额", "sum"))
.assign(
大项=lambda dataframe: dataframe["名称"].apply(
lambda row: (
match.group("category")
if (
match := re.match(
r"^\*(?P<category>.+?)\*.*$",
row,
)
)
else row
)
),
小项=lambda dataframe: dataframe["名称"].apply(
lambda row: (
re.sub(
r"[^\u4e00-\u9fa5a-zA-Z0-9./%*]",
"",
match.group("name"),
)
if (
match := re.match(
r"^\*.+?\*(?:\[[^]]+])?(?P<name>[^\s(]+)(?:\([^\s(]+\))?(?:.*?)?$",
row,
)
)
else ""
)
),
)
.loc[
lambda dataframe: dataframe["金额"] != 0,
["名称", "大项", "小项", "数量", "金额"],
]
.to_dict(orient="records")
),
"就诊类型": (
"药店购药"
if "增值税发票" in image_type
else (
"门诊就诊"
if "门诊收费票据" in image_type
else "住院治疗"
)
),
}
)
case "理赔申请书":
# 根据保险总公司匹配理赔申请书
# noinspection PyUnreachableCode
match insurer:
case "中银保险有限公司":
extraction = common_extraction(
application_form="中行员工福利保障计划索赔申请书"
)
# 若识别异常则跳过该影像件
if extraction is None:
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
continue
dossier["赔案层"]["申请人信息"].update(
{
"与被保险人关系": "本人", # 中银保险有限公司:默认申请人与被保险人关系为本人
"年龄": (
Decimal(age).quantize(
Decimal("0"),
rounding=ROUND_HALF_UP,
)
if (
age := extraction.get("基础信息", {}).get(
"年龄", "--"
)
).isdigit()
else age
), # 若年龄仅数字则转为小数、取整,否则默认为“--”
"手机号": (
phone_number
if re.match(
r"^1[3-9]\d{9}$",
phone_number := extraction.get(
"基础信息", {}
).get("手机", "--"),
)
else phone_number
), # 若手机未正则匹配手机号格式则为“--”
}
)
dossier["赔案层"]["受益人信息"].update(
{
"与被保险人关系": "本人", # 中银保险有限公司:默认受益人与被保人关系为本人
"户名": (
account_name
if (
account_name := extraction.get(
"基础信息", {}
).get("户名")
)
else "--"
), # 若户名为NONE则为“--”
"开户银行": (
account_name
if (
account_name := extraction.get(
"基础信息", {}
).get("开户银行")
)
else "--"
), # 若开户银行为NONE则为“--”
"银行账号": (
account_name
if (
account_name := extraction.get(
"基础信息", {}
).get("账号")
)
is not None
else "--"
), # 若银行账号为NONE则为“--”
}
)
dossier["赔案层"]["其它信息"]["自述症状"] = (
("".join(diagnoses))
if (
diagnoses := sorted(
set(
"".join(
[
diagnosis
for invoice in extraction.get(
"票据表格", []
)
if (
diagnosis := invoice.get("诊断")
)
]
).split("")
)
)
)
else "--"
)
case _:
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
continue
case _:
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
continue
dossier["影像件层"][-1].update(
{
"已识别": "",
"识别结果": extraction,
}
)
# 发票层根据开票日期顺序排序
dossier["发票层"] = sorted(
dossier["发票层"], key=lambda x: (x["开票日期"], x["票据号码"])
)
# 构建小项层
# noinspection PyTypeChecker
dossier["小项层"] = (
pandas.DataFrame(
[
{
"小项": item["小项"],
"数量": item["数量"],
"金额": item["金额"],
}
for invoice in dossier["发票层"]
for item in invoice["项目"]
]
)
.groupby(by="小项", as_index=False)
.agg(数量=("数量", "sum"), 金额=("金额", "sum"))
.to_dict(orient="records")
)
for invoice in dossier["发票层"]:
# noinspection PyTypeChecker
invoice["推定疾病"] = disease_diagnosis(
items="".join(sorted(set([item["小项"] for item in invoice["项目"]])))
)
print(dossier)
exit()
with open(f"dossiers/{case_number}.html", "w", encoding="utf-8") as file:
file.write(
template.render(
{
"dossier": dossier,
}
)
)
"""