1568 lines
69 KiB
Python
1568 lines
69 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
普康健康_自动化录入
|
||
|
||
--优先使用深圳快瞳,就增值税发票、医疗发票优先使用深圳快瞳票据查验、其次使用深圳快瞳票据识别,最后使用本地识别
|
||
--优先考虑增值税发票
|
||
|
||
"""
|
||
import hashlib
|
||
import json
|
||
import re
|
||
import uuid
|
||
from base64 import b64encode
|
||
from datetime import datetime
|
||
from decimal import Decimal, ROUND_HALF_UP
|
||
from pathlib import Path
|
||
|
||
import cv2
|
||
import numpy
|
||
import pandas
|
||
from dateutil.parser import parse
|
||
from jinja2 import Environment, FileSystemLoader
|
||
from jionlp import parse_location
|
||
from zen import ZenDecision, ZenEngine
|
||
|
||
from utils.client import Authenticator, HTTPClient
|
||
from utils.ocr import fuzzy_match
|
||
|
||
|
||
# -------------------------
|
||
# 工具函数
|
||
# -------------------------
|
||
|
||
|
||
def images_compression(**kwargs) -> tuple[str | None, str | None]:
|
||
"""影像件压缩并BASE64编码"""
|
||
|
||
# 影像件格式
|
||
image_format = kwargs.get("image_format", globals()["image_format"])
|
||
if image_format is None:
|
||
return None, None
|
||
|
||
# 影像件
|
||
image = kwargs.get("image", globals()["image"])
|
||
if image is None:
|
||
return None, None
|
||
|
||
# 编码为图像字节流
|
||
success, image_bytes = cv2.imencode(ext=f".{image_format}", img=image)
|
||
# 若发生异常则返回NONE
|
||
if not success or image_bytes is None:
|
||
return None, None
|
||
|
||
# 生成影像件唯一标识
|
||
image_guid = hashlib.md5(image_bytes.tobytes()).hexdigest().upper()
|
||
|
||
# BASE64编码
|
||
image_base64 = b64encode(image_bytes.tobytes()).decode("utf-8")
|
||
|
||
# 将指定影像件大小单位由MB转为B
|
||
image_size_specified = kwargs.get("image_size_specified", 2) * 1048576
|
||
|
||
# 若影像件BASE64编码后大小小于指定影像件大小则返回
|
||
if len(image_base64) <= image_size_specified:
|
||
return image_guid, image_base64
|
||
|
||
# 双循环压缩影像件
|
||
# 外循环压缩:通过降低影像件质量实现压缩影像件大小
|
||
for quality in range(90, 0, -10):
|
||
|
||
image_copy = image.copy()
|
||
|
||
# 内循环压缩:通过等比例调整影像件尺寸实现压缩影像件大小
|
||
for i in range(25):
|
||
|
||
success, image_bytes = cv2.imencode(
|
||
ext=f".{image_format}",
|
||
img=image_copy,
|
||
params=(
|
||
[cv2.IMWRITE_PNG_COMPRESSION, 10 - quality // 10]
|
||
if image_format == "png"
|
||
else [cv2.IMWRITE_JPEG_QUALITY, quality]
|
||
),
|
||
)
|
||
if not success or image_bytes is None:
|
||
break
|
||
|
||
image_base64 = b64encode(image_bytes.tobytes()).decode("utf-8")
|
||
if len(image_base64) <= image_size_specified:
|
||
return image_guid, image_base64
|
||
|
||
# 调整影像件尺寸
|
||
image_copy = cv2.resize(
|
||
src=image_copy,
|
||
dsize=(int(image_copy.shape[0] * 0.9), int(image_copy.shape[1] * 0.9)),
|
||
interpolation=cv2.INTER_AREA,
|
||
)
|
||
|
||
# 若调整后影像件尺寸中长或宽小于200像素则停止调整影像件尺寸
|
||
if min(image_copy.shape[:2]) < 200:
|
||
break
|
||
|
||
# 若仍未压缩至指定影像件大小则返回NONE
|
||
return None, None
|
||
|
||
|
||
def images_classification(**kwargs) -> tuple[str | None, str | None]:
|
||
"""影像件分类"""
|
||
|
||
# 影像件全局唯一标识:优先使用关键词变量,其次使用全局变量,再次使用随机唯一标识
|
||
image_guid = kwargs.get(
|
||
"image_guid", globals().get("image_guid", uuid.uuid4().hex.upper())
|
||
)
|
||
|
||
# 影像件格式
|
||
image_format = kwargs.get("image_format", globals()["image_format"])
|
||
if image_format is None:
|
||
return None, None
|
||
|
||
# 影像件BASE64编码
|
||
image_base64 = kwargs.get("image_base64", globals()["image_base64"])
|
||
if image_base64 is None:
|
||
return None, None
|
||
|
||
# 请求深圳快瞳影像件分类接口
|
||
response = globals()["http_client"].post(
|
||
url=(url := "https://ai.inspirvision.cn/s/api/ocr/genalClassify"),
|
||
# 用于和深圳快瞳联查定位
|
||
headers={"X-RequestId-Header": image_guid},
|
||
data={
|
||
"token": globals()["authenticator"].get_token(
|
||
servicer="szkt"
|
||
), # 使用全局变量
|
||
"imgBase64": f"data:image/{image_format};base64,{image_base64}", # 深圳快瞳要求修饰影像件BASE64编码的DATAURI
|
||
},
|
||
guid=hashlib.md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
|
||
)
|
||
|
||
# 若响应非成功,则返回NONE
|
||
if not (response.get("status") == 200 and response.get("code") == 0):
|
||
return None, None
|
||
|
||
# 根据票据类型和票据二级分类映射影像件类型
|
||
match (response["data"]["flag"], response["data"]["type"]):
|
||
case (7, "idcard-front-back"):
|
||
image_type = "居民身份证(正背面)"
|
||
case (7, "idcard-front"):
|
||
image_type = "居民身份证(正面)" # 包含国徽一面
|
||
case (7, "idcard-back"):
|
||
image_type = "居民身份证(背面)" # 包含头像一面
|
||
case (8, _):
|
||
image_type = "银行卡"
|
||
case (4, _):
|
||
image_type = "增值税发票"
|
||
case (5, _):
|
||
image_type = "门诊收费票据"
|
||
case (3, _):
|
||
image_type = "住院收费票据"
|
||
case (18, _):
|
||
image_type = "理赔申请书"
|
||
case _:
|
||
return None, None
|
||
|
||
# 影像件方向
|
||
image_orientation = {
|
||
"0": "0度",
|
||
"90": "顺时针90度",
|
||
"180": "180度",
|
||
"270": "逆时针90度",
|
||
}.get(response["data"]["angle"], "0度")
|
||
|
||
return image_type, image_orientation
|
||
|
||
|
||
def idcard_extraction(**kwargs) -> dict | None:
|
||
"""居民身份证数据提取"""
|
||
|
||
# 影像件全局唯一标识:优先使用关键词变量,其次使用全局变量,再次使用随机唯一标识
|
||
image_guid = kwargs.get(
|
||
"image_guid", globals().get("image_guid", uuid.uuid4().hex.upper())
|
||
)
|
||
|
||
# 影像件格式
|
||
image_format = kwargs.get("image_format", globals()["image_format"])
|
||
if image_format is None:
|
||
return None
|
||
|
||
# 影像件BASE64编码
|
||
image_base64 = kwargs.get("image_base64", globals()["image_base64"])
|
||
if image_base64 is None:
|
||
return None
|
||
|
||
# 请求深圳快瞳居民身份证识别接口
|
||
response = globals()["http_client"].post(
|
||
url=(url := "https://ai.inspirvision.cn/s/api/ocr/identityCard"),
|
||
headers={"X-RequestId-Header": image_guid},
|
||
data={
|
||
"token": globals()["authenticator"].get_token(
|
||
servicer="szkt"
|
||
), # 使用全局变量
|
||
"imgBase64": f"data:image/{image_format};base64,{image_base64}",
|
||
}, # 支持同时识别居民身份证正反面
|
||
guid=hashlib.md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
|
||
)
|
||
|
||
# 若响应非成功,则返回NONE
|
||
if not (response.get("status") == 200 and response.get("code") == 0):
|
||
return None
|
||
|
||
extraction = {
|
||
"姓名": response["data"]["name"],
|
||
"性别": response["data"]["sex"],
|
||
"民族": response["data"]["nation"],
|
||
"出生": response["data"][
|
||
"birthday"
|
||
], # 深圳快瞳居民身份证出生日期格式为%Y-%m-%d
|
||
"住址": response["data"]["address"],
|
||
"公民身份号码": response["data"]["idNo"],
|
||
"签发机关": response["data"]["issuedBy"],
|
||
"有效期起": parse(
|
||
(date := response["data"]["validDate"]).split("-")[0]
|
||
).strftime(
|
||
"%Y-%m-%d"
|
||
), # 深圳快瞳居民身份证识别中有效期日期格式为%Y.%m.%d,转为%Y-%m-%d
|
||
"有效期止": (
|
||
date
|
||
if (date := date.split("-")[1]) == "长期"
|
||
else parse(date).strftime("%Y-%m-%d")
|
||
),
|
||
}
|
||
|
||
return extraction
|
||
|
||
|
||
def bankcard_extraction(**kwargs) -> dict | None:
|
||
"""银行卡数据提取"""
|
||
|
||
# 影像件全局唯一标识:优先使用关键词变量,其次使用全局变量,再次使用随机唯一标识
|
||
image_guid = kwargs.get(
|
||
"image_guid", globals().get("image_guid", uuid.uuid4().hex.upper())
|
||
)
|
||
|
||
# 影像件格式
|
||
image_format = kwargs.get("image_format", globals()["image_format"])
|
||
if image_format is None:
|
||
raise RuntimeError("请入参:image_format")
|
||
|
||
# 影像件BASE64编码
|
||
image_base64 = kwargs.get("image_base64", globals()["image_base64"])
|
||
if image_base64 is None:
|
||
raise RuntimeError("请入参:image_base64")
|
||
|
||
# 请求深圳快瞳银行卡识别接口
|
||
response = globals()["http_client"].post(
|
||
url=(url := "https://ai.inspirvision.cn/s/api/ocr/bankCard"),
|
||
headers={"X-RequestId-Header": image_guid},
|
||
data={
|
||
"token": globals()["authenticator"].get_token(servicer="szkt"),
|
||
"imgBase64": f"data:image/{image_format};base64,{image_base64}",
|
||
},
|
||
guid=hashlib.md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
|
||
)
|
||
|
||
# 若响应非成功,则返回NONE
|
||
if not (response.get("status") == 200 and response.get("code") == 0):
|
||
return None
|
||
|
||
extraction = {
|
||
"卡类型": {"1": "借记卡", "2": "贷记卡"}.get(
|
||
response["data"]["bankCardType"], "其它"
|
||
), # 0不能识别、3准贷记卡、4预付卡合并为其它
|
||
"银行名称": response["data"]["bankInfo"],
|
||
"卡号": response["data"]["cardNo"].replace(" ", ""),
|
||
}
|
||
|
||
return extraction
|
||
|
||
|
||
def invoice_extraction(**kwargs) -> dict | None:
|
||
"""增值税发票/收费票据数据提取"""
|
||
|
||
# 影像件全局唯一标识:优先使用关键词变量,其次使用全局变量,再次使用随机唯一标识
|
||
image_guid = kwargs.get(
|
||
"image_guid", globals().get("image_guid", uuid.uuid4().hex.upper())
|
||
)
|
||
|
||
# 影像件格式
|
||
image_format = kwargs.get("image_format", globals()["image_format"])
|
||
if image_format is None:
|
||
return None
|
||
|
||
# 影像件BASE64编码
|
||
image_base64 = kwargs.get("image_base64", globals()["image_base64"])
|
||
if image_base64 is None:
|
||
return None
|
||
|
||
try:
|
||
# 请求深圳快瞳票据查验接口(兼容增值税发票、医疗门诊/住院收费票据)
|
||
response = globals()["http_client"].post(
|
||
url=(url := "https://ai.inspirvision.cn/s/api/ocr/invoiceCheckAll"),
|
||
headers={"X-RequestId-Header": image_guid},
|
||
data={
|
||
"token": globals()["authenticator"].get_token(servicer="szkt"),
|
||
"imgBase64": f"data:image/{image_format};base64,{image_base64}",
|
||
},
|
||
guid=hashlib.md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
|
||
)
|
||
if not (response.get("status") == 200 and response.get("code") == 10000):
|
||
raise RuntimeError("深圳快瞳票据查验发生异常")
|
||
|
||
match response["data"]["productCode"]:
|
||
case "003082": # 增值税发票
|
||
extraction = {
|
||
"票据类型": {
|
||
"10108": "数电票",
|
||
"10101": "增值税普通发票",
|
||
"10100": "增值税专用发票",
|
||
"30100": "数电票",
|
||
"30101": "数电票",
|
||
"30104": "增值税专用发票",
|
||
"30105": "数电票",
|
||
"10106": "区块链电子发票",
|
||
"30109": "数电票",
|
||
"30121": "增值税普通发票",
|
||
"10102": "增值税普通发票",
|
||
"10103": "增值税普通发票",
|
||
"10107": "数电票",
|
||
}.get(response["data"]["type"], "其它增值税发票"),
|
||
"票据号码": response["data"]["details"]["number"],
|
||
"票据代码": (
|
||
code if (code := response["data"]["details"]["code"]) else None
|
||
), # 深圳快瞳票据查验中数电票票据代码为空字符,转为NONE
|
||
"开票日期": datetime.strptime(
|
||
response["data"]["details"]["date"], "%Y年%m月%d日"
|
||
).strftime(
|
||
"%Y-%m-%d"
|
||
), # 深圳快瞳票据查验中就增值税发票开票日期格式为%Y年%m月%d日,转为%Y-%m-%d
|
||
"校验码": response["data"]["details"]["check_code"],
|
||
"收款方": response["data"]["details"]["seller"],
|
||
"付款方": response["data"]["details"]["buyer"],
|
||
"票据金额": format(
|
||
Decimal(response["data"]["details"]["total"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"查验状态": (
|
||
"真票"
|
||
if response["data"]["details"]["invoiceTypeNo"] == "0"
|
||
else "红票"
|
||
),
|
||
"备注": (
|
||
remark
|
||
if (remark := response["data"]["details"]["remark"])
|
||
else None
|
||
), # 深圳快瞳票据查验中增值税发票备注可能为空字符,转为NONE
|
||
"项目": [
|
||
{
|
||
"名称": item["name"],
|
||
"规格": (
|
||
specification
|
||
if (specification := item["specification"])
|
||
else None
|
||
),
|
||
"单位": unit if (unit := item["unit"]) else None,
|
||
"数量": (
|
||
format(
|
||
Decimal(quantity).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if (quantity := item["quantity"])
|
||
else None
|
||
),
|
||
"金额": format(
|
||
(
|
||
Decimal(item["total"]) + Decimal(item["tax"])
|
||
).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
), # 价税合计
|
||
}
|
||
for item in response["data"]["details"].get("items", [])
|
||
],
|
||
} # 深圳快瞳票据查验中就部分增值税发票仅可查,数据标准化抛出异常
|
||
return extraction
|
||
|
||
case "003081": # 门诊/住院收费票据
|
||
extraction = {
|
||
"票据类型": (
|
||
"门诊收费票据"
|
||
if "门诊" in response["data"]["billName"]
|
||
else "住院收费票据"
|
||
),
|
||
"票据号码": response["data"]["billNumber"],
|
||
"票据代码": response["data"]["billCode"],
|
||
"开票日期": response["data"][
|
||
"invoiceDate"
|
||
], # 深圳快瞳票据查验中就收费票据开票日期格式为%Y-%m-%d
|
||
"校验码": response["data"]["checkCode"],
|
||
"收款方": response["data"]["payeeName"],
|
||
"付款方": response["data"]["payer"],
|
||
"票据金额": format(
|
||
Decimal(response["data"]["amount"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"查验状态": {"true": "真票", "false": "红票"}[
|
||
response["data"]["flushedRed"]
|
||
],
|
||
"备注": response["data"].get("remark"),
|
||
"医保支付": format(
|
||
Decimal(response["data"].get("medicarePay", "0.00")).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"其它支付": format(
|
||
Decimal(response["data"].get("otherPayment", "0.00")).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"个人自付": format(
|
||
Decimal(response["data"].get("personalPay", "0.00")).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"自付一": format(
|
||
Decimal(response["data"].get("self_pay_one", "0.00")).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
), # 深圳快瞳票据查验中就部分地区无自付一
|
||
"自付二": format(
|
||
Decimal(
|
||
response["data"].get("classificationPays", "0.00")
|
||
).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
), # 深圳快瞳票据查验中就部分地区无自付二
|
||
"个人自费": format(
|
||
Decimal(
|
||
response["data"].get("personalExpense", "0.00")
|
||
).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"住院日期": (
|
||
parse(date.split("-")[0]).strftime("%Y-%m-%d")
|
||
if (date := response["data"].get("hospitalizationDate"))
|
||
else None
|
||
), # 深圳快瞳票据查验中就收费票据住院日期格式为%Y%m%d-%Y%m%d,即住院日期-出院日期
|
||
"出院日期": (
|
||
parse(date.split("-")[1]).strftime("%Y-%m-%d") if date else None
|
||
),
|
||
"医疗机构类型": response["data"]["institutionsType"],
|
||
"项目": [
|
||
{
|
||
"名称": item["itemName"],
|
||
"规格": item[
|
||
"medical_level"
|
||
], # 甲类无自付、乙类有自付、丙类全自付
|
||
"单位": item["unit"],
|
||
"数量": format(
|
||
Decimal(item["number"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"金额": format(
|
||
Decimal(item["totalAmount"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
}
|
||
for item in response["data"]["feedetails"]
|
||
],
|
||
}
|
||
return extraction
|
||
# 若请求深圳快瞳票据查验接口或解析发生异常,则根据影像件类型请求深圳快瞳增值税发票/收费票据识别接口
|
||
except:
|
||
# 影像件类型
|
||
image_type = kwargs.get("image_type", globals()["image_type"])
|
||
if image_type is None:
|
||
return None
|
||
|
||
match image_type:
|
||
case "增值税发票":
|
||
try:
|
||
# 请求深圳快瞳增值税发票识别接口
|
||
response = globals()["http_client"].post(
|
||
url=(url := "https://ai.inspirvision.cn/s/api/ocr/vatInvoice"),
|
||
headers={"X-RequestId-Header": image_guid},
|
||
data={
|
||
"token": globals()["authenticator"].get_token(
|
||
servicer="szkt"
|
||
),
|
||
"imgBase64": f"data:image/{image_format};base64,{image_base64}",
|
||
},
|
||
guid=hashlib.md5((url + image_guid).encode("utf-8"))
|
||
.hexdigest()
|
||
.upper(),
|
||
)
|
||
# 若深圳快瞳增值税发票识别响应非成功则返回NONE
|
||
if not (
|
||
response.get("status") == 200 and response.get("code") == 0
|
||
):
|
||
return None
|
||
|
||
extraction = {
|
||
"票据类型": (
|
||
invoice_type := (
|
||
data := {
|
||
item["desc"]: item["value"]
|
||
for item in response["data"]
|
||
}
|
||
).get("发票类型")
|
||
),
|
||
"票据号码": (number := data.get("发票号码")),
|
||
"票据代码": data.get("发票代码"),
|
||
"开票日期": (
|
||
datetime.strptime(date, "%Y年%m月%d日").strftime("%Y-%m-%d")
|
||
if re.match(
|
||
r"\d{4}年\d{1,2}月\d{1,2}日",
|
||
(date := data.get("开票日期")),
|
||
)
|
||
else date
|
||
),
|
||
"校验码": (
|
||
check_code if (check_code := data.get("校验码")) else number
|
||
), # 若校验码为空则默认为票据号码
|
||
"收款方": data.get("销售方名称"),
|
||
"付款方": data.get("购买方名称"),
|
||
"票据金额": format(
|
||
Decimal(
|
||
data.get("小写金额").replace("¥", "")
|
||
if invoice_type == "电子发票(普通发票)"
|
||
else data.get("合计金额(小写)")
|
||
).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"备注": remark if (remark := data.get("备注")) else None,
|
||
"项目": (
|
||
[
|
||
{
|
||
"名称": name,
|
||
"规格": specification if specification else None,
|
||
"单位": unit if unit else None,
|
||
"数量": (
|
||
format(
|
||
Decimal(quantity).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if quantity
|
||
else None
|
||
),
|
||
"金额": format(
|
||
(Decimal(amount) + Decimal(tax)).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f", # 价税合计,保留两位小数
|
||
),
|
||
}
|
||
for name, specification, unit, quantity, amount, tax in zip(
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^项目名称(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^规格型号(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^单位(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^数量(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^金额(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^税额(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
)
|
||
]
|
||
if invoice_type == "电子发票(普通发票)"
|
||
else [
|
||
{
|
||
"名称": name,
|
||
"数量": format(
|
||
Decimal(quantity).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
"0.2f",
|
||
),
|
||
"金额": format(
|
||
Decimal(amount).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
}
|
||
for name, quantity, amount in zip(
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^项目名称明细(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^项目数量明细(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
[
|
||
component["value"]
|
||
for component in response["data"]
|
||
if re.match(
|
||
r"^项目金额明细(\d+)?$",
|
||
component["desc"],
|
||
)
|
||
],
|
||
)
|
||
]
|
||
),
|
||
"查验状态": "无法查验",
|
||
}
|
||
return extraction
|
||
except:
|
||
return None
|
||
|
||
case _:
|
||
try:
|
||
# 请求深圳快瞳收费票据识别接口
|
||
response = globals()["http_client"].post(
|
||
url=(url := "https://ai.inspirvision.cn/s/api/ocr/medical"),
|
||
headers={"X-RequestId-Header": image_guid},
|
||
data={
|
||
"token": globals()["authenticator"].get_token(
|
||
servicer="szkt"
|
||
),
|
||
"imgBase64": f"data:image/{image_format};base64,{image_base64}",
|
||
},
|
||
guid=hashlib.md5((url + image_guid).encode("utf-8"))
|
||
.hexdigest()
|
||
.upper(),
|
||
)
|
||
# 若深圳快瞳收费票据识别响应非成功则返回NONE
|
||
if not (
|
||
response.get("status") == 200 and response.get("code") == 0
|
||
):
|
||
return None
|
||
|
||
extraction = {
|
||
"票据类型": (
|
||
"门诊收费票据"
|
||
if response["data"]["insured"]["receipt_outpatient"]
|
||
else "住院收费票据"
|
||
),
|
||
"票据号码": (
|
||
receipt := (
|
||
response["data"]["insured"]["receipt_outpatient"]
|
||
or response["data"]["insured"][
|
||
"receipt_hospitalization"
|
||
]
|
||
)["receipts"][0]
|
||
)["receipt_no"][
|
||
"value"
|
||
], # 默认提取门诊/住院收费票据的第一张票据
|
||
"票据代码": receipt["global_detail"]["invoice_code"]["value"],
|
||
"开票日期": receipt["global_detail"]["invoice_date"][
|
||
"value"
|
||
], # 深圳快瞳收费票据识别中就开票日期格式为%Y-%m-%d
|
||
"校验码": fuzzy_match(
|
||
target="校验码",
|
||
components=receipt["global_detail"]["region_specific"],
|
||
specify_key="name",
|
||
return_key="word.value",
|
||
),
|
||
"收款方": receipt["hospital_name"]["value"],
|
||
"付款方": receipt["name"]["value"],
|
||
"票据金额": format(
|
||
Decimal(receipt["total_amount"]["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
),
|
||
"医保支付": (
|
||
format(
|
||
Decimal(field["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if isinstance((field := receipt.get("medicare_pay")), dict)
|
||
else None
|
||
),
|
||
"其它支付": format(
|
||
(
|
||
Decimal(value).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
)
|
||
if (
|
||
value := fuzzy_match(
|
||
target="其它支付",
|
||
components=receipt.get("global_detail", {}).get(
|
||
"pay_list", []
|
||
),
|
||
specify_key="name",
|
||
return_key="word.value",
|
||
)
|
||
)
|
||
else None
|
||
),
|
||
".2f",
|
||
),
|
||
"个人自付": (
|
||
format(
|
||
Decimal(field["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if isinstance((field := receipt.get("self_pay")), dict)
|
||
else None
|
||
),
|
||
"自付一": (
|
||
format(
|
||
Decimal(field["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if isinstance(field := (receipt.get("self_pay_one")), dict)
|
||
else None
|
||
),
|
||
"自付二": (
|
||
format(
|
||
Decimal(field["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if isinstance(field := (receipt.get("self_pay_two")), dict)
|
||
else None
|
||
),
|
||
"个人自费": (
|
||
format(
|
||
Decimal(field["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if isinstance(field := (receipt.get("self_cost")), dict)
|
||
else None
|
||
),
|
||
"住院日期": (
|
||
datetime.strptime(field["value"], "%Y%m%d").strftime(
|
||
"%Y-%m-%d"
|
||
)
|
||
if isinstance(field := (receipt.get("starttime")), dict)
|
||
else None
|
||
),
|
||
"出院日期": (
|
||
datetime.strptime(field["value"], "%Y%m%d").strftime(
|
||
"%Y-%m-%d"
|
||
)
|
||
if isinstance(field := (receipt.get("endtime")), dict)
|
||
else None
|
||
),
|
||
"医疗机构类型": receipt["others"]["medical_institution_type"][
|
||
"value"
|
||
],
|
||
"项目": [
|
||
{
|
||
"名称": (
|
||
field["value"]
|
||
if isinstance((field := item["item_name"]), dict)
|
||
else None
|
||
),
|
||
"规格": (
|
||
field["value"]
|
||
if isinstance(
|
||
(field := item["specifications"]), dict
|
||
)
|
||
else None
|
||
),
|
||
"单位": (
|
||
field["value"]
|
||
if isinstance((field := item["unit"]), dict)
|
||
else None
|
||
),
|
||
"数量": (
|
||
format(
|
||
Decimal(field["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if isinstance((field := item["number"]), dict)
|
||
else None
|
||
),
|
||
"金额": (
|
||
format(
|
||
Decimal(field["value"]).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
),
|
||
".2f",
|
||
)
|
||
if isinstance((field := item["total_amount"]), dict)
|
||
else None
|
||
),
|
||
}
|
||
for item in receipt["feeitems"]
|
||
],
|
||
"查验状态": "无法查验",
|
||
}
|
||
return extraction
|
||
except:
|
||
return None
|
||
|
||
|
||
def common_extraction(**kwargs) -> dict | None:
|
||
"""通用数据提取"""
|
||
|
||
# 影像件全局唯一标识:优先使用关键词变量,其次使用全局变量,再次使用随机唯一标识
|
||
image_guid = kwargs.get(
|
||
"image_guid", globals().get("image_guid", uuid.uuid4().hex.upper())
|
||
)
|
||
|
||
# 影像件格式
|
||
image_format = kwargs.get("image_format", globals()["image_format"])
|
||
if image_format is None:
|
||
raise RuntimeError("请入参:image_format")
|
||
|
||
# 影像件BASE64编码
|
||
image_base64 = kwargs.get("image_base64", globals()["image_base64"])
|
||
if image_base64 is None:
|
||
raise RuntimeError("请入参:image_base64")
|
||
|
||
# 请求深圳快瞳通用文本识别接口
|
||
response = globals()["http_client"].post(
|
||
url=(url := "https://ai.inspirvision.cn/s/api/ocr/general"),
|
||
headers={"X-RequestId-Header": image_guid},
|
||
data={
|
||
"token": globals()["authenticator"].get_token(servicer="szkt"),
|
||
"imgBase64": f"data:image/{image_format};base64,{image_base64}",
|
||
},
|
||
guid=hashlib.md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
|
||
)
|
||
# 若响应非成功,则返回NONE
|
||
if not (response.get("status") == 200 and response.get("code") == 0):
|
||
return None
|
||
|
||
# 基于空间坐标法就识别结果中文本框进行分行排序
|
||
texts = []
|
||
# 重构文本框列表
|
||
for text in response["data"]:
|
||
texts.append(
|
||
[
|
||
# 文本框左上角的X坐标
|
||
numpy.float64(text["itemPolygon"]["x"]),
|
||
# 文本框左上角的Y坐标
|
||
numpy.float64(text["itemPolygon"]["y"]),
|
||
# 文本框的高度
|
||
numpy.float64(
|
||
text["itemPolygon"]["height"]
|
||
), # 深圳快瞳基于文本框的Y坐标最大值和最小值的差值
|
||
text["value"],
|
||
]
|
||
)
|
||
# 按照文本框Y坐标升序(使用空间坐标算法,从上到下,从左到右)
|
||
texts.sort(key=lambda x: x[1])
|
||
rows = []
|
||
# 分行
|
||
for index, text in enumerate(texts[1:]):
|
||
# 若为第一行则初始化当前行
|
||
if index == 0:
|
||
row = [texts[0]]
|
||
continue
|
||
# 若文本框的Y坐标与当前行中最后一个文本框的Y坐标差值小于阈值,则归为同一行
|
||
# noinspection PyUnboundLocalVariable
|
||
# noinspection PyTypeChecker
|
||
if text[1] - row[-1][1] < numpy.mean([x[2] for x in row]) * 0.5:
|
||
row.append(text)
|
||
# 否则结束当前行、初始化当前行
|
||
else:
|
||
rows.append(row)
|
||
row = [text]
|
||
# 添加最后一行
|
||
rows.append(row)
|
||
extraction = []
|
||
# 按照文本框X坐标升序
|
||
for row in rows:
|
||
extraction.extend(
|
||
[x[3].replace(" ", "") for x in sorted(row, key=lambda x: x[0])]
|
||
)
|
||
# 以空格拼接
|
||
extraction = " ".join(extraction)
|
||
|
||
# 根据理赔申请书匹配提示词
|
||
match application_form := kwargs.get(
|
||
"application_form", globals().get("application_form")
|
||
):
|
||
case "中行员工福利保障计划索赔申请书":
|
||
prompt = f"""
|
||
指令:你是一个从OCR文本中智能提取信息并生成JSON的工具,请严格按照要求执行。
|
||
|
||
输入:OCR文本(可能包含错漏):
|
||
{extraction}
|
||
|
||
输出要求:
|
||
1、只输出可被Python中json.loads()解析的JSON格式字符串,不包含任何代码块标记、说明文字等其它非JSON格式内容
|
||
2、无法确定的值设置为`null`(不是"null"字符串)
|
||
|
||
JSON结构:
|
||
{{
|
||
"基础信息": {{
|
||
"申请人": "字符串或null",
|
||
"性别": "字符串或null",
|
||
"年龄": "字符串或null",
|
||
"手机": "字符串或null",
|
||
"身份证号": "字符串或null",
|
||
"开户银行": "字符串或null",
|
||
"户名": "字符串或null",
|
||
"账号": "字符串或null",
|
||
}},
|
||
"票据表格": [
|
||
{{
|
||
"就诊序号": "字符串或null",
|
||
"发票日期": "YYYY-MM-DD或null",
|
||
"发票上的就诊医院/药店": "字符串或null",
|
||
"票据张数": "字符串或null",
|
||
"票据金额": "字符串或null",
|
||
"诊断": "字符串或null"
|
||
}},
|
||
]
|
||
}}
|
||
|
||
开始输出:
|
||
"""
|
||
case _:
|
||
raise RuntimeError(f"理赔申请书{application_form}未设置处理方法")
|
||
|
||
# 请求大语言模型创建对话接口
|
||
response = globals()["http_client"].post(
|
||
url="https://api.siliconflow.cn/v1/chat/completions",
|
||
headers={
|
||
"Authorization": "Bearer sk-xsnuwirjjphhfdbvznfdfjqlinfdlrnlxuhkbbqynfnbhiqz", # 基于硅基流动
|
||
"Content-Type": "application/json; charset=utf-8",
|
||
},
|
||
json={
|
||
"model": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", # 通过从DeepSeek-R1-0528模型蒸馏思维链接至Qwen3-8B-Base获得的模型
|
||
"messages": [{"role": "user", "content": prompt}],
|
||
"max_tokens": 10240, # 生成文本最大令牌数
|
||
"temperature": 0.2,
|
||
"top_p": 0.5,
|
||
"top_k": 20,
|
||
"frequency_penalty": 0.0,
|
||
"thinking_budget": 1,
|
||
},
|
||
guid=hashlib.md5(prompt.encode("utf-8")).hexdigest().upper(),
|
||
)
|
||
|
||
extraction = (
|
||
json.loads(match.group("json"))
|
||
if (
|
||
match := re.search(
|
||
r"```json\s*(?P<json>\{.*})\s*```",
|
||
response["choices"][0]["message"]["content"],
|
||
re.DOTALL,
|
||
)
|
||
)
|
||
else None
|
||
)
|
||
|
||
return extraction
|
||
|
||
|
||
# 规则模型初始化
|
||
def decision(rules_path: Path) -> ZenDecision:
    """Build a ZEN decision from the rules file at *rules_path*.

    Args:
        rules_path: Path of the rules file; its POSIX form is the key handed
            to ``ZenEngine.get_decision``.

    Returns:
        The decision object returned by ``ZenEngine.get_decision``.
    """

    def loader(path):
        # Read explicitly as UTF-8. Without ``encoding`` the platform locale
        # is used, which mis-decodes non-ASCII rule content on systems whose
        # default encoding is not UTF-8.
        with open(path, "r", encoding="utf-8") as file:
            return file.read()

    return ZenEngine({"loader": loader}).get_decision(rules_path.as_posix())
|
||
|
||
|
||
def disease_diagnosis(**kwargs) -> str | None:
    """Ask an LLM for the most likely disease name for a claim.

    Keyword Args:
        dossier: Claim dossier dict. Falls back to the module-level
            ``dossier``. Read keys: ``["赔案层"]["申请人信息"]`` (optional
            "性别" / "年龄") and ``["赔案层"]["其它信息"]`` ("小项合集",
            "自述症状").
        items: Invoice items embedded into the prompt (required).

    Returns:
        The first line of the model reply with leading whitespace removed, or
        ``None`` when that line is empty.

    Raises:
        KeyError: If ``items`` or a required dossier key is missing.
    """

    dossier = kwargs.get("dossier", globals().get("dossier"))

    applicant = dossier["赔案层"]["申请人信息"]
    gender = applicant.get("性别")
    gender = gender if gender is not None else "未知"
    age = applicant.get("年龄")
    age = age if age is not None else "未知"

    prompt = f"""
指令:你是一个医学疾病分类诊断的工具,请严格按照要求执行。

患者信息:
性别 {gender},
年龄 {age},
近期在药房/医院开具发票中内容 {dossier["赔案层"]["其它信息"]["小项合集"]}

输出要求:
1、患者自述症状在 {dossier["赔案层"]["其它信息"]["自述症状"]} 其中之一
2、依据患者信息、自述症状和其提供的发票中内容 {kwargs["items"]} 综合诊断,只输出一个最可能的ICD-11中的疾病分类中亚类目代码对应的中文名称字符串,不包含任何代码块标记、说明文字等

开始输出:
"""

    # Call the chat-completion endpoint (Volcano Engine).
    # SECURITY NOTE(review): the API key is hard-coded in source; it should
    # be rotated and loaded from configuration or the environment.
    response = globals()["http_client"].post(
        url="https://ark.cn-beijing.volces.com/api/v3/chat/completions",
        headers={
            "Authorization": "Bearer 2c28ab07-888c-45be-84a2-fc4b2cb5f3f2",
            "Content-Type": "application/json; charset=utf-8",
        },
        json={
            "model": "deepseek-r1-250528",
            "messages": [
                {"role": "system", "content": "你是人工智能助手"},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.2,
            "top_p": 0.5,
            "top_k": 20,
            "frequency_penalty": 0.0,
            "thinking_budget": 1,
        },
        guid=hashlib.md5(prompt.encode("utf-8")).hexdigest().upper(),
    )

    # The pattern skips leading whitespace and captures the rest of that
    # line. It matches any string, so the former ``else None`` branch was
    # unreachable and an empty reply came back as "" instead of None.
    found = re.match(
        r"\s*(?P<text>.*)", response["choices"][0]["message"]["content"]
    )
    recognition = found.group("text") if found else None

    return recognition or None
|
||
|
||
|
||
# -------------------------
|
||
# 主程序
|
||
# -------------------------
|
||
|
||
|
||
if __name__ == "__main__":
|
||
|
||
# 初始化HTTP客户端
|
||
http_client = HTTPClient(timeout=300, cache_enabled=True)
|
||
|
||
# 初始化认证器
|
||
authenticator = Authenticator()
|
||
|
||
# 初始化工作目录地址对象
|
||
directory_path = Path("directory")
|
||
# 若不存在则创建
|
||
directory_path.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 初始化影像件识别规则引擎
|
||
recognize_decision = decision(Path("rules/影像件是否需要数据提取.json"))
|
||
|
||
# 初始化JINJA2环境
|
||
environment = Environment(loader=FileSystemLoader("."))
|
||
|
||
# 添加DATE过滤器
|
||
environment.filters["date"] = lambda date: (
|
||
date.strftime("%Y-%m-%d") if date else "长期"
|
||
)
|
||
# 加载赔案档案模版
|
||
template = environment.get_template("template.html")
|
||
|
||
# 遍历工作目录中赔案目录
|
||
for case_path in [
|
||
case_path for case_path in directory_path.iterdir() if case_path.is_dir()
|
||
]:
|
||
|
||
# 初始化赔案档案
|
||
dossier = {
|
||
"影像件层": [],
|
||
"赔案层": {
|
||
"赔案编号": (
|
||
case_number := case_path.stem
|
||
), # 假设赔案已签收,系统已生成赔案编号并根据签收时填报保单编号可知保险总公司、保险分公司和投保公司
|
||
"签收保单编号": "3291120243205000000002",
|
||
"保险总公司": "中银保险有限公司",
|
||
"保险分公司": None, # 实验阶段保险分公司、投保公司和申请时间为NONE
|
||
"投保公司": None,
|
||
"申请时间": None,
|
||
"申请人信息": {},
|
||
"受益人信息": {},
|
||
"被保人信息": {}, # 实验阶段被保人信息为空字典
|
||
"其它信息": {},
|
||
},
|
||
"发票层": [],
|
||
"小项层": [],
|
||
}
|
||
|
||
# 遍历赔案目录中影像件地址
|
||
for image_index, image_path in enumerate(
|
||
sorted(case_path.glob(pattern="*"), key=lambda x: x.stat().st_ctime), 1
|
||
):
|
||
dossier["影像件层"].append(
|
||
{
|
||
"影像件序号": (image_index := f"{image_index:02d}"),
|
||
"影像件名称": (image_name := image_path.name),
|
||
}
|
||
)
|
||
|
||
# 若影像件格式非JPG/JPEG/PNG则跳过该影像件
|
||
if (image_format := image_path.suffix.lower().lstrip(".")) not in [
|
||
"jpg",
|
||
"jpeg",
|
||
"png",
|
||
]:
|
||
dossier["影像件层"][-1]["已分类"] = "否,不支持的影像件"
|
||
continue
|
||
|
||
# 影像件读取
|
||
image = cv2.imread(image_path.as_posix(), cv2.IMREAD_GRAYSCALE)
|
||
# 若发生异常则跳过该影像件
|
||
if image is None:
|
||
dossier["影像件层"][-1]["已分类"] = "否,读取异常"
|
||
continue
|
||
|
||
# 影像件压缩(输出BASE64编码)
|
||
image_guid, image_base64 = images_compression()
|
||
# 若发生异常则跳过该影像件
|
||
if image_guid is None or image_base64 is None:
|
||
dossier["影像件层"][-1]["已分类"] = "否,压缩异常"
|
||
continue
|
||
|
||
# 通过请求深圳快瞳影像件分类接口获取影像件类型和方向
|
||
image_type, image_orientation = images_classification()
|
||
# 若发生异常则跳过该影像件
|
||
if image_type is None or image_orientation is None:
|
||
dossier["影像件层"][-1]["已分类"] = "否,影像件分类异常"
|
||
continue
|
||
|
||
# 若影像件方向非0度,则影像件旋正并在此压缩
|
||
if image_orientation != "0度":
|
||
# 影像件旋正
|
||
image = cv2.rotate(
|
||
image,
|
||
{
|
||
"顺时针90度": cv2.ROTATE_90_COUNTERCLOCKWISE, # 逆时针旋转90度
|
||
"180度": cv2.ROTATE_180, # 旋转180度
|
||
"逆时针90度": cv2.ROTATE_90_CLOCKWISE, # 顺时针旋转90度
|
||
}[image_orientation],
|
||
)
|
||
# 影像件再次压缩
|
||
image_guid, image_base64 = images_compression()
|
||
if image_guid is None or image_base64 is None:
|
||
dossier["影像件层"][-1]["已分类"] = "否,压缩异常"
|
||
continue
|
||
|
||
dossier["影像件层"][-1].update({"已分类": "是", "影像件类型": image_type})
|
||
|
||
# 根据保险总公司和影像件类型评估影像件是否需要数据提取,若无需数据提取则跳过该影像件(例如,中银保险有限公司理赔申请书包含户名、开户银行和银行账号,无需识别银行卡)
|
||
if not recognize_decision.evaluate(
|
||
{
|
||
"insurer": (insurer := dossier["赔案层"]["保险总公司"]),
|
||
"image_type": image_type,
|
||
}
|
||
)["result"]["extract"]:
|
||
dossier["影像件层"][-1]["已识别"] = "否,无需识别"
|
||
continue
|
||
|
||
# 根据影像件类型匹配影像件数据提取
|
||
# noinspection PyUnreachableCode
|
||
match image_type:
|
||
case "居民身份证(正背面)" | "居民身份证(正面)" | "居民身份证(背面)":
|
||
extraction = idcard_extraction()
|
||
# 若发生异常则跳过该影像件
|
||
if extraction is None:
|
||
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
|
||
continue
|
||
|
||
if image_type in ["居民身份证(正背面)", "居民身份证(正面)"]:
|
||
dossier["赔案层"]["申请人信息"].update(
|
||
{
|
||
"证件有效期起": datetime.strptime(
|
||
extraction["有效期起"], "%Y-%m-%d"
|
||
),
|
||
"证件有效期止": (
|
||
date
|
||
if (date := extraction["有效期止"]) == "长期"
|
||
else datetime.strptime(date, "%Y-%m-%d")
|
||
), # 若证件有效期止为NONE默认为“长期”,
|
||
}
|
||
) # 原则上由影像件数据提取环节负责数据标准化,赔案档案数据填充环节负责数据机构化
|
||
|
||
if image_type in ["居民身份证(正背面)", "居民身份证(背面)"]:
|
||
dossier["赔案层"]["申请人信息"].update(
|
||
{
|
||
"姓名": extraction["姓名"],
|
||
"证件类型": "居民身份证",
|
||
"证件号码": extraction["公民身份号码"],
|
||
"性别": extraction["性别"],
|
||
"出生": datetime.strptime(
|
||
extraction["出生"], "%Y-%m-%d"
|
||
), # 默认日期格式为%Y-%m-%d
|
||
"省": (
|
||
address := parse_location(extraction["住址"])
|
||
).get("province"),
|
||
"地": address.get("city"),
|
||
"县": address.get("county"),
|
||
"详细地址": address.get("detail"),
|
||
}
|
||
)
|
||
|
||
case "银行卡":
|
||
extraction = bankcard_extraction()
|
||
# 若发生异常则跳过该影像件
|
||
if extraction is None:
|
||
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
|
||
continue
|
||
|
||
dossier["赔案层"]["受益人信息"].update(
|
||
{
|
||
"开户银行": extraction["银行名称"],
|
||
"银行账号": extraction["卡号"],
|
||
}
|
||
)
|
||
|
||
case "增值税发票" | "门诊收费票据" | "住院收费票据":
|
||
extraction = invoice_extraction()
|
||
# 若发生异常则跳过该影像件
|
||
if extraction is None:
|
||
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
|
||
continue
|
||
|
||
dossier["发票层"].append(
|
||
{
|
||
"关联影像件序号": image_index,
|
||
"票据类型": extraction["票据类型"],
|
||
"票据号码": extraction["票据号码"],
|
||
"票据代码": (
|
||
extraction["票据代码"]
|
||
if extraction["票据代码"]
|
||
else "--"
|
||
), # 数电票无票据代码,校验码同票据号码
|
||
"开票日期": datetime.strptime(
|
||
extraction["开票日期"], "%Y-%m-%d"
|
||
),
|
||
"校验码后六位": (
|
||
check_code[-6:]
|
||
if (check_code := extraction["校验码"])
|
||
else "--"
|
||
),
|
||
"医药机构": extraction["收款方"],
|
||
"就诊人": (
|
||
match.group("name")
|
||
if (
|
||
match := re.search(
|
||
r"^(?P<name>[^((]+)", extraction["付款方"]
|
||
)
|
||
)
|
||
else extraction["付款方"]
|
||
),
|
||
"票据金额": Decimal(extraction["票据金额"]).quantize(
|
||
Decimal("0.00"), rounding=ROUND_HALF_UP
|
||
), # 默认金额转为小数,保留两位小数
|
||
"查验状态": extraction["查验状态"],
|
||
"项目": (
|
||
pandas.DataFrame(extraction["项目"])
|
||
.assign(
|
||
数量=lambda dataframe: dataframe["数量"].apply(
|
||
lambda row: (
|
||
Decimal(row).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
)
|
||
if row
|
||
else Decimal("0.00")
|
||
)
|
||
),
|
||
金额=lambda dataframe: dataframe["金额"].apply(
|
||
lambda row: (
|
||
Decimal(row).quantize(
|
||
Decimal("0.00"),
|
||
rounding=ROUND_HALF_UP,
|
||
)
|
||
if row
|
||
else Decimal("0.00")
|
||
)
|
||
),
|
||
)
|
||
.groupby(by="名称", as_index=False)
|
||
.agg(数量=("数量", "sum"), 金额=("金额", "sum"))
|
||
.assign(
|
||
大项=lambda dataframe: dataframe["名称"].apply(
|
||
lambda row: (
|
||
match.group("category")
|
||
if (
|
||
match := re.match(
|
||
r"^\*(?P<category>.+?)\*.*$",
|
||
row,
|
||
)
|
||
)
|
||
else row
|
||
)
|
||
),
|
||
小项=lambda dataframe: dataframe["名称"].apply(
|
||
lambda row: (
|
||
re.sub(
|
||
r"[^\u4e00-\u9fa5a-zA-Z0-9./%*]",
|
||
"",
|
||
match.group("name"),
|
||
)
|
||
if (
|
||
match := re.match(
|
||
r"^\*.+?\*(?:\[[^]]+])?(?P<name>[^\s(]+)(?:\([^\s(]+\))?(?:.*?)?$",
|
||
row,
|
||
)
|
||
)
|
||
else ""
|
||
)
|
||
),
|
||
)
|
||
.loc[
|
||
lambda dataframe: dataframe["金额"] != 0,
|
||
["名称", "大项", "小项", "数量", "金额"],
|
||
]
|
||
.to_dict(orient="records")
|
||
),
|
||
"就诊类型": (
|
||
"药店购药"
|
||
if "增值税发票" in image_type
|
||
else (
|
||
"门诊就诊"
|
||
if "门诊收费票据" in image_type
|
||
else "住院治疗"
|
||
)
|
||
),
|
||
}
|
||
)
|
||
|
||
case "理赔申请书":
|
||
# 根据保险总公司匹配理赔申请书
|
||
# noinspection PyUnreachableCode
|
||
match insurer:
|
||
case "中银保险有限公司":
|
||
extraction = common_extraction(
|
||
application_form="中行员工福利保障计划索赔申请书"
|
||
)
|
||
# 若识别异常则跳过该影像件
|
||
if extraction is None:
|
||
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
|
||
continue
|
||
|
||
dossier["赔案层"]["申请人信息"].update(
|
||
{
|
||
"与被保险人关系": "本人", # 中银保险有限公司:默认申请人与被保险人关系为本人
|
||
"年龄": (
|
||
Decimal(age).quantize(
|
||
Decimal("0"),
|
||
rounding=ROUND_HALF_UP,
|
||
)
|
||
if (
|
||
age := extraction.get("基础信息", {}).get(
|
||
"年龄", "--"
|
||
)
|
||
).isdigit()
|
||
else age
|
||
), # 若年龄仅数字则转为小数、取整,否则默认为“--”
|
||
"手机号": (
|
||
phone_number
|
||
if re.match(
|
||
r"^1[3-9]\d{9}$",
|
||
phone_number := extraction.get(
|
||
"基础信息", {}
|
||
).get("手机", "--"),
|
||
)
|
||
else phone_number
|
||
), # 若手机未正则匹配手机号格式则为“--”
|
||
}
|
||
)
|
||
|
||
dossier["赔案层"]["受益人信息"].update(
|
||
{
|
||
"与被保险人关系": "本人", # 中银保险有限公司:默认受益人与被保人关系为本人
|
||
"户名": (
|
||
account_name
|
||
if (
|
||
account_name := extraction.get(
|
||
"基础信息", {}
|
||
).get("户名")
|
||
)
|
||
else "--"
|
||
), # 若户名为NONE则为“--”
|
||
"开户银行": (
|
||
account_name
|
||
if (
|
||
account_name := extraction.get(
|
||
"基础信息", {}
|
||
).get("开户银行")
|
||
)
|
||
else "--"
|
||
), # 若开户银行为NONE则为“--”
|
||
"银行账号": (
|
||
account_name
|
||
if (
|
||
account_name := extraction.get(
|
||
"基础信息", {}
|
||
).get("账号")
|
||
)
|
||
is not None
|
||
else "--"
|
||
), # 若银行账号为NONE则为“--”
|
||
}
|
||
)
|
||
|
||
dossier["赔案层"]["其它信息"]["自述症状"] = (
|
||
("、".join(diagnoses))
|
||
if (
|
||
diagnoses := sorted(
|
||
set(
|
||
"、".join(
|
||
[
|
||
diagnosis
|
||
for invoice in extraction.get(
|
||
"票据表格", []
|
||
)
|
||
if (
|
||
diagnosis := invoice.get("诊断")
|
||
)
|
||
]
|
||
).split("、")
|
||
)
|
||
)
|
||
)
|
||
else "--"
|
||
)
|
||
|
||
case _:
|
||
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
|
||
continue
|
||
|
||
case _:
|
||
dossier["影像件层"][-1]["已识别"] = "否,无法识别"
|
||
continue
|
||
|
||
dossier["影像件层"][-1].update(
|
||
{
|
||
"已识别": "是",
|
||
"识别结果": extraction,
|
||
}
|
||
)
|
||
|
||
# 发票层根据开票日期顺序排序
|
||
dossier["发票层"] = sorted(
|
||
dossier["发票层"], key=lambda x: (x["开票日期"], x["票据号码"])
|
||
)
|
||
|
||
# 构建小项层
|
||
# noinspection PyTypeChecker
|
||
dossier["小项层"] = (
|
||
pandas.DataFrame(
|
||
[
|
||
{
|
||
"小项": item["小项"],
|
||
"数量": item["数量"],
|
||
"金额": item["金额"],
|
||
}
|
||
for invoice in dossier["发票层"]
|
||
for item in invoice["项目"]
|
||
]
|
||
)
|
||
.groupby(by="小项", as_index=False)
|
||
.agg(数量=("数量", "sum"), 金额=("金额", "sum"))
|
||
.to_dict(orient="records")
|
||
)
|
||
|
||
for invoice in dossier["发票层"]:
|
||
# noinspection PyTypeChecker
|
||
invoice["推定疾病"] = disease_diagnosis(
|
||
items="、".join(sorted(set([item["小项"] for item in invoice["项目"]])))
|
||
)
|
||
|
||
print(dossier)
|
||
exit()
|
||
|
||
with open(f"dossiers/{case_number}.html", "w", encoding="utf-8") as file:
|
||
file.write(
|
||
template.render(
|
||
{
|
||
"dossier": dossier,
|
||
}
|
||
)
|
||
)
|