日常更新

from macbook
This commit is contained in:
liubiren 2025-12-19 19:38:29 +08:00
parent 8cb2afcdae
commit 25b8822c80
4 changed files with 318 additions and 185 deletions

Binary file not shown.

View File

@ -379,9 +379,6 @@ class HTTPClient:
self.message = message self.message = message
super().__init__(self.message) super().__init__(self.message)
def __str__(self):
return f"请求发生异常(status={self.status} code={self.code}message={self.message})"
class Parameters(BaseModel): class Parameters(BaseModel):
""" """
请求参数模型支持自动校验 请求参数模型支持自动校验
@ -395,7 +392,7 @@ class HTTPClient:
) )
headers: Optional[Dict[str, str]] = Field(default=None, description="请求头") headers: Optional[Dict[str, str]] = Field(default=None, description="请求头")
data: Optional[Dict[str, Any]] = Field(default=None, description="表单数据") data: Optional[Dict[str, Any]] = Field(default=None, description="表单数据")
json_data: Optional[Dict[str, Any]] = Field( json_: Optional[Dict[str, Any]] = Field(
default=None, alias="json", description="JSON数据" default=None, alias="json", description="JSON数据"
) )
files: Optional[ files: Optional[
@ -417,7 +414,7 @@ class HTTPClient:
@model_validator(mode="after") @model_validator(mode="after")
def validate_data(self): def validate_data(self):
"""校验表单数据和JSON数据互斥""" """校验表单数据和JSON数据互斥"""
if self.data is not None and self.json_data is not None: if self.data is not None and self.json_ is not None:
raise ValueError("表单数据和JSON数据不能同时使用") raise ValueError("表单数据和JSON数据不能同时使用")
return self return self
@ -649,9 +646,9 @@ class HTTPClient:
} }
# 过滤JSON数据中None值 # 过滤JSON数据中None值
if parameters.get("json_data") is not None: if parameters.get("json") is not None:
parameters["json_data"] = { parameters["json"] = {
k: v for k, v in parameters["json_data"].items() if v is not None k: v for k, v in parameters["json"].items() if v is not None
} }
# 使用流式传输 # 使用流式传输
@ -696,14 +693,14 @@ class HTTPClient:
else None else None
) )
message = ( message = (
response.json().get("message", str(exception).splitlines()[0]) response.json().get("message", response.text)
if response is not None if response is not None
else str(exception).splitlines()[0] else str(exception).splitlines()[0]
) )
except Exception: except Exception:
status = None status = None
message = f"{method} {parameters["url"]} 请求发生异常:{str(exception).splitlines()[0]}" message = f"{method} {parameters["url"]} 请求发生异常:{str(exception).splitlines()[0]}"
raise self.RequestException(status=status, message=message) from exception return self.RequestException(status=status, message=message).__dict__
# 处理响应 # 处理响应
@staticmethod @staticmethod

View File

@ -0,0 +1,58 @@
def general_text_recognize(image) -> str:
    """
    Run general-purpose OCR on an image and return the recognized text.

    The image is sent to the Shenzhen KuaiTong general text recognition
    endpoint. The returned text blocks are grouped into visual lines from top
    to bottom, each line is ordered from left to right, all whitespace inside
    each block is removed, and the blocks are joined with newlines.

    :param image: image record; the keys "影像件唯一标识", "影像件格式" and
        "影像件BASE64编码" are read
    :return: recognized text, one text block per output line; an empty string
        when the service returns no text blocks
    :raises RuntimeError: when the response is not status 200 with code 0
    """
    image_guid = image["影像件唯一标识"]
    image_format = image["影像件格式"].lstrip(".")
    image_base64 = image["影像件BASE64编码"]

    # Call the Shenzhen KuaiTong general text recognition endpoint
    response = http_client.post(
        url=(url := "https://ai.inspirvision.cn/s/api/ocr/general"),
        # The image's unique identifier doubles as the request id so both
        # parties can trace the same request
        headers={"X-RequestId-Header": image_guid},
        data={
            # Access token for the Shenzhen KuaiTong service
            "token": authenticator.get_token(servicer="szkt"),
            "imgBase64": f"data:image/{image_format};base64,{image_base64}",
        },
        guid=md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route unsuccessful responses to manual handling
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")

    # Each entry is [x, y, height, text] of one text block; an empty or null
    # payload simply yields no blocks
    blocks = [
        [
            int(block["itemPolygon"]["x"]),
            int(block["itemPolygon"]["y"]),
            int(block["itemPolygon"]["height"]),
            block["value"],
        ]
        for block in (response["data"] or [])
    ]
    # Nothing recognized: return early instead of failing on an unbound line
    if not blocks:
        return ""

    # Arrange the blocks top to bottom by Y coordinate
    blocks.sort(key=lambda x: x[1])

    # Group blocks into lines. Every block is visited, starting with the
    # first one; the previous loop skipped the second block entirely.
    lines = []
    line = []
    for block in blocks:
        # A block stays on the current line while its Y offset from the
        # line's mean Y is below the line's mean block height; otherwise it
        # starts a new line
        if line and not (
            block[1] - numpy.array([e[1] for e in line]).mean()
            < numpy.array([e[2] for e in line]).mean()
        ):
            lines.append(line)
            line = []
        line.append(block)
    lines.append(line)

    # Within each line order blocks left to right by X coordinate and strip
    # all whitespace from the block text
    texts = []
    for line in lines:
        texts.extend(
            re.sub(r"\s", "", x[3]) for x in sorted(line, key=lambda x: x[0])
        )
    return "\n".join(texts)

View File

@ -438,70 +438,6 @@ if __name__ == "__main__":
:return: :return:
""" """
# noinspection PyShadowingNames
def general_text_recognize(image) -> str:
    """
    Run general-purpose OCR on an image and return the recognized text.

    The image is sent to the Shenzhen KuaiTong general text recognition
    endpoint. The returned text blocks are grouped into visual lines from top
    to bottom, each line is ordered from left to right, all whitespace inside
    each block is removed, and the blocks are joined with newlines.

    :param image: image record; the keys "影像件唯一标识", "影像件格式" and
        "影像件BASE64编码" are read
    :return: recognized text, one text block per output line; an empty string
        when the service returns no text blocks
    :raises RuntimeError: when the response is not status 200 with code 0
    """
    image_guid = image["影像件唯一标识"]
    image_format = image["影像件格式"].lstrip(".")
    image_base64 = image["影像件BASE64编码"]

    # Call the Shenzhen KuaiTong general text recognition endpoint
    response = http_client.post(
        url=(url := "https://ai.inspirvision.cn/s/api/ocr/general"),
        # The image's unique identifier doubles as the request id so both
        # parties can trace the same request
        headers={"X-RequestId-Header": image_guid},
        data={
            # Access token for the Shenzhen KuaiTong service
            "token": authenticator.get_token(servicer="szkt"),
            "imgBase64": f"data:image/{image_format};base64,{image_base64}",
        },
        guid=md5((url + image_guid).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route unsuccessful responses to manual handling
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")

    # Each entry is [x, y, height, text] of one text block; an empty or null
    # payload simply yields no blocks
    blocks = [
        [
            int(block["itemPolygon"]["x"]),
            int(block["itemPolygon"]["y"]),
            int(block["itemPolygon"]["height"]),
            block["value"],
        ]
        for block in (response["data"] or [])
    ]
    # Nothing recognized: return early instead of failing on an unbound line
    if not blocks:
        return ""

    # Arrange the blocks top to bottom by Y coordinate
    blocks.sort(key=lambda x: x[1])

    # Group blocks into lines. Every block is visited, starting with the
    # first one; the previous loop skipped the second block entirely.
    lines = []
    line = []
    for block in blocks:
        # A block stays on the current line while its Y offset from the
        # line's mean Y is below the line's mean block height; otherwise it
        # starts a new line
        if line and not (
            block[1] - numpy.array([e[1] for e in line]).mean()
            < numpy.array([e[2] for e in line]).mean()
        ):
            lines.append(line)
            line = []
        line.append(block)
    lines.append(line)

    # Within each line order blocks left to right by X coordinate and strip
    # all whitespace from the block text
    texts = []
    for line in lines:
        texts.extend(
            re.sub(r"\s", "", x[3]) for x in sorted(line, key=lambda x: x[0])
        )
    return "\n".join(texts)
# TODO: 后续添加居民身份证(国徽面)和居民身份证(头像面)合并 # TODO: 后续添加居民身份证(国徽面)和居民身份证(头像面)合并
# noinspection PyShadowingNames # noinspection PyShadowingNames
def identity_card_recognize(image, insurance_company, insurer_company) -> None: def identity_card_recognize(image, insurance_company, insurer_company) -> None:
@ -512,6 +448,27 @@ if __name__ == "__main__":
:param insurer_company: 保险分公司 :param insurer_company: 保险分公司
:return: :return:
""" """
# noinspection PyShadowingNames
def calculate_age(report_time: datetime, birthday: datetime) -> int:
    """
    Compute the age in completed years as of the report time.

    :param report_time: time the claim was reported
    :param birthday: date of birth
    :return: age in completed years
    """
    # True while the birthday has not yet been reached in the report year
    before_birthday = (report_time.month, report_time.day) < (
        birthday.month,
        birthday.day,
    )
    # Year difference, minus one when the birthday is still ahead
    return report_time.year - birthday.year - int(before_birthday)
# 请求深圳快瞳居民身份证识别接口 # 请求深圳快瞳居民身份证识别接口
response = http_client.post( response = http_client.post(
url=(url := "https://ai.inspirvision.cn/s/api/ocr/identityCard"), url=(url := "https://ai.inspirvision.cn/s/api/ocr/identityCard"),
@ -562,16 +519,21 @@ if __name__ == "__main__":
"证件类型": (identity_type := "居民身份证"), "证件类型": (identity_type := "居民身份证"),
"证件号码": (indentity_number := response["data"]["idNo"]), "证件号码": (indentity_number := response["data"]["idNo"]),
"性别": response["data"]["sex"], "性别": response["data"]["sex"],
"出生": datetime.strptime( "出生日期": (
response["data"]["birthday"], "%Y-%m-%d" birthday := datetime.strptime(
), # 深圳快瞳居民身份证识别接口中出生由字符串(%Y.%m.%d转为日期日期格式默认为%Y-%m-%d response["data"]["birthday"], "%Y-%m-%d"
"": ( )
), # 深圳快瞳居民身份证识别接口中出生由字符串转为日期,日期格式默认为%Y-%m-%d
"年龄": calculate_age(
dossier["报案层"]["报案时间"], birthday
), # 按照报案时间计算周岁
"所在省": (
address := parse_location(response["data"]["address"]) address := parse_location(response["data"]["address"])
).get( ).get(
"province" "province"
), # 就住址解析为省、市、区和详细地址 ), # 就住址解析为省、市、区和详细地址
"": address.get("city"), "所在": address.get("city"),
"": address.get("county"), "所在": address.get("county"),
"详细地址": address.get("detail"), "详细地址": address.get("detail"),
} }
) )
@ -599,71 +561,191 @@ if __name__ == "__main__":
""" """
# noinspection PyShadowingNames # noinspection PyShadowingNames
def boc_application_recognize(image: str) -> str: def mlm_recognize(image, schema) -> Dict[str, Any]:
""" """
中银保险有限公司-理赔申请书识别并整合至赔案档案 使用多模态大模型就理赔申请书进行光学字符识别并结构化识别结果
:param image: 影像件 :param image: 影像件
:return: :param schema: 识别结果的JSON格式
:return: 识别结果
""" """
# 请求硅基流动的大语言模型接口 # 尝试请求火山引擎多模态大模型接口至就消息内容JSON反序列化
# noinspection PyTypeChecker
response = http_client.post( response = http_client.post(
url="https://api.siliconflow.cn/v1/chat/completions", url="https://ark.cn-beijing.volces.com/api/v3/chat/completions",
headers={ headers={
"Authorization": "Bearer sk-xsnuwirjjphhfdbvznfdfjqlinfdlrnlxuhkbbqynfnbhiqz", "Authorization": "Bearer 2c28ab07-888c-45be-84a2-fc4b2cb5f3f2",
"Content-Type": "application/json; charset=utf-8", "Content-Type": "application/json; charset=utf-8",
}, },
json={ json=(
"model": (model := "THUDM/GLM-4.1V-9B-Thinking"), json_ := {
"messages": [ "model": "doubao-seed-1-6-251015",
{ "messages": [
"role": "user", {
"content": [ "role": "user",
{ "content": [
"type": "text", {
"text": ( "type": "image_url",
text := "请以JSON字符串的形式输出识别结果" "image_url": {
), "url": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}"
}, },
{
"type": "image_url",
"image_url": {
"url": f"data:image/{image["影像件格式"]};base64,{image["影像件BASE64编码"]}"
}, },
}, {
], "type": "text",
} "text": "请就理赔申请书进行光学字符识别、结构化识别结果并返回符合Schema的JSON数据",
], },
"max_tokens": 8192, ],
"temperature": 0.95, }
"top_p": 0.7, ],
"top_k": 2, "temperature": 0.2, # 采样温度基于strict平衡稳定和容错
"frequency_penalty": 1.1, "thinking": {
"thinking_budget": 8192, "type": "disabled",
}, }, # 不使用深度思考能力
guid=md5((model + text + image["影像件唯一标识"]).encode("utf-8")) "response_format": {
"type": "json_schema",
"json_schema": {
"name": "就理赔申请书进行光学字符识别并结构化识别结果",
"schema": schema,
"strict": True, # 启用严格遵循模式
},
},
}
),
guid=md5(
json.dumps(
json_,
sort_keys=True,
ensure_ascii=False,
).encode("utf-8")
)
.hexdigest() .hexdigest()
.upper(), .upper(),
) )
# 就消息内容JSON反序列化
# noinspection PyTypeChecker
return json.loads(response["choices"][0]["message"]["content"])
print(response) # noinspection PyShadowingNames
exit() def boc_application_recognize(image: str) -> None:
"""
就中银保险有限公司的理赔申请书识别并整合至赔案档案
:param image: 影像件
:return:
"""
# 识别结果的JSON格式
schema = {
"type": "object",
"description": "识别结果对象",
"properties": {
"申请人": {
"type": "string",
"description": "申请人,若无数据则为空字符串",
},
"性别": {
"type": "string",
"description": "性别,若无数据则为空字符串",
},
"年龄": {
"type": "string",
"description": "年龄,若无数据则为空字符串",
},
"手机": {
"type": "string",
"description": "手机,若无数据则为空字符串",
},
"所属分支行及部门": {
"type": "string",
"description": "所属分支行及部门,若无数据则为空字符串",
},
"身份证号码": {
"type": "string",
"description": "身份证号码,若无数据则为空字符串",
},
"就诊记录": {
"type": "array",
"description": "所有就诊记录数组",
"items": {
"type": "object",
"description": "每条就诊记录对象",
"properties": {
"就诊序号": {
"type": "string",
"description": "就诊序号,若无数据则为空字符串",
},
"发票日期": {
"type": "string",
"description": "发票日期若无数据则为空字符串若有数据则格式为YYYY/MM/DD",
},
"发票上的就诊医院/药店": {
"type": "string",
"description": "发票上的就诊医院/药店,若无数据则为空字符串",
},
"票据张数": {
"type": "string",
"description": "票据张数,若无数据则为空字符串",
},
"票据金额": {
"type": "string",
"description": "票据金额,若无数据则为空字符串,若有数据则保留两位小数",
},
"诊断": {
"type": "string",
"description": "诊断,若无数据则为空字符串",
},
},
"required": [
"发票日期",
"发票上的就诊医院/药店",
"诊断",
], # 就诊记录必须字段
"additionalProperties": False, # 禁止就就诊记录新增属性
},
},
"票据金额合计": {
"type": "string",
"description": "票据金额合计,若无数据则为空字符串,若有数据则保留两位小数",
},
"开户银行": {
"type": "string",
"description": "开户银行,若无数据则为空字符串,请注意开户银行可能为多行",
},
"户名": {
"type": "string",
"description": "户名,若无数据则为空字符串",
},
"账号": {
"type": "string",
"description": "账号,若无数据则为空字符串",
},
},
"required": [
"申请人",
"手机",
"身份证号码",
"就诊记录",
"开户银行",
"户名",
"账号",
], # 识别结果的JSON结构必须字段
"additionalProperties": False, # 禁止就识别结果的JSON结构新增属性
}
recognition = ( # noinspection PyBroadException
json.loads(match.group("json")) try:
if ( # 使用多模态大模型就理赔申请书进行光学字符识别并结构化识别结果
match := re.search( recognition = mlm_recognize(image, schema)
r"```json\s*(?P<json>\{.*})\s*```",
response["choices"][0]["message"]["content"], dossier["受益人层"].update(
re.DOTALL, {
) "开户行": recognition["开户银行"],
"户名": recognition["户名"],
"户号": recognition["账号"],
"手机号": recognition["手机"],
}
)
except Exception:
# TODO: 若非成功则流转至人工处理
raise RuntimeError(
"就中银保险有限公司的理赔申请书识别并整合至赔案档案发生异常"
) )
else None
)
print(recognition)
exit()
# 根据保险分公司匹配结构化识别文本方法 # 根据保险分公司匹配结构化识别文本方法
match insurer_company: match insurer_company:
@ -679,24 +761,26 @@ if __name__ == "__main__":
:return: :return:
""" """
# 初始化票据数据 # 初始化票据数据
receipt = {"影像件编号": image_index} receipt = {"影像件编号": image["影像件编号"]}
# 请求深圳快瞳票据查验接口(兼容增值税发票、医疗门诊/住院收费票据) # 请求深圳快瞳票据查验接口(兼容增值税发票、医疗门诊/住院收费票据)
response = http_client.post( response = http_client.post(
url=(url := "https://ai.inspirvision.cn/s/api/ocr/invoiceCheckAll"), url=(url := "https://ai.inspirvision.cn/s/api/ocr/invoiceCheckAll"),
headers={"X-RequestId-Header": image_guid}, headers={"X-RequestId-Header": image["影像件唯一标识"]},
data={ data={
"token": authenticator.get_token( "token": authenticator.get_token(
servicer="szkt" servicer="szkt"
), # 获取深圳快瞳访问令牌 ), # 获取深圳快瞳访问令牌
"imgBase64": f"data:image/{image_format.lstrip(".")};base64,{image_base64}", "imgBase64": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}",
}, },
guid=md5((url + image_guid).encode("utf-8")).hexdigest().upper(), guid=md5((url + image["影像件唯一标识"]).encode("utf-8"))
.hexdigest()
.upper(),
) )
# 若查验成功则直接整合至赔案档案 # 若查验成功则直接整合至赔案档案
if response.get("status") == 200 and response.get("code") == 10000: if response.get("status") == 200 and response.get("code") == 10000:
# noinspection PyTypeChecker # noinspection PyTypeChecker
match response["data"]["productCode"]: match response["data"]["productCode"]:
# 增值税发票,目前深圳快瞳支持全电和全电纸质增值税发票查验 # 增值税发票,目前深圳快瞳支持全电发票和全电纸质发票、区块链发票和增值税发票查验
case "003082": case "003082":
# noinspection PyTypeChecker # noinspection PyTypeChecker
receipt.update( receipt.update(
@ -706,8 +790,8 @@ if __name__ == "__main__":
if response["data"]["details"]["invoiceTypeNo"] if response["data"]["details"]["invoiceTypeNo"]
== "0" == "0"
else "红票" else "红票"
), ), # 红票为状态为失控、作废、已红冲、部分红冲和全额红冲的票据
"票据号": response["data"]["details"]["number"], "票据号": response["data"]["details"]["number"],
"票据代码": ( "票据代码": (
response["data"]["details"]["code"] response["data"]["details"]["code"]
if response["data"]["details"]["code"] if response["data"]["details"]["code"]
@ -715,9 +799,7 @@ if __name__ == "__main__":
), # 全电发票无发票代码深圳快瞳票据查验接口中票据代码由空字符转为None ), # 全电发票无发票代码深圳快瞳票据查验接口中票据代码由空字符转为None
"开票日期": datetime.strptime( "开票日期": datetime.strptime(
response["data"]["details"]["date"], "%Y年%m月%d" response["data"]["details"]["date"], "%Y年%m月%d"
).strftime( ), # 深圳快瞳票据查验接口中开票日期由字符串转为日期
"%Y-%m-%d"
), # 深圳快瞳票据查验接口中开票日期由字符串(%Y年%m月%d日转为日期
"校验码": response["data"]["details"]["check_code"], "校验码": response["data"]["details"]["check_code"],
"开票金额": Decimal( "开票金额": Decimal(
response["data"]["details"]["total"] response["data"]["details"]["total"]
@ -725,49 +807,45 @@ if __name__ == "__main__":
Decimal("0.00"), Decimal("0.00"),
rounding=ROUND_HALF_UP, rounding=ROUND_HALF_UP,
), # 深圳快瞳票据查验接口中开票金额由字符串转为Decimal保留两位小数 ), # 深圳快瞳票据查验接口中开票金额由字符串转为Decimal保留两位小数
"姓名": response["data"]["details"]["buyer"], "出险人": response["data"]["details"]["buyer"],
"购药及就医机构": ( "购药及就医机构": (
institution := response["data"]["details"]["seller"] institution := response["data"]["details"]["seller"]
), ),
"备注": (
response["data"]["details"]["remark"]
if response["data"]["details"]["remark"]
else None
), # 深圳快瞳票据查验接口中备注由空字符转为None
"费项层": [
{
"名称": item["name"],
"规格": (
item["specification"]
if item["specification"]
else None
), # 深圳快瞳票据查验接口中明细规则由空字符转为None
"单位": (
item["unit"] if item["unit"] else None
), # 深圳快瞳票据查验接口中明细单位由空字符转为None
"数量": (
Decimal(item["quantity"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
)
if item["quantity"]
else None
), # 深圳快瞳票据查验接口中明细单位由空字符转为None若非空字符由字符串转为Decimal保留两位小数
"金额": (
Decimal(item["total"])
+ Decimal(item["tax"])
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
), # 深圳快瞳票据查验接口中明细金额税额由字符串转为Decimal保留两位小数求和
}
for item in response["data"]["details"].get(
"items", []
)
],
} }
) )
"费项层": [
{
"名称": item["name"],
"规格": (
item["specification"]
if item["specification"]
else None
), # 深圳快瞳票据查验接口中明细规则由空字符转为None
"单位": (
item["unit"] if item["unit"] else None
), # 深圳快瞳票据查验接口中明细单位由空字符转为None
"数量": (
Decimal(item["quantity"]).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
)
if item["quantity"]
else None
), # 深圳快瞳票据查验接口中明细单位由空字符转为None若非空字符由字符串转为Decimal保留两位小数
"金额": (
Decimal(item["total"])
+ Decimal(item["tax"])
).quantize(
Decimal("0.00"),
rounding=ROUND_HALF_UP,
), # 深圳快瞳票据查验接口中明细金额税额由字符串转为Decimal保留两位小数求和
}
for item in response["data"]["details"].get(
"items", []
)
],
# 查询并获取单条购药及就医机构类型 # 查询并获取单条购药及就医机构类型
institution_type = master_data.query_institution_type( institution_type = master_data.query_institution_type(
institution institution
@ -1396,8 +1474,9 @@ if __name__ == "__main__":
dossier["受益人层"].update( dossier["受益人层"].update(
{ {
"开户行": response["data"]["bankInfo"], "开户行": response["data"]["bankInfo"],
"户名": None, "户名": "",
"户号": response["data"]["cardNo"].replace(" ", ""), "户号": response["data"]["cardNo"].replace(" ", ""),
"手机号": "",
} }
) )
@ -1428,8 +1507,7 @@ if __name__ == "__main__":
application_recognize(image, insurer_company) application_recognize(image, insurer_company)
case "增值税发票" | "医疗门诊收费票据" | "医疗住院收费票据": case "增值税发票" | "医疗门诊收费票据" | "医疗住院收费票据":
# 票据识别并整合至赔案档案 # 票据识别并整合至赔案档案
# receipt_recognize(image) receipt_recognize(image)
pass
case "银行卡": case "银行卡":
# 银行卡识别并整合至赔案档案 # 银行卡识别并整合至赔案档案
bank_card_recognize(image) bank_card_recognize(image)
@ -1445,9 +1523,7 @@ if __name__ == "__main__":
"保险分公司": ( "保险分公司": (
insurer_company := "中银保险有限公司苏州分公司" insurer_company := "中银保险有限公司苏州分公司"
), # 指定保险分公司 ), # 指定保险分公司
"报案时间": datetime(2025, 7, 25, 12, 0, 0).strftime( "报案时间": datetime(2025, 7, 25, 12, 0, 0), # 指定报案时间
"%Y-%m-%d %H:%M:%S"
), # 指定报案时间
"赔案号": (case_number := case_path.stem), # 设定:赔案目录名称为赔案号 "赔案号": (case_number := case_path.stem), # 设定:赔案目录名称为赔案号
}, },
"影像件层": [], "影像件层": [],
@ -1795,4 +1871,6 @@ if __name__ == "__main__":
} }
) )
) )
""" """