diff --git a/utils/caches.db b/utils/caches.db index f8b6917..806280a 100644 Binary files a/utils/caches.db and b/utils/caches.db differ diff --git a/utils/client.py b/utils/client.py index 803f11b..06a9624 100644 --- a/utils/client.py +++ b/utils/client.py @@ -379,9 +379,6 @@ class HTTPClient: self.message = message super().__init__(self.message) - def __str__(self): - return f"请求发生异常(status={self.status}, code={self.code},message={self.message})" - class Parameters(BaseModel): """ 请求参数模型,支持自动校验 @@ -395,7 +392,7 @@ class HTTPClient: ) headers: Optional[Dict[str, str]] = Field(default=None, description="请求头") data: Optional[Dict[str, Any]] = Field(default=None, description="表单数据") - json_data: Optional[Dict[str, Any]] = Field( + json_: Optional[Dict[str, Any]] = Field( default=None, alias="json", description="JSON数据" ) files: Optional[ @@ -417,7 +414,7 @@ class HTTPClient: @model_validator(mode="after") def validate_data(self): """校验:表单数据和JSON数据互斥""" - if self.data is not None and self.json_data is not None: + if self.data is not None and self.json_ is not None: raise ValueError("表单数据和JSON数据不能同时使用") return self @@ -649,9 +646,9 @@ class HTTPClient: } # 过滤JSON数据中None值 - if parameters.get("json_data") is not None: - parameters["json_data"] = { - k: v for k, v in parameters["json_data"].items() if v is not None + if parameters.get("json") is not None: + parameters["json"] = { + k: v for k, v in parameters["json"].items() if v is not None } # 使用流式传输 @@ -696,14 +693,14 @@ class HTTPClient: else None ) message = ( - response.json().get("message", str(exception).splitlines()[0]) + response.json().get("message", response.text) if response is not None else str(exception).splitlines()[0] ) except Exception: status = None message = f"{method} {parameters["url"]} 请求发生异常:{str(exception).splitlines()[0]}" - raise self.RequestException(status=status, message=message) from exception + return self.RequestException(status=status, message=message).__dict__ # 处理响应 @staticmethod diff --git a/票据理赔自动化/abandoned.py b/票据理赔自动化/abandoned.py new file mode 100644 index 0000000..7284d89 --- /dev/null +++ b/票据理赔自动化/abandoned.py @@ -0,0 +1,58 @@ +def general_text_recognize(image) -> str: + """ + 通用文本识别 + :param image: 影像件 + :return: 识别文本 + """ + # 请求深圳快瞳通用文本识别接口 + response = http_client.post( + url=(url := "https://ai.inspirvision.cn/s/api/ocr/general"), + headers={ + "X-RequestId-Header": image["影像件唯一标识"] + }, # 以影像件唯一标识作为请求唯一标识,用于双方联查 + data={ + "token": authenticator.get_token(servicer="szkt"), # 获取深圳快瞳访问令牌 + "imgBase64": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}", + }, + guid=md5((url + image["影像件唯一标识"]).encode("utf-8")).hexdigest().upper(), + ) + # TODO: 若响应非成功则流转至人工处理 + if not (response.get("status") == 200 and response.get("code") == 0): + raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常") + + blocks = [] + for block in response["data"]: + # noinspection PyTypeChecker + blocks.append( + [ + int(block["itemPolygon"]["x"]), # 文本块左上角的X坐标 + int(block["itemPolygon"]["y"]), # 文本块左上角的Y坐标 + int(block["itemPolygon"]["height"]), # 文本块左上角的高度 + block["value"], # 文本块的文本内容 + ] + ) + # 使用俄罗斯方块方法整理文本块,先按照文本块的Y坐标升序(从上到下) + blocks.sort(key=lambda x: x[1]) + + lines = [] + for idx, block in enumerate(blocks[1:]): + if idx == 0: + line = [blocks[0]] + continue + # 若当前文本块的Y坐标和当前文本行的平均Y坐标差值小于阈值则归为同一文本行,否则另起一文本行(分行) + if ( + block[1] - numpy.array([e[1] for e in line]).mean() + < numpy.array([e[2] for e in line]).mean() + ): + line.append(block) + else: + lines.append(line) + line = [block] + lines.append(line) + + blocks = [] + for line in lines: + blocks.extend( + [re.sub(r"\s", "", x[3]) for x in sorted(line, key=lambda x: x[0])] + ) # 按照文本块的X坐标升序(从左到右)并去除文本块的文本内容中所有空字符 + return "\n".join(blocks) diff --git a/票据理赔自动化/main.py b/票据理赔自动化/main.py index 6292d63..05cfde6 100644 --- a/票据理赔自动化/main.py +++ b/票据理赔自动化/main.py @@ -438,70 +438,6 @@ if __name__ == "__main__": :return: 空 """ - # noinspection PyShadowingNames - def general_text_recognize(image) -> str: - """ - 通用文本识别 - :param image: 影像件 - :return: 识别文本 - """ - # 请求深圳快瞳通用文本识别接口 - response = http_client.post( - url=(url := "https://ai.inspirvision.cn/s/api/ocr/general"), - headers={ - "X-RequestId-Header": image["影像件唯一标识"] - }, # 以影像件唯一标识作为请求唯一标识,用于双方联查 - data={ - "token": authenticator.get_token( - servicer="szkt" - ), # 获取深圳快瞳访问令牌 - "imgBase64": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}", - }, - guid=md5((url + image["影像件唯一标识"]).encode("utf-8")) - .hexdigest() - .upper(), - ) - # TODO: 若响应非成功则流转至人工处理 - if not (response.get("status") == 200 and response.get("code") == 0): - raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常") - - blocks = [] - for block in response["data"]: - # noinspection PyTypeChecker - blocks.append( - [ - int(block["itemPolygon"]["x"]), # 文本块左上角的X坐标 - int(block["itemPolygon"]["y"]), # 文本块左上角的Y坐标 - int(block["itemPolygon"]["height"]), # 文本块左上角的高度 - block["value"], # 文本块的文本内容 - ] - ) - # 使用俄罗斯方块方法整理文本块,先按照文本块的Y坐标升序(从上到下) - blocks.sort(key=lambda x: x[1]) - - lines = [] - for idx, block in enumerate(blocks[1:]): - if idx == 0: - line = [blocks[0]] - continue - # 若当前文本块的Y坐标和当前文本行的平均Y坐标差值小于阈值则归为同一文本行,否则另起一文本行(分行) - if ( - block[1] - numpy.array([e[1] for e in line]).mean() - < numpy.array([e[2] for e in line]).mean() - ): - line.append(block) - else: - lines.append(line) - line = [block] - lines.append(line) - - blocks = [] - for line in lines: - blocks.extend( - [re.sub(r"\s", "", x[3]) for x in sorted(line, key=lambda x: x[0])] - ) # 按照文本块的X坐标升序(从左到右)并去除文本块的文本内容中所有空字符 - return "\n".join(blocks) - # TODO: 后续添加居民身份证(国徽面)和居民身份证(头像面)合并 # noinspection PyShadowingNames def identity_card_recognize(image, insurance_company, insurer_company) -> None: @@ -512,6 +448,27 @@ if __name__ == "__main__": :param insurer_company: 保险分公司 :return: 空 """ + + # noinspection PyShadowingNames + def calculate_age(report_time: datetime, birthday: datetime) -> int: + """ + 按照报案时间计算周岁 + :param report_time: 报案时间 + :param birthday: 出生日期 + :return 周岁 + """ + # 年龄 + age = report_time.year - birthday.year + + # 若报案时未到生日则年龄减去1 + if (report_time.month, report_time.day) < ( + birthday.month, + birthday.day, + ): + age -= 1 + + return age + # 请求深圳快瞳居民身份证识别接口 response = http_client.post( url=(url := "https://ai.inspirvision.cn/s/api/ocr/identityCard"), @@ -562,16 +519,21 @@ if __name__ == "__main__": "证件类型": (identity_type := "居民身份证"), "证件号码": (indentity_number := response["data"]["idNo"]), "性别": response["data"]["sex"], - "出生": datetime.strptime( - response["data"]["birthday"], "%Y-%m-%d" - ), # 深圳快瞳居民身份证识别接口中出生由字符串(%Y.%m.%d)转为日期,日期格式默认为%Y-%m-%d - "省": ( + "出生日期": ( + birthday := datetime.strptime( + response["data"]["birthday"], "%Y-%m-%d" + ) + ), # 深圳快瞳居民身份证识别接口中出生由字符串转为日期,日期格式默认为%Y-%m-%d + "年龄": calculate_age( + dossier["报案层"]["报案时间"], birthday + ), # 按照报案时间计算周岁 + "所在省": ( address := parse_location(response["data"]["address"]) ).get( "province" ), # 就住址解析为省、市、区和详细地址 - "市": address.get("city"), - "区": address.get("county"), + "所在市": address.get("city"), + "所在区": address.get("county"), "详细地址": address.get("detail"), } ) @@ -599,71 +561,191 @@ if __name__ == "__main__": """ # noinspection PyShadowingNames - def boc_application_recognize(image: str) -> str: + def mlm_recognize(image, schema) -> Dict[str, Any]: """ - 中银保险有限公司-理赔申请书识别并整合至赔案档案 + 使用多模态大模型就理赔申请书进行光学字符识别并结构化识别结果 :param image: 影像件 - :return: 空 + :param schema: 识别结果的JSON格式 + :return: 识别结果 """ - # 请求硅基流动的大语言模型接口 - # noinspection PyTypeChecker + # 尝试请求火山引擎多模态大模型接口至就消息内容JSON反序列化 response = http_client.post( - url="https://api.siliconflow.cn/v1/chat/completions", + url="https://ark.cn-beijing.volces.com/api/v3/chat/completions", headers={ - "Authorization": "Bearer sk-xsnuwirjjphhfdbvznfdfjqlinfdlrnlxuhkbbqynfnbhiqz", + "Authorization": "Bearer 2c28ab07-888c-45be-84a2-fc4b2cb5f3f2", "Content-Type": "application/json; charset=utf-8", }, - json={ - "model": (model := "THUDM/GLM-4.1V-9B-Thinking"), - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": ( - text := "请以JSON字符串的形式输出识别结果" - ), - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/{image["影像件格式"]};base64,{image["影像件BASE64编码"]}" + json=( + json_ := { + "model": "doubao-seed-1-6-251015", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}" + }, }, - }, - ], - } - ], - "max_tokens": 8192, - "temperature": 0.95, - "top_p": 0.7, - "top_k": 2, - "frequency_penalty": 1.1, - "thinking_budget": 8192, - }, - guid=md5((model + text + image["影像件唯一标识"]).encode("utf-8")) + { + "type": "text", + "text": "请就理赔申请书进行光学字符识别、结构化识别结果并返回符合Schema的JSON数据", + }, + ], + } + ], + "temperature": 0.2, # 采样温度,基于strict平衡稳定和容错 + "thinking": { + "type": "disabled", + }, # 不使用深度思考能力 + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "就理赔申请书进行光学字符识别并结构化识别结果", + "schema": schema, + "strict": True, # 启用严格遵循模式 + }, + }, + } + ), + guid=md5( + json.dumps( + json_, + sort_keys=True, + ensure_ascii=False, + ).encode("utf-8") + ) .hexdigest() .upper(), ) + # 就消息内容JSON反序列化 + # noinspection PyTypeChecker + return json.loads(response["choices"][0]["message"]["content"]) - print(response) - exit() + # noinspection PyShadowingNames + def boc_application_recognize(image: str) -> None: + """ + 就中银保险有限公司的理赔申请书识别并整合至赔案档案 + :param image: 影像件 + :return: 空 + """ + # 识别结果的JSON格式 + schema = { + "type": "object", + "description": "识别结果对象", + "properties": { + "申请人": { + "type": "string", + "description": "申请人,若无数据则为空字符串", + }, + "性别": { + "type": "string", + "description": "性别,若无数据则为空字符串", + }, + "年龄": { + "type": "string", + "description": "年龄,若无数据则为空字符串", + }, + "手机": { + "type": "string", + "description": "手机,若无数据则为空字符串", + }, + "所属分支行及部门": { + "type": "string", + "description": "所属分支行及部门,若无数据则为空字符串", + }, + "身份证号码": { + "type": "string", + "description": "身份证号码,若无数据则为空字符串", + }, + "就诊记录": { + "type": "array", + "description": "所有就诊记录数组", + "items": { + "type": "object", + "description": "每条就诊记录对象", + "properties": { + "就诊序号": { + "type": "string", + "description": "就诊序号,若无数据则为空字符串", + }, + "发票日期": { + "type": "string", + "description": "发票日期,若无数据则为空字符串,若有数据则格式为YYYY/MM/DD", + }, + "发票上的就诊医院/药店": { + "type": "string", + "description": "发票上的就诊医院/药店,若无数据则为空字符串", + }, + "票据张数": { + "type": "string", + "description": "票据张数,若无数据则为空字符串", + }, + "票据金额": { + "type": "string", + "description": "票据金额,若无数据则为空字符串,若有数据则保留两位小数", + }, + "诊断": { + "type": "string", + "description": "诊断,若无数据则为空字符串", + }, + }, + "required": [ + "发票日期", + "发票上的就诊医院/药店", + "诊断", + ], # 就诊记录必须字段 + "additionalProperties": False, # 禁止就就诊记录新增属性 + }, + }, + "票据金额合计": { + "type": "string", + "description": "票据金额合计,若无数据则为空字符串,若有数据则保留两位小数", + }, + "开户银行": { + "type": "string", + "description": "开户银行,若无数据则为空字符串,请注意开户银行可能为多行", + }, + "户名": { + "type": "string", + "description": "户名,若无数据则为空字符串", + }, + "账号": { + "type": "string", + "description": "账号,若无数据则为空字符串", + }, + }, + "required": [ + "申请人", + "手机", + "身份证号码", + "就诊记录", + "开户银行", + "户名", + "账号", + ], # 识别结果的JSON结构必须字段 + "additionalProperties": False, # 禁止就识别结果的JSON结构新增属性 + } - recognition = ( - json.loads(match.group("json")) - if ( - match := re.search( - r"```json\s*(?P\{.*})\s*```", - response["choices"][0]["message"]["content"], - re.DOTALL, - ) + # noinspection PyBroadException + try: + # 使用多模态大模型就理赔申请书进行光学字符识别并结构化识别结果 + recognition = mlm_recognize(image, schema) + + dossier["受益人层"].update( + { + "开户行": recognition["开户银行"], + "户名": recognition["户名"], + "户号": recognition["账号"], + "手机号": recognition["手机"], + } + ) + except Exception: + # TODO: 若非成功则流转至人工处理 + raise RuntimeError( + "就中银保险有限公司的理赔申请书识别并整合至赔案档案发生异常" ) - else None - ) - - print(recognition) - - exit() # 根据保险分公司匹配结构化识别文本方法 match insurer_company: @@ -679,24 +761,26 @@ if __name__ == "__main__": :return: 空 """ # 初始化票据数据 - receipt = {"影像件编号": image_index} - # 请求深圳快瞳票据查验接口(兼容增值税发票、医疗门诊/住院收费票据) + receipt = {"影像件编号": image["影像件编号"]} + # 先请求深圳快瞳票据查验接口(兼容增值税发票、医疗门诊/住院收费票据) response = http_client.post( url=(url := "https://ai.inspirvision.cn/s/api/ocr/invoiceCheckAll"), - headers={"X-RequestId-Header": image_guid}, + headers={"X-RequestId-Header": image["影像件唯一标识"]}, data={ "token": authenticator.get_token( servicer="szkt" ), # 获取深圳快瞳访问令牌 - "imgBase64": f"data:image/{image_format.lstrip(".")};base64,{image_base64}", + "imgBase64": f"data:image/{image["影像件格式"].lstrip(".")};base64,{image["影像件BASE64编码"]}", }, - guid=md5((url + image_guid).encode("utf-8")).hexdigest().upper(), + guid=md5((url + image["影像件唯一标识"]).encode("utf-8")) + .hexdigest() + .upper(), ) # 若查验成功则直接整合至赔案档案 if response.get("status") == 200 and response.get("code") == 10000: # noinspection PyTypeChecker match response["data"]["productCode"]: - # 增值税发票,目前深圳快瞳支持全电和全电纸质增值税发票查验 + # 增值税发票,目前深圳快瞳支持全电发票和全电纸质发票、区块链发票和增值税发票查验 case "003082": # noinspection PyTypeChecker receipt.update( @@ -706,8 +790,8 @@ if __name__ == "__main__": if response["data"]["details"]["invoiceTypeNo"] == "0" else "红票" - ), - "票据号码": response["data"]["details"]["number"], + ), # 红票为状态为失控、作废、已红冲、部分红冲和全额红冲的票据 + "票据号": response["data"]["details"]["number"], "票据代码": ( response["data"]["details"]["code"] if response["data"]["details"]["code"] @@ -715,9 +799,7 @@ if __name__ == "__main__": ), # 全电发票无发票代码,深圳快瞳票据查验接口中票据代码由空字符转为None "开票日期": datetime.strptime( response["data"]["details"]["date"], "%Y年%m月%d日" - ).strftime( - "%Y-%m-%d" - ), # 深圳快瞳票据查验接口中开票日期由字符串(%Y年%m月%d日)转为日期 + ), # 深圳快瞳票据查验接口中开票日期由字符串转为日期 "校验码": response["data"]["details"]["check_code"], "开票金额": Decimal( response["data"]["details"]["total"] @@ -725,49 +807,45 @@ if __name__ == "__main__": Decimal("0.00"), rounding=ROUND_HALF_UP, ), # 深圳快瞳票据查验接口中开票金额由字符串转为Decimal,保留两位小数 - "姓名": response["data"]["details"]["buyer"], + "出险人": response["data"]["details"]["buyer"], "购药及就医机构": ( institution := response["data"]["details"]["seller"] ), - "备注": ( - response["data"]["details"]["remark"] - if response["data"]["details"]["remark"] - else None - ), # 深圳快瞳票据查验接口中备注由空字符转为None - "费项层": [ - { - "名称": item["name"], - "规格": ( - item["specification"] - if item["specification"] - else None - ), # 深圳快瞳票据查验接口中明细规则由空字符转为None - "单位": ( - item["unit"] if item["unit"] else None - ), # 深圳快瞳票据查验接口中明细单位由空字符转为None - "数量": ( - Decimal(item["quantity"]).quantize( - Decimal("0.00"), - rounding=ROUND_HALF_UP, - ) - if item["quantity"] - else None - ), # 深圳快瞳票据查验接口中明细单位由空字符转为None,若非空字符由字符串转为Decimal,保留两位小数 - "金额": ( - Decimal(item["total"]) - + Decimal(item["tax"]) - ).quantize( - Decimal("0.00"), - rounding=ROUND_HALF_UP, - ), # 深圳快瞳票据查验接口中明细金额税额由字符串转为Decimal,保留两位小数,求和 - } - for item in response["data"]["details"].get( - "items", [] - ) - ], } ) + "费项层": [ + { + "名称": item["name"], + "规格": ( + item["specification"] + if item["specification"] + else None + ), # 深圳快瞳票据查验接口中明细规则由空字符转为None + "单位": ( + item["unit"] if item["unit"] else None + ), # 深圳快瞳票据查验接口中明细单位由空字符转为None + "数量": ( + Decimal(item["quantity"]).quantize( + Decimal("0.00"), + rounding=ROUND_HALF_UP, + ) + if item["quantity"] + else None + ), # 深圳快瞳票据查验接口中明细单位由空字符转为None,若非空字符由字符串转为Decimal,保留两位小数 + "金额": ( + Decimal(item["total"]) + + Decimal(item["tax"]) + ).quantize( + Decimal("0.00"), + rounding=ROUND_HALF_UP, + ), # 深圳快瞳票据查验接口中明细金额税额由字符串转为Decimal,保留两位小数,求和 + } + for item in response["data"]["details"].get( + "items", [] + ) + ], + # 查询并获取单条购药及就医机构类型 institution_type = master_data.query_institution_type( institution @@ -1396,8 +1474,9 @@ if __name__ == "__main__": dossier["受益人层"].update( { "开户行": response["data"]["bankInfo"], - "户名": None, + "户名": "", "户号": response["data"]["cardNo"].replace(" ", ""), + "手机号": "", } ) @@ -1428,8 +1507,7 @@ if __name__ == "__main__": application_recognize(image, insurer_company) case "增值税发票" | "医疗门诊收费票据" | "医疗住院收费票据": # 票据识别并整合至赔案档案 - # receipt_recognize(image) - pass + receipt_recognize(image) case "银行卡": # 银行卡识别并整合至赔案档案 bank_card_recognize(image) @@ -1445,9 +1523,7 @@ if __name__ == "__main__": "保险分公司": ( insurer_company := "中银保险有限公司苏州分公司" ), # 指定保险分公司 - "报案时间": datetime(2025, 7, 25, 12, 0, 0).strftime( - "%Y-%m-%d %H:%M:%S" - ), # 指定报案时间 + "报案时间": datetime(2025, 7, 25, 12, 0, 0), # 指定报案时间 "赔案号": (case_number := case_path.stem), # 设定:赔案目录名称为赔案号 }, "影像件层": [], @@ -1795,4 +1871,6 @@ if __name__ == "__main__": } ) ) + + """