parent
b15b89e077
commit
9df91dac94
|
|
@ -0,0 +1,18 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
根据现普康票据理赔自动化最小化实现
|
||||
功能清单
|
||||
https://liubiren.feishu.cn/docx/WFjTdBpzroUjQvxxrNIcKvGnneh?from=from_copylink
|
||||
"""
|
||||
|
||||
from decimal import Decimal, ROUND_HALF_UP
|
||||
|
||||
# Round the sample amount to two decimal places; ROUND_HALF_UP sends ties away from zero
bill_amount = Decimal("1.2223").quantize(Decimal("0.00"), ROUND_HALF_UP)

# Collect the rounded amount under the receipt-amount key and show it
result = {}
result["票据金额"] = bill_amount

print(result)
|
||||
530
票据理赔自动化/main.py
530
票据理赔自动化/main.py
|
|
@ -26,6 +26,7 @@ from zen import ZenDecision, ZenEngine
|
|||
from utils.client import Authenticator, HTTPClient
|
||||
|
||||
|
||||
|
||||
# from utils.ocr import fuzzy_match
|
||||
|
||||
|
||||
|
|
@ -439,9 +440,9 @@ if __name__ == "__main__":
|
|||
case (1, _):
|
||||
image_type = "医疗费用清单"
|
||||
case (5, _):
|
||||
image_type = "门诊收费票据"
|
||||
image_type = "医疗门诊收费票据"
|
||||
case (3, _):
|
||||
image_type = "住院收费票据"
|
||||
image_type = "医疗住院收费票据"
|
||||
case (18, _):
|
||||
image_type = "理赔申请书"
|
||||
case _:
|
||||
|
|
@ -632,221 +633,195 @@ if __name__ == "__main__":
|
|||
)
|
||||
# 若查验为真票或红票则直接整合至赔案档案
|
||||
if response.get("status") == 200 and response.get("code") == 10000:
|
||||
pass
|
||||
|
||||
# 若查验为假票或无法查验
|
||||
else:
|
||||
if response.get("status") == 400 and (response.get("code") == 10100 or response.get("code") == 10001):
|
||||
receipt["查验结果"] = "假票"
|
||||
else:
|
||||
receipt["查验结果"] = "无法查验"
|
||||
|
||||
|
||||
|
||||
match response["data"]["productCode"]:
|
||||
case "003082": # 增值税发票
|
||||
extraction = {
|
||||
"票据类型": {
|
||||
"10108": "数电票",
|
||||
"10101": "增值税普通发票",
|
||||
"10100": "增值税专用发票",
|
||||
"30100": "数电票",
|
||||
"30101": "数电票",
|
||||
"30104": "增值税专用发票",
|
||||
"30105": "数电票",
|
||||
"10106": "区块链电子发票",
|
||||
"30109": "数电票",
|
||||
"30121": "增值税普通发票",
|
||||
"10102": "增值税普通发票",
|
||||
"10103": "增值税普通发票",
|
||||
"10107": "数电票",
|
||||
}.get(response["data"]["type"], "其它增值税发票"),
|
||||
"票据号码": response["data"]["details"]["number"],
|
||||
"票据代码": (
|
||||
code
|
||||
if (code := response["data"]["details"]["code"])
|
||||
else None
|
||||
), # 深圳快瞳票据查验中数电票票据代码为空字符,转为NONE
|
||||
"开票日期": datetime.strptime(
|
||||
response["data"]["details"]["date"], "%Y年%m月%d日"
|
||||
).strftime(
|
||||
"%Y-%m-%d"
|
||||
), # 深圳快瞳票据查验中就增值税发票开票日期格式为%Y年%m月%d日,转为%Y-%m-%d
|
||||
"校验码": response["data"]["details"]["check_code"],
|
||||
"收款方": response["data"]["details"]["seller"],
|
||||
"付款方": response["data"]["details"]["buyer"],
|
||||
"票据金额": format(
|
||||
Decimal(response["data"]["details"]["total"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
# noinspection PyTypeChecker
|
||||
match response["data"]["productCode"]:
|
||||
# 增值税发票,目前深圳快瞳支持全电和全电纸质增值税发票查验
|
||||
case "003082":
|
||||
# noinspection PyTypeChecker
|
||||
receipt.update({
|
||||
"查验状态": (
|
||||
"真票"
|
||||
if response["data"]["details"]["invoiceTypeNo"] == "0"
|
||||
else "红票"
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"查验状态": (
|
||||
"真票"
|
||||
if response["data"]["details"]["invoiceTypeNo"] == "0"
|
||||
else "红票"
|
||||
),
|
||||
"备注": (
|
||||
remark
|
||||
if (remark := response["data"]["details"]["remark"])
|
||||
else None
|
||||
), # 深圳快瞳票据查验中增值税发票备注可能为空字符,转为NONE
|
||||
"项目": [
|
||||
{
|
||||
"名称": item["name"],
|
||||
"规格": (
|
||||
specification
|
||||
if (specification := item["specification"])
|
||||
else None
|
||||
"票据号码": response["data"]["details"]["number"],
|
||||
"票据代码": (
|
||||
response["data"]["details"]["code"]
|
||||
if response["data"]["details"]["code"]
|
||||
else None
|
||||
), # 全电发票无发票代码,深圳快瞳票据查验接口中票据代码由空字符转为None
|
||||
"开票日期": datetime.strptime(
|
||||
response["data"]["details"]["date"], "%Y年%m月%d日"
|
||||
).strftime(
|
||||
"%Y-%m-%d"
|
||||
), # 深圳快瞳票据查验接口中开票日期由字符串(%Y年%m月%d日)转为日期
|
||||
"校验码": response["data"]["details"]["check_code"],
|
||||
"开票金额": Decimal(response["data"]["details"]["total"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
), # 深圳快瞳票据查验接口中开票金额由字符串转为Decimal,保留两位小数
|
||||
"姓名": response["data"]["details"]["buyer"],
|
||||
"购药及就医机构": response["data"]["details"]["seller"],
|
||||
"备注": (
|
||||
response["data"]["details"]["remark"]
|
||||
if response["data"]["details"]["remark"]
|
||||
else None
|
||||
), # 深圳快瞳票据查验接口中备注由空字符转为None
|
||||
"明细层": [
|
||||
{
|
||||
"名称": item["name"],
|
||||
"规格": (
|
||||
item["specification"]
|
||||
if item["specification"]
|
||||
else None
|
||||
), # 深圳快瞳票据查验接口中明细规则由空字符转为None
|
||||
"单位": (
|
||||
item["unit"]
|
||||
if item["unit"]
|
||||
else None
|
||||
), # 深圳快瞳票据查验接口中明细单位由空字符转为None
|
||||
"数量": (Decimal(item["quantity"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
)
|
||||
if item["quantity"]
|
||||
else None
|
||||
), # 深圳快瞳票据查验接口中明细单位由空字符转为None,若非空字符由字符串转为Decimal,保留两位小数
|
||||
"金额": (
|
||||
Decimal(item["total"])
|
||||
+ Decimal(item["tax"])
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
), # 深圳快瞳票据查验接口中明细金额税额由字符串转为Decimal,保留两位小数,求和
|
||||
}
|
||||
for item in response["data"]["details"].get("items", [])
|
||||
],
|
||||
})
|
||||
# 门诊/住院收费票据
|
||||
case "003081":
|
||||
# noinspection PyTypeChecker
|
||||
receipt.update({
|
||||
"查验状态": (
|
||||
"真票"
|
||||
if response["data"]["flushedRed"] == "true"
|
||||
else "红票"
|
||||
),
|
||||
"票据号码": response["data"]["billNumber"],
|
||||
"票据代码": (
|
||||
response["data"]["billCode"]
|
||||
if response["data"]["billCode"]
|
||||
else None
|
||||
), # 部分地区医疗收费票据无发票代码,深圳快瞳票据查验接口中票据代码由空字符转为None
|
||||
"开票日期": parse(response["data"][
|
||||
"invoiceDate"
|
||||
]).strftime(
|
||||
"%Y-%m-%d"
|
||||
), # 深圳快瞳票据查验接口中开票日期由字符串(%Y-%m-%d)转为日期
|
||||
"校验码": response["data"]["checkCode"],
|
||||
"票据金额": Decimal(response["data"]["amount"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
"单位": unit if (unit := item["unit"]) else None,
|
||||
"数量": (
|
||||
format(
|
||||
Decimal(quantity).quantize(
|
||||
"姓名": response["data"]["payer"],
|
||||
"购药及就医机构": response["data"]["receivablesInstitution"],
|
||||
|
||||
|
||||
|
||||
"医保支付": format(
|
||||
Decimal(
|
||||
response["data"].get("medicarePay", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"其它支付": format(
|
||||
Decimal(
|
||||
response["data"].get("otherPayment", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"个人自付": format(
|
||||
Decimal(
|
||||
response["data"].get("personalPay", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"自付一": format(
|
||||
Decimal(
|
||||
response["data"].get("self_pay_one", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
), # 深圳快瞳票据查验中就部分地区无自付一
|
||||
"自付二": format(
|
||||
Decimal(
|
||||
response["data"].get("classificationPays", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
), # 深圳快瞳票据查验中就部分地区无自付二
|
||||
"个人自费": format(
|
||||
Decimal(
|
||||
response["data"].get("personalExpense", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"住院日期": (
|
||||
parse(date.split("-")[0]).strftime("%Y-%m-%d")
|
||||
if (date := response["data"].get("hospitalizationDate"))
|
||||
else None
|
||||
), # 深圳快瞳票据查验中就收费票据住院日期格式为%Y%m%d-%Y%m%d,即住院日期-出院日期
|
||||
"出院日期": (
|
||||
parse(date.split("-")[1]).strftime("%Y-%m-%d")
|
||||
if date
|
||||
else None
|
||||
),
|
||||
"医疗机构类型": response["data"]["institutionsType"],
|
||||
"项目": [
|
||||
{
|
||||
"名称": item["itemName"],
|
||||
"规格": item[
|
||||
"medical_level"
|
||||
], # 甲类无自付、乙类有自付、丙类全自付
|
||||
"单位": item["unit"],
|
||||
"数量": format(
|
||||
Decimal(item["number"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
)
|
||||
if (quantity := item["quantity"])
|
||||
else None
|
||||
),
|
||||
"金额": format(
|
||||
(
|
||||
Decimal(item["total"])
|
||||
+ Decimal(item["tax"])
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
), # 价税合计
|
||||
}
|
||||
for item in response["data"]["details"].get("items", [])
|
||||
],
|
||||
} # 深圳快瞳票据查验中就部分增值税发票仅可查,数据标准化抛出异常
|
||||
return extraction
|
||||
"金额": format(
|
||||
Decimal(item["totalAmount"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
}
|
||||
for item in response["data"]["feedetails"]
|
||||
],
|
||||
})
|
||||
|
||||
|
||||
# 若请求深圳快瞳票据查验接口或解析发生异常,则根据影像件类型请求深圳快瞳增值税发票/收费票据识别接口
|
||||
|
||||
case "003081": # 门诊/住院收费票据
|
||||
extraction = {
|
||||
"票据类型": (
|
||||
"门诊收费票据"
|
||||
if "门诊" in response["data"]["billName"]
|
||||
else "住院收费票据"
|
||||
),
|
||||
"票据号码": response["data"]["billNumber"],
|
||||
"票据代码": response["data"]["billCode"],
|
||||
"开票日期": response["data"][
|
||||
"invoiceDate"
|
||||
], # 深圳快瞳票据查验中就收费票据开票日期格式为%Y-%m-%d
|
||||
"校验码": response["data"]["checkCode"],
|
||||
"收款方": response["data"]["payeeName"],
|
||||
"付款方": response["data"]["payer"],
|
||||
"票据金额": format(
|
||||
Decimal(response["data"]["amount"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"查验状态": {"true": "真票", "false": "红票"}[
|
||||
response["data"]["flushedRed"]
|
||||
],
|
||||
"备注": response["data"].get("remark"),
|
||||
"医保支付": format(
|
||||
Decimal(
|
||||
response["data"].get("medicarePay", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"其它支付": format(
|
||||
Decimal(
|
||||
response["data"].get("otherPayment", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"个人自付": format(
|
||||
Decimal(
|
||||
response["data"].get("personalPay", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"自付一": format(
|
||||
Decimal(
|
||||
response["data"].get("self_pay_one", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
), # 深圳快瞳票据查验中就部分地区无自付一
|
||||
"自付二": format(
|
||||
Decimal(
|
||||
response["data"].get("classificationPays", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
), # 深圳快瞳票据查验中就部分地区无自付二
|
||||
"个人自费": format(
|
||||
Decimal(
|
||||
response["data"].get("personalExpense", "0.00")
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"住院日期": (
|
||||
parse(date.split("-")[0]).strftime("%Y-%m-%d")
|
||||
if (date := response["data"].get("hospitalizationDate"))
|
||||
else None
|
||||
), # 深圳快瞳票据查验中就收费票据住院日期格式为%Y%m%d-%Y%m%d,即住院日期-出院日期
|
||||
"出院日期": (
|
||||
parse(date.split("-")[1]).strftime("%Y-%m-%d")
|
||||
if date
|
||||
else None
|
||||
),
|
||||
"医疗机构类型": response["data"]["institutionsType"],
|
||||
"项目": [
|
||||
{
|
||||
"名称": item["itemName"],
|
||||
"规格": item[
|
||||
"medical_level"
|
||||
], # 甲类无自付、乙类有自付、丙类全自付
|
||||
"单位": item["unit"],
|
||||
"数量": format(
|
||||
Decimal(item["number"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
"金额": format(
|
||||
Decimal(item["totalAmount"]).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
),
|
||||
".2f",
|
||||
),
|
||||
}
|
||||
for item in response["data"]["feedetails"]
|
||||
],
|
||||
}
|
||||
return extraction
|
||||
# 若请求深圳快瞳票据查验接口或解析发生异常,则根据影像件类型请求深圳快瞳增值税发票/收费票据识别接口
|
||||
except:
|
||||
# 影像件类型
|
||||
image_type = kwargs.get("image_type", globals()["image_type"])
|
||||
if image_type is None:
|
||||
|
|
@ -870,11 +845,11 @@ if __name__ == "__main__":
|
|||
guid=hashlib.md5((url + image_guid).encode("utf-8"))
|
||||
.hexdigest()
|
||||
.upper(),
|
||||
)
|
||||
# 若深圳快瞳增值税发票识别响应非成功则返回NONE
|
||||
)
|
||||
# 若深圳快瞳增值税发票识别响应非成功则返回None
|
||||
if not (
|
||||
response.get("status") == 200
|
||||
and response.get("code") == 0
|
||||
response.get("status") == 200
|
||||
and response.get("code") == 0
|
||||
):
|
||||
return None
|
||||
|
||||
|
|
@ -941,7 +916,7 @@ if __name__ == "__main__":
|
|||
),
|
||||
"金额": format(
|
||||
(
|
||||
Decimal(amount) + Decimal(tax)
|
||||
Decimal(amount) + Decimal(tax)
|
||||
).quantize(
|
||||
Decimal("0.00"),
|
||||
rounding=ROUND_HALF_UP,
|
||||
|
|
@ -950,55 +925,55 @@ if __name__ == "__main__":
|
|||
),
|
||||
}
|
||||
for name, specification, unit, quantity, amount, tax in zip(
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^项目名称(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^规格型号(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^单位(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^数量(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^金额(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^税额(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^项目名称(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^规格型号(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^单位(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^数量(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^金额(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^税额(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
)
|
||||
]
|
||||
if invoice_type == "电子发票(普通发票)"
|
||||
else [
|
||||
|
|
@ -1024,31 +999,41 @@ if __name__ == "__main__":
|
|||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^项目名称明细(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
r"^项目名称明细(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^项目数量明细(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
r"^项目数量明细(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
[
|
||||
component["value"]
|
||||
for component in response["data"]
|
||||
if re.match(
|
||||
r"^项目金额明细(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
r"^项目金额明细(\d+)?$",
|
||||
component["desc"],
|
||||
)
|
||||
],
|
||||
)
|
||||
]
|
||||
),
|
||||
"查验状态": "无法查验",
|
||||
}
|
||||
|
||||
# 若查验为假票或无法查验
|
||||
else:
|
||||
if response.get("status") == 400 and (response.get("code") == 10100 or response.get("code") == 10001):
|
||||
receipt["查验结果"] = "假票"
|
||||
else:
|
||||
receipt["查验结果"] = "无法查验"
|
||||
|
||||
|
||||
|
||||
return extraction
|
||||
except:
|
||||
return None
|
||||
|
|
@ -1307,7 +1292,7 @@ if __name__ == "__main__":
|
|||
# 银行卡识别并整合至赔案档案
|
||||
bankcard_recognize(image_guid, image_format, image_base64)
|
||||
# TODO: 暂仅支持增值税发票识别和购药及就医类型为药店购药整合至赔案档案,后续逐步添加
|
||||
case "增值税发票" | "门诊收费票据" | "住院收费票据":
|
||||
case "增值税发票" | "医疗门诊收费票据" | "医疗住院收费票据":
|
||||
# 票据识别并整合至赔案档案
|
||||
receipt_recognize(image_guid, image_format, image_base64, image_type)
|
||||
|
||||
|
|
@ -1373,7 +1358,6 @@ if __name__ == "__main__":
|
|||
image_index, image_guid, image_format, image_base64, insurance_branch, image_type
|
||||
)
|
||||
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
|
|
|||
221
票据理赔自动化/test.py
221
票据理赔自动化/test.py
|
|
@ -1,221 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import re
|
||||
from csv import DictReader, DictWriter
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
import torch
|
||||
from transformers import BertTokenizerFast, BertForTokenClassification
|
||||
|
||||
|
||||
# 命名实体识别
|
||||
class NER:
    """Drug named-entity recognizer built on a BERT token-classification model.

    The tokenizer and the model are both loaded from the local directory
    ``./models/bert-base-chinese`` (resolved against the current working
    directory), and the model is switched to evaluation mode.
    """

    # Tokens that are never part of an entity
    _SPECIAL_TOKENS = ("[CLS]", "[SEP]", "[PAD]")

    def __init__(self):
        # Mapping from predicted class index to BIO tag
        self.label_map = {
            0: "O",  # not part of a drug entity
            1: "B-DRUG",  # first token of a drug entity
            2: "I-DRUG",  # continuation token of a drug entity
        }

        model_path = Path("./models/bert-base-chinese").resolve()

        # Load the pretrained tokenizer
        self.tokenizer = BertTokenizerFast.from_pretrained(
            pretrained_model_name_or_path=model_path
        )

        # Load the pretrained model
        self.model = BertForTokenClassification.from_pretrained(
            pretrained_model_name_or_path=model_path,
        )

        # Put the model into evaluation (inference) mode
        self.model.eval()

    def recognize_drugs(self, text: str) -> List[Dict]:
        """Recognize drug named entities in *text*.

        Args:
            text: The raw text to scan.

        Returns:
            One dict per entity, in order of appearance, with the keys
            ``start`` / ``end`` (character offsets into *text*), ``tokens``,
            ``offsets`` (one ``[start, end]`` pair per token), ``type`` (the
            tag of the token that opened the entity) and ``text`` (the
            matching slice of *text*). Blank input yields an empty list.
        """
        if not text.strip():
            return []

        # Tokenize and encode
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            return_offsets_mapping=True,
        )

        # Character span of every token; popped so it is not unpacked into the
        # model call below
        offset_mapping = inputs.pop("offset_mapping")[0].tolist()

        with torch.no_grad():
            outputs = self.model(**inputs)
            # Highest-scoring class index for every token
            predictions = torch.argmax(outputs.logits, dim=2)

        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        label_ids = predictions[0].tolist()

        entities = []
        current_entity = None

        for token, offset, label_id in zip(tokens, offset_mapping, label_ids):
            start, end = offset

            # Special tokens and zero-width spans carry no text; skip them
            # without closing the entity that is currently open
            if token in self._SPECIAL_TOKENS or (start == 0 and end == 0):
                continue

            # Unknown class indices are treated as "outside any entity"
            label = self.label_map.get(label_id, "O")

            if label == "B-DRUG":
                self._close_entity(current_entity, text, entities)
                current_entity = self._open_entity(token, offset, label)

            elif label == "I-DRUG":
                # A continuation tag with no open entity is ignored
                if current_entity is None:
                    continue

                if start == current_entity["end"]:
                    # Directly adjacent: extend the open entity
                    current_entity["end"] = end
                    current_entity["tokens"].append(token)
                    current_entity["offsets"].append(offset)
                else:
                    # Not adjacent: close the open entity and start a new one
                    self._close_entity(current_entity, text, entities)
                    current_entity = self._open_entity(token, offset, label)

            else:
                self._close_entity(current_entity, text, entities)
                current_entity = None

        # Flush the entity still open at the end of the sequence
        self._close_entity(current_entity, text, entities)

        return entities

    @staticmethod
    def _open_entity(token: str, offset, label: str) -> Dict:
        """Create a new entity record seeded with a single token."""
        return {
            "start": offset[0],
            "end": offset[1],
            "tokens": [token],
            "offsets": [offset],
            "type": label,
        }

    def _close_entity(self, entity, text: str, entities: List[Dict]) -> None:
        """Finalize *entity* (if any) and append it to *entities*."""
        if entity:
            self._combine_tokens(entity, text)
            entities.append(entity)

    @staticmethod
    def _combine_tokens(current_entity: Dict, text: str):
        """Store the entity's surface text, sliced from *text* by its offsets."""
        current_entity["text"] = text[current_entity["start"] : current_entity["end"]]
|
||||
|
||||
|
||||
"""
|
||||
|
||||
# 使用示例(需要训练好的模型)
|
||||
dl_ner = NER()
|
||||
text = "患者需要硫酸吗啡缓释片治疗癌症疼痛"
|
||||
entities = dl_ner.recognize_drugs(text)
|
||||
print(entities)
|
||||
|
||||
exit()
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def drug_extraction(text) -> tuple[str, str | None]:
|
||||
"""药品数据提取"""
|
||||
|
||||
# 正则匹配两个“*”之间内容作为药品类别,第二个“*”之后内容作为药品名称。
|
||||
if match := re.match(
|
||||
pattern=r"\*(?P<drug_type>.*?)\*(?P<drug_name>.*)",
|
||||
string=(text := text.strip()),
|
||||
):
|
||||
# 药品类别
|
||||
drug_type = match.group("drug_type").strip()
|
||||
|
||||
# 药品名称
|
||||
drug_name = (
|
||||
match.group("drug_name")
|
||||
.upper() # 小写转大写
|
||||
.replace("(", " ")
|
||||
.replace(")", " ")
|
||||
.replace("(", " ")
|
||||
.replace(")", " ")
|
||||
.replace("[", " ")
|
||||
.replace("]", " ")
|
||||
.replace("【", " ")
|
||||
.replace("】", " ")
|
||||
.replace(":", " ")
|
||||
.replace(":", " ")
|
||||
.replace(",", " ")
|
||||
.replace(",", " ")
|
||||
.replace("·", " ")
|
||||
.replace("`", " ")
|
||||
.replace("@", " ")
|
||||
.replace("#", " ")
|
||||
.replace("*", " ")
|
||||
.replace("/", " ") # 就指定符号替换为空格
|
||||
.strip()
|
||||
)
|
||||
|
||||
# 就药品名称中多个空格替换为一个空格
|
||||
drug_name = re.sub(pattern=r"\s+", repl=" ", string=drug_name)
|
||||
|
||||
for section in drug_name.split(" "):
|
||||
print(section)
|
||||
|
||||
# 若匹配失败则药品类型默认为文本、药品名称默认为None
|
||||
else:
|
||||
drug_type, drug_name = text, None
|
||||
|
||||
return drug_type, drug_name
|
||||
|
||||
|
||||
# Rows intended for the output CSV.
# NOTE(review): nothing in the visible code appends to this list.
dataframe = []
|
||||

||||
# Clean the "receipt verification result / disease" mapping data
# (for now only VAT invoices verified as genuine are considered)
|
||||
with open("票据查验结果和疾病对应关系.csv", "r", encoding="utf-8") as file:
|
||||
for row in DictReader(file):
|
||||
try:
|
||||
# NOTE(review): `disease` is read but not used anywhere in the visible code
disease = row["疾病"]
|
||||

||||
response = json.loads(row["票据查验结果"])
|
||||

||||
# Iterate over the invoice line items
|
||||
for item in response["data"]["details"]["items"]:
|
||||

||||
name = item["name"]
|
||||

||||
drug_extraction(name)
|
||||

||||
# NOTE(review): indentation is lost in this view, so it cannot be told here
# whether this exit() runs after the first item or after the first row — confirm
exit()
|
||||

||||
except Exception as e:
|
||||
print(e)
|
||||
exit()
|
||||

||||
# Write the collected rows to 1.csv, using the first row's keys as the header.
# NOTE(review): with `dataframe` still empty, dataframe[0] raises IndexError
# if execution reaches this point.
with open("1.csv", "w", newline="", encoding="utf-8") as file:
|
||||
writer = DictWriter(file, fieldnames=dataframe[0].keys())
|
||||
writer.writeheader()
|
||||
writer.writerows(dataframe)
|
||||
Loading…
Reference in New Issue