Python/票据理赔自动化/abandoned.py

175 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

def _group_blocks_into_lines(blocks):
    """
    Group OCR text blocks into visual text lines, top to bottom.

    :param blocks: list of ``[x, y, height, text]`` entries
    :return: list of lines; each line is a list of blocks in ascending Y order
    """
    lines = []
    current = []
    # Walk the blocks from top to bottom (stable sort keeps ties in input order)
    for block in sorted(blocks, key=lambda b: b[1]):
        if current:
            mean_y = sum(b[1] for b in current) / len(current)
            mean_height = sum(b[2] for b in current) / len(current)
            # A block belongs to the current line while its Y offset from the
            # line's mean Y stays below the line's mean block height
            if block[1] - mean_y < mean_height:
                current.append(block)
                continue
            lines.append(current)
        current = [block]
    # Flush the last line; nothing to flush when there were no blocks at all
    if current:
        lines.append(current)
    return lines


def general_text_recognize(image) -> str:
    """
    General text recognition (OCR) for a single image.

    Posts the image to the Shenzhen KuaiTong general text recognition
    endpoint, arranges the returned text blocks top-to-bottom and then
    left-to-right, strips all whitespace inside each block and returns the
    blocks joined by newlines (one text block per output line).

    :param image: image record; the keys "影像件唯一标识", "影像件格式" and
        "影像件BASE64编码" are read
    :return: recognized text; an empty string when no text block is returned
    :raises RuntimeError: when the response does not report status 200 and code 0
    """
    # Call the Shenzhen KuaiTong general text recognition endpoint
    response = http_client.post(
        url=(url := "https://ai.inspirvision.cn/s/api/ocr/general"),
        headers={
            "X-RequestId-Header": image["影像件唯一标识"]
        },  # the image's unique id doubles as the request id so both sides can trace it
        data={
            "token": authenticator.get_token(servicer="szkt"),  # access token for the service
            "imgBase64": f"data:image/{image['影像件格式'].lstrip('.')};base64,{image['影像件BASE64编码']}",
        },
        guid=md5((url + image["影像件唯一标识"]).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route to manual handling when the response is not successful
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")

    # noinspection PyTypeChecker
    blocks = [
        [
            int(block["itemPolygon"]["x"]),  # X coordinate of the block's top-left corner
            int(block["itemPolygon"]["y"]),  # Y coordinate of the block's top-left corner
            int(block["itemPolygon"]["height"]),  # height of the block
            block["value"],  # text content of the block
        ]
        for block in response["data"]
    ]

    texts = []
    for line in _group_blocks_into_lines(blocks):
        # Within a line read left to right, and drop every whitespace character
        texts.extend(
            re.sub(r"\s", "", entry[3]) for entry in sorted(line, key=lambda b: b[0])
        )
    return "\n".join(texts)
class JiojioTokenizer:
    """Chinese word tokenizer backed by jiojio."""

    def __init__(self):
        """
        Initialize the jiojio tokenizer.

        :raises RuntimeError: when jiojio fails to initialize; the original
            exception is attached as ``__cause__``
        """
        # noinspection PyBroadException
        try:
            jiojio.init()
        except Exception as exc:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit still propagate, and keep the root cause for debugging
            raise RuntimeError("初始化jiojio分词器发生异常") from exc

    # noinspection PyShadowingNames
    @staticmethod
    def callback(text: str, flags: int, cursor) -> None:
        """
        Tokenizer callback.

        :param text: text to tokenize
        :param flags: FTS5 tokenize-reason flags (not used here)
        :param cursor: object receiving the tokens; each token is passed to
            ``cursor.send`` as a ``(token, begin_idx, end_idx)`` tuple
        :return: None
        """
        # Nothing to do for empty or non-string input
        if not text or not isinstance(text, str):
            return

        tokens = []
        begin_idx = 0  # start index of the current word
        for word in jiojio.cut(text):
            end_idx = begin_idx + len(word)
            # Whitespace-only words are not emitted but still advance the offset
            if word.strip():
                # SQLite FTS5 requires each token's start and end index
                tokens.append((word, begin_idx, end_idx))
            begin_idx = end_idx

        # Tokens are sent only after the whole text has been segmented
        for token in tokens:
            cursor.send(token)
# Instantiate the jiojio tokenizer and keep it on self.threads
self.threads.jiojio_tokenizer = self.JiojioTokenizer()


# Factory that wraps a tokenizer in a module object exposing `tokenize`
def create_tokenizer_module(tokenizer):
    class JiojioTokenizerModule:
        """Module object whose `tokenize` forwards to the tokenizer's callback."""

        # noinspection PyShadowingNames
        @staticmethod
        def tokenize(text: str, flags: int, cursor) -> None:
            tokenizer.callback(text, flags, cursor)

    return JiojioTokenizerModule()


# Register the module under the name used by the virtual table below
jiojio_module = create_tokenizer_module(self.threads.jiojio_tokenizer)
self.threads.connection.create_module("jiojio_fts5_module", jiojio_module)

# Create the tokenizer virtual table if it does not exist yet
create_table_sql = """
CREATE VIRTUAL TABLE IF NOT EXISTS jiojio_tokenizer USING fts5tokenizer(jiojio_fts5_module)
"""
self.threads.connection.execute(create_table_sql)
# Sample payload: an envelope with code 0 / status 200 whose "data" is a list
# of {"desc", "value"} fields; the field names suggest an invoice recognition
# result. NOTE(review): as a bare dict literal this is evaluated and discarded,
# presumably pasted here as reference data — confirm, and check whether the
# names and identifiers are real personal data that should be anonymized.
{
"code": 0,
"status": 200,
"message": "success",
"serialNo": "3a08935648632621760512",
"data": [
{"desc": "金额", "value": "175.22"},
{
"desc": "项目名称",
"value": "*化学药品制剂*[海露]玻璃酸钠滴眼液0.1%*10ml支/盒",
},
{"desc": "数量", "value": "2"},
{"desc": "规格型号", "value": ""},
{"desc": "税额", "value": "22.78"},
{"desc": "税率", "value": "13%"},
{"desc": "单位", "value": ""},
{"desc": "单价", "value": "87.61"},
{"desc": "金额1", "value": "-69.42"},
{
"desc": "项目名称1",
"value": "*化学药品制剂*[海露]玻璃酸钠滴眼液0.1%*10ml/支/盒",
},
{"desc": "数量1", "value": ""},
{"desc": "规格型号1", "value": ""},
{"desc": "税额1", "value": "-9.02"},
{"desc": "税率1", "value": "13%"},
{"desc": "单位1", "value": ""},
{"desc": "单价1", "value": ""},
{"desc": "发票名称", "value": "电子发票(普通发票)"},
{"desc": "全电票标签", "value": ""},
{"desc": "发票号码", "value": "25447200000045325946"},
{"desc": "开票日期", "value": "2025年01月20日"},
{"desc": "购买方名称", "value": "唐敏华"},
{"desc": "购买方识别号", "value": ""},
{"desc": "销售方名称", "value": "广州美团大药房有限公司"},
{"desc": "销售方识别号", "value": "91440100MAC1CAJH27"},
{"desc": "合计金额", "value": "¥105.80"},
{"desc": "合计税额", "value": "¥13.76"},
{"desc": "金额小计", "value": ""},
{"desc": "税额小计", "value": ""},
{"desc": "价税合计(大写)", "value": "壹佰壹拾玖圆伍角陆分"},
{"desc": "小写金额", "value": "¥119.56"},
{"desc": "备注", "value": ""},
{"desc": "开票人", "value": "张景景"},
{"desc": "发票类型", "value": "电子发票(普通发票)"},
{"desc": "监制章存在性判断", "value": "True"},
{"desc": "总页数", "value": ""},
{"desc": "当前页数", "value": ""},
],
}