Python/票据理赔自动化/abandoned.py

59 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

def general_text_recognize(image) -> str:
    """
    General text recognition (OCR) for a single image.

    Sends the image to the Shenzhen Kuaitong general OCR endpoint, groups the
    returned text blocks into visual rows, and returns their texts in reading
    order (top to bottom, then left to right). Every text block occupies its
    own line in the returned string; row grouping only determines the order.

    :param image: mapping describing the image; the keys read here are
        "影像件唯一标识" (unique id), "影像件格式" (file extension, leading dots
        are stripped) and "影像件BASE64编码" (base64 payload)
    :return: recognized text with all whitespace removed inside each block,
        blocks joined by "\\n"; an empty string when no block was recognized
    :raises RuntimeError: if the response does not report status 200 and code 0
    """
    url = "https://ai.inspirvision.cn/s/api/ocr/general"
    image_id = image["影像件唯一标识"]
    image_format = image["影像件格式"].lstrip(".")
    image_base64 = image["影像件BASE64编码"]

    # Request the Shenzhen Kuaitong general text recognition endpoint.
    response = http_client.post(
        url=url,
        # The image id doubles as the request id so both sides can trace a call.
        headers={"X-RequestId-Header": image_id},
        data={
            # Access token for the Shenzhen Kuaitong service.
            "token": authenticator.get_token(servicer="szkt"),
            "imgBase64": f"data:image/{image_format};base64,{image_base64}",
        },
        guid=md5((url + image_id).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route to manual processing when the response is not successful.
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")

    # Each entry: (x, y, height, text). x/y are presumably the top-left corner
    # of the block's polygon -- confirm against the provider's API reference.
    blocks = [
        (
            int(item["itemPolygon"]["x"]),
            int(item["itemPolygon"]["y"]),
            int(item["itemPolygon"]["height"]),
            item["value"],
        )
        for item in (response.get("data") or [])
    ]
    # Nothing recognized: previously this path crashed on an unbound local.
    if not blocks:
        return ""

    # "Tetris" arrangement: order blocks top to bottom by Y, then stack them
    # into rows.
    blocks.sort(key=lambda b: b[1])
    rows = [[blocks[0]]]
    for block in blocks[1:]:
        row = rows[-1]
        mean_y = sum(b[1] for b in row) / len(row)
        mean_height = sum(b[2] for b in row) / len(row)
        # A block belongs to the current row when its Y offset from the row's
        # mean Y is smaller than the row's mean block height; otherwise it
        # starts a new row.
        if block[1] - mean_y < mean_height:
            row.append(block)
        else:
            rows.append([block])

    # Within each row order blocks left to right by X and strip every
    # whitespace character from their text.
    texts = []
    for row in rows:
        texts.extend(
            re.sub(r"\s", "", b[3]) for b in sorted(row, key=lambda b: b[0])
        )
    return "\n".join(texts)