175 lines
6.8 KiB
Python
175 lines
6.8 KiB
Python
def general_text_recognize(image) -> str:
    """
    Recognize the text of an image via the Shenzhen Kuaitong general OCR API.

    The returned text blocks are grouped into text lines from top to bottom,
    ordered from left to right inside each line, stripped of every whitespace
    character and joined with newlines (one text block per output line).

    :param image: mapping providing the keys "影像件唯一标识" (unique id),
        "影像件格式" (file extension) and "影像件BASE64编码" (base64 payload)
    :return: recognized text; an empty string when the API returns no block
    :raises RuntimeError: if the response is not ``status == 200`` and ``code == 0``
    """
    url = "https://ai.inspirvision.cn/s/api/ocr/general"
    image_id = image["影像件唯一标识"]
    # Resolved up front so the f-string below needs no nested quotes.
    image_format = image["影像件格式"].lstrip(".")
    image_base64 = image["影像件BASE64编码"]

    # Call the Shenzhen Kuaitong general text recognition endpoint.
    response = http_client.post(
        url=url,
        # The image's unique id doubles as the request id so that both
        # parties can trace the same request.
        headers={"X-RequestId-Header": image_id},
        data={
            # Access token for the Shenzhen Kuaitong service.
            "token": authenticator.get_token(servicer="szkt"),
            "imgBase64": f"data:image/{image_format};base64,{image_base64}",
        },
        guid=md5((url + image_id).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: hand the image over to manual processing when the response is not successful
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")

    # noinspection PyTypeChecker
    blocks = [
        (
            int(item["itemPolygon"]["x"]),  # X coordinate of the block's top-left corner
            int(item["itemPolygon"]["y"]),  # Y coordinate of the block's top-left corner
            int(item["itemPolygon"]["height"]),  # height of the block
            item["value"],  # text content of the block
        )
        for item in response["data"]
    ]

    # Within a line, order blocks left to right and drop all whitespace
    # characters from their text.
    return "\n".join(
        re.sub(r"\s", "", block[3])
        for line in _group_blocks_into_lines(blocks)
        for block in sorted(line, key=lambda item: item[0])
    )


def _group_blocks_into_lines(blocks):
    """Group ``(x, y, height, text)`` blocks into text lines, top to bottom."""
    lines = []
    # "Tetris"-style arrangement: walk the blocks by ascending Y coordinate.
    for block in sorted(blocks, key=lambda item: item[1]):
        if lines:
            current = lines[-1]
            mean_y = sum(item[1] for item in current) / len(current)
            mean_height = sum(item[2] for item in current) / len(current)
            # A block joins the current line when its Y offset from the line's
            # mean Y is smaller than the line's mean height.
            if block[1] - mean_y < mean_height:
                current.append(block)
                continue
        # First block, or the block is too far down: start a new line.
        lines.append([block])
    return lines
|
||
|
||
class JiojioTokenizer:
    """Chinese word tokenizer backed by jiojio."""

    def __init__(self):
        """
        Initialize the jiojio tokenizer.

        :raises RuntimeError: if ``jiojio.init()`` fails; the original error is
            chained as ``__cause__``
        """
        # noinspection PyBroadException
        try:
            jiojio.init()
        except Exception as exception:
            # ``Exception`` (not a bare ``except``) so KeyboardInterrupt and
            # SystemExit still propagate unchanged.
            raise RuntimeError("初始化jiojio分词器发生异常") from exception

    # noinspection PyShadowingNames
    @staticmethod
    def callback(text: str, flags: int, cursor) -> None:
        """
        Tokenization callback.

        Cuts ``text`` with jiojio and sends one ``(token, begin, end)`` tuple per
        non-blank word to ``cursor``.

        :param text: text to tokenize; empty or non-string input is ignored
        :param flags: FTS5 tokenization flags (not used by this implementation)
        :param cursor: FTS5 cursor that receives the tokens through ``send``
        :return: None
        """
        if not text or not isinstance(text, str):
            return

        # NOTE(review): offsets are counted in characters and assume that the
        # words returned by jiojio.cut concatenate back to ``text`` - confirm
        # against the FTS5 binding in use.
        tokens = []
        begin_idx = 0  # start index of the current word
        for word in jiojio.cut(text):
            end_idx = begin_idx + len(word)
            # Blank words are not emitted, but they still advance the offset.
            if word.strip():
                # SQLite FTS5 requires the start and end index of every token.
                tokens.append((word, begin_idx, end_idx))
            begin_idx = end_idx

        # Tokens are sent only after the whole text has been cut.
        for token, begin_idx, end_idx in tokens:
            cursor.send((token, begin_idx, end_idx))
|
||
|
||
# 实例化jiojio分词器
|
||
# Build the jiojio tokenizer and keep it on self.threads; it is handed to
# create_tokenizer_module() further down.
self.threads.jiojio_tokenizer = self.JiojioTokenizer()
|
||
|
||
# Factory for the tokenizer module object
def create_tokenizer_module(tokenizer):
    """
    Wrap ``tokenizer`` in a module object exposing a ``tokenize`` method.

    :param tokenizer: object providing ``callback(text, flags, cursor)``
    :return: a ``JiojioTokenizerModule`` instance that forwards to ``tokenizer``
    """

    class JiojioTokenizerModule:
        """Module object that delegates tokenization to the jiojio tokenizer."""

        # noinspection PyShadowingNames
        @staticmethod
        def tokenize(text: str, flags: int, cursor) -> None:
            """Forward the call unchanged to ``tokenizer.callback``."""
            tokenizer.callback(text, flags, cursor)

    module = JiojioTokenizerModule()
    return module
|
||
|
||
# Register the module built around the jiojio tokenizer on the connection
# under the name "jiojio_fts5_module".
tokenizer_module = create_tokenizer_module(self.threads.jiojio_tokenizer)
self.threads.connection.create_module("jiojio_fts5_module", tokenizer_module)

# Create the virtual table that uses the registered module, unless it already exists.
self.threads.connection.execute(
    """
    CREATE VIRTUAL TABLE IF NOT EXISTS jiojio_tokenizer USING fts5tokenizer(jiojio_fts5_module)
    """
)
|
||
|
||
|
||
{
|
||
"code": 0,
|
||
"status": 200,
|
||
"message": "success",
|
||
"serialNo": "3a08935648632621760512",
|
||
"data": [
|
||
{"desc": "金额", "value": "175.22"},
|
||
{
|
||
"desc": "项目名称",
|
||
"value": "*化学药品制剂*[海露]玻璃酸钠滴眼液0.1%*10ml支/盒",
|
||
},
|
||
{"desc": "数量", "value": "2"},
|
||
{"desc": "规格型号", "value": ""},
|
||
{"desc": "税额", "value": "22.78"},
|
||
{"desc": "税率", "value": "13%"},
|
||
{"desc": "单位", "value": ""},
|
||
{"desc": "单价", "value": "87.61"},
|
||
{"desc": "金额1", "value": "-69.42"},
|
||
{
|
||
"desc": "项目名称1",
|
||
"value": "*化学药品制剂*[海露]玻璃酸钠滴眼液0.1%*10ml/支/盒",
|
||
},
|
||
{"desc": "数量1", "value": ""},
|
||
{"desc": "规格型号1", "value": ""},
|
||
{"desc": "税额1", "value": "-9.02"},
|
||
{"desc": "税率1", "value": "13%"},
|
||
{"desc": "单位1", "value": ""},
|
||
{"desc": "单价1", "value": ""},
|
||
{"desc": "发票名称", "value": "电子发票(普通发票)"},
|
||
{"desc": "全电票标签", "value": ""},
|
||
{"desc": "发票号码", "value": "25447200000045325946"},
|
||
{"desc": "开票日期", "value": "2025年01月20日"},
|
||
{"desc": "购买方名称", "value": "唐敏华"},
|
||
{"desc": "购买方识别号", "value": ""},
|
||
{"desc": "销售方名称", "value": "广州美团大药房有限公司"},
|
||
{"desc": "销售方识别号", "value": "91440100MAC1CAJH27"},
|
||
{"desc": "合计金额", "value": "¥105.80"},
|
||
{"desc": "合计税额", "value": "¥13.76"},
|
||
{"desc": "金额小计", "value": ""},
|
||
{"desc": "税额小计", "value": ""},
|
||
{"desc": "价税合计(大写)", "value": "壹佰壹拾玖圆伍角陆分"},
|
||
{"desc": "小写金额", "value": "¥119.56"},
|
||
{"desc": "备注", "value": ""},
|
||
{"desc": "开票人", "value": "张景景"},
|
||
{"desc": "发票类型", "value": "电子发票(普通发票)"},
|
||
{"desc": "监制章存在性判断", "value": "True"},
|
||
{"desc": "总页数", "value": ""},
|
||
{"desc": "当前页数", "value": ""},
|
||
],
|
||
}
|