123 lines
4.6 KiB
Python
123 lines
4.6 KiB
Python
def general_text_recognize(image) -> str:
    """
    Recognize the text of an image via the Shenzhen Kuaitong (深圳快瞳) general OCR API.

    The returned text blocks are grouped into lines from top to bottom, each
    line is ordered from left to right, whitespace is removed from every block
    text, and the block texts are joined with newlines.

    :param image: image record; the keys "影像件唯一标识", "影像件格式" and
        "影像件BASE64编码" are read from it
    :return: recognized text, one text block per output line; an empty string
        when the service returns no text blocks
    :raises RuntimeError: if the response does not report status 200 and code 0
    """
    url = "https://ai.inspirvision.cn/s/api/ocr/general"
    image_id = image["影像件唯一标识"]
    image_format = image["影像件格式"].lstrip(".")
    image_base64 = image["影像件BASE64编码"]

    # Call the general text recognition endpoint
    response = http_client.post(
        url=url,
        # The image's unique id doubles as the request id so both sides can trace a call
        headers={"X-RequestId-Header": image_id},
        data={
            "token": authenticator.get_token(servicer="szkt"),  # access token for the service
            "imgBase64": f"data:image/{image_format};base64,{image_base64}",
        },
        guid=md5((url + image_id).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route the image to manual handling when the response is not successful
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")

    # noinspection PyTypeChecker
    blocks = [
        (
            int(block["itemPolygon"]["x"]),  # X coordinate of the block (top-left corner per the original notes)
            int(block["itemPolygon"]["y"]),  # Y coordinate of the block (top-left corner per the original notes)
            int(block["itemPolygon"]["height"]),  # height of the block
            block["value"],  # text content of the block
        )
        for block in (response["data"] or [])
    ]
    # Nothing recognized: there is no line to build
    if not blocks:
        return ""

    # Arrange the blocks "Tetris style": first sort by Y ascending (top to bottom)
    blocks.sort(key=lambda item: item[1])

    lines = []
    line = [blocks[0]]  # the first block always opens the first line
    for block in blocks[1:]:
        # A block joins the current line when its Y distance to the line's mean Y
        # is below the line's mean block height; otherwise it starts a new line.
        mean_y = numpy.mean([item[1] for item in line])
        mean_height = numpy.mean([item[2] for item in line])
        if block[1] - mean_y < mean_height:
            line.append(block)
        else:
            lines.append(line)
            line = [block]
    lines.append(line)

    # Within each line order blocks by X ascending (left to right) and strip
    # every whitespace character from the block text.
    texts = []
    for line in lines:
        texts.extend(
            re.sub(r"\s", "", item[3])
            for item in sorted(line, key=lambda item: item[0])
        )
    return "\n".join(texts)
|
||
|
||
class JiojioTokenizer:
    """Chinese word tokenizer backed by jiojio."""

    def __init__(self):
        """
        Initialize the jiojio tokenizer.

        :raises RuntimeError: if ``jiojio.init()`` fails; the original error is
            attached as the cause
        """
        try:
            jiojio.init()
        except Exception as exception:
            raise RuntimeError("初始化jiojio分词器发生异常") from exception

    # noinspection PyShadowingNames
    @staticmethod
    def callback(text: str, flags: int, cursor) -> None:
        """
        Tokenization callback.

        :param text: text to tokenize; empty or non-string input is ignored
        :param flags: FTS5 tokenization flags (not used by this callback)
        :param cursor: FTS5 cursor; every token is handed to ``cursor.send``
        :return: None
        """
        if not text or not isinstance(text, str):
            return

        tokens = []
        begin_idx = 0  # start index of the current word
        for word in jiojio.cut(text):
            end_idx = begin_idx + len(word)
            # Whitespace-only words are not emitted but still advance the index
            if word.strip():
                # FTS5 expects each token together with its start and end index.
                # NOTE(review): indexes are character counts accumulated from
                # len(word), which assumes jiojio.cut yields consecutive pieces
                # of `text` - confirm the unit the cursor expects.
                tokens.append((word, begin_idx, end_idx))
            begin_idx = end_idx

        # Tokens are sent only after the whole text has been segmented
        for token in tokens:
            cursor.send(token)
|
||
|
||
# Instantiate the jiojio tokenizer and keep it on self.threads for the module registration below
self.threads.jiojio_tokenizer = self.JiojioTokenizer()
|
||
|
||
# Factory that builds the tokenizer module
def create_tokenizer_module(tokenizer):
    """
    Wrap ``tokenizer`` in an object that exposes a ``tokenize`` entry point.

    :param tokenizer: object providing ``callback(text, flags, cursor)``
    :return: a ``JiojioTokenizerModule`` instance whose ``tokenize`` forwards
        its arguments to ``tokenizer.callback``
    """

    class JiojioTokenizerModule:
        """Adapter that exposes the jiojio tokenizer through ``tokenize``."""

        # noinspection PyShadowingNames
        @staticmethod
        def tokenize(text: str, flags: int, cursor) -> None:
            """Forward the tokenization request to the wrapped tokenizer."""
            tokenizer.callback(text, flags, cursor)

    module = JiojioTokenizerModule()
    return module
|
||
|
||
# Register the module built around the stored tokenizer under the name "jiojio_fts5_module".
# NOTE(review): the exact semantics of create_module depend on the connection type, which is not visible here.
self.threads.connection.create_module(
    "jiojio_fts5_module",
    create_tokenizer_module(self.threads.jiojio_tokenizer),
)

# Create the jiojio_tokenizer virtual table from that module unless it already exists
self.threads.connection.execute(
    """
    CREATE VIRTUAL TABLE IF NOT EXISTS jiojio_tokenizer USING fts5tokenizer(jiojio_fts5_module)
    """
)
|