def general_text_recognize(image) -> str:
    """
    Run general text recognition (OCR) on an image and return the ordered text.

    The image is posted to the Shenzhen Kuaitong ("szkt") general OCR endpoint.
    The returned text blocks are grouped into visual lines from top to bottom,
    each line is ordered from left to right, all whitespace inside each block's
    text is removed, and the blocks are joined with a newline between every
    block (including blocks that share a visual line).

    :param image: mapping describing the image; the keys read here are
        "影像件唯一标识", "影像件格式" and "影像件BASE64编码"
    :return: recognized text, or "" when the response contains no text blocks
    :raises RuntimeError: if the response does not carry status 200 and code 0
    """
    url = "https://ai.inspirvision.cn/s/api/ocr/general"
    image_id = image["影像件唯一标识"]
    # Hoisted out of the f-string so no nested same-type quotes are needed
    # (those only parse on Python 3.12+).
    image_format = image["影像件格式"].lstrip(".")
    image_base64 = image["影像件BASE64编码"]

    # Call the Shenzhen Kuaitong general text recognition endpoint.
    response = http_client.post(
        url=url,
        # The image's unique id doubles as the request id so both sides can
        # correlate the call.
        headers={"X-RequestId-Header": image_id},
        data={
            # Access token for the Shenzhen Kuaitong service.
            "token": authenticator.get_token(servicer="szkt"),
            "imgBase64": f"data:image/{image_format};base64,{image_base64}",
        },
        guid=md5((url + image_id).encode("utf-8")).hexdigest().upper(),
    )
    # TODO: route to manual handling when the response is not successful.
    if not (response.get("status") == 200 and response.get("code") == 0):
        raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")

    # noinspection PyTypeChecker
    blocks = [
        [
            int(block["itemPolygon"]["x"]),  # X coordinate of the block's top-left corner
            int(block["itemPolygon"]["y"]),  # Y coordinate of the block's top-left corner
            int(block["itemPolygon"]["height"]),  # height of the block
            block["value"],  # text content of the block
        ]
        for block in response["data"]
    ]
    # Nothing recognized: return early instead of failing on an unbound `line`.
    if not blocks:
        return ""

    # "Tetris" layout: first order the blocks by Y ascending (top to bottom).
    blocks.sort(key=lambda item: item[1])

    lines = []
    # Seed the first line before the loop so that every remaining block,
    # including the second one, goes through the same-line test below.
    line = [blocks[0]]
    for block in blocks[1:]:
        # A block joins the current line when its Y offset from the line's mean
        # Y is smaller than the line's mean block height; otherwise it starts a
        # new line.
        if (
            block[1] - numpy.array([e[1] for e in line]).mean()
            < numpy.array([e[2] for e in line]).mean()
        ):
            line.append(block)
        else:
            lines.append(line)
            line = [block]
    lines.append(line)

    texts = []
    for line in lines:
        # Within a line, order blocks by X ascending (left to right) and strip
        # every whitespace character from their text.
        texts.extend(
            re.sub(r"\s", "", item[3])
            for item in sorted(line, key=lambda item: item[0])
        )
    return "\n".join(texts)


class JiojioTokenizer:
    """Chinese word tokenizer backed by jiojio."""

    def __init__(self):
        """
        Initialise the jiojio tokenizer.

        :raises RuntimeError: if ``jiojio.init()`` fails
        """
        # noinspection PyBroadException
        try:
            jiojio.init()
        except Exception as exception:
            # `Exception` rather than a bare `except` so KeyboardInterrupt and
            # SystemExit propagate; chain the cause to keep it diagnosable.
            raise RuntimeError("初始化jiojio分词器发生异常") from exception

    # noinspection PyShadowingNames
    @staticmethod
    def callback(text: str, flags: int, cursor) -> None:
        """
        Tokenizer callback: cut ``text`` into words and send each one to ``cursor``.

        :param text: text to tokenize; empty or non-str input is ignored
        :param flags: FTS5 tokenization flags (not used here)
        :param cursor: receiver of ``(token, begin_idx, end_idx)`` tuples via ``send``
        :return: None
        """
        if not text or not isinstance(text, str):
            return

        tokens = []
        begin_idx = 0  # start index of the current word
        for word in jiojio.cut(text):
            # Whitespace-only words are not emitted but still advance the index.
            if word.strip() == "":
                begin_idx += len(word)
                continue
            # Each token is reported together with its start and end index.
            # NOTE(review): indices are counted in Python characters; the SQLite
            # FTS5 C API specifies byte offsets - confirm what the binding
            # behind `cursor` expects.
            end_idx = begin_idx + len(word)
            tokens.append((word, begin_idx, end_idx))
            begin_idx = end_idx

        # Send only after the whole text has been cut successfully.
        for token in tokens:
            cursor.send(token)


# NOTE(review): the statements below reference `self`, so they presumably live
# inside a method of an enclosing class whose header is not visible here -
# restore the original nesting/indentation when merging.

# Instantiate the jiojio tokenizer.
self.threads.jiojio_tokenizer = self.JiojioTokenizer()


# Factory for the tokenizer module object.
def create_tokenizer_module(tokenizer):
    class JiojioTokenizerModule:
        """Module object that forwards tokenization to the given jiojio tokenizer."""

        # noinspection PyShadowingNames
        @staticmethod
        def tokenize(text: str, flags: int, cursor) -> None:
            tokenizer.callback(text, flags, cursor)

    return JiojioTokenizerModule()


self.threads.connection.create_module(
    "jiojio_fts5_module",
    create_tokenizer_module(self.threads.jiojio_tokenizer),
)
self.threads.connection.execute(
    """ CREATE VIRTUAL TABLE IF NOT EXISTS jiojio_tokenizer USING fts5tokenizer(jiojio_fts5_module) """
)