parent
ef3699973a
commit
16d41d8fae
BIN
utils/caches.db
BIN
utils/caches.db
Binary file not shown.
110
票据理赔自动化/main.py
110
票据理赔自动化/main.py
|
|
@ -55,7 +55,7 @@ if __name__ == "__main__":
|
||||||
return ZenEngine({"loader": loader}).get_decision(rule_path.as_posix())
|
return ZenEngine({"loader": loader}).get_decision(rule_path.as_posix())
|
||||||
|
|
||||||
# 影像件识别使能
|
# 影像件识别使能
|
||||||
recognition_enable = rule_engine(Path("rules/影像件识别使能.json"))
|
recognize_enable = rule_engine(Path("rules/影像件识别使能.json"))
|
||||||
|
|
||||||
class MasterData(SQLiteClient):
|
class MasterData(SQLiteClient):
|
||||||
"""主数据"""
|
"""主数据"""
|
||||||
|
|
@ -65,7 +65,7 @@ if __name__ == "__main__":
|
||||||
初始化主数据
|
初始化主数据
|
||||||
"""
|
"""
|
||||||
# 初始化SQLite客户端
|
# 初始化SQLite客户端
|
||||||
super().__init__(database="SQLite.db")
|
super().__init__(database="database.db")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with self:
|
with self:
|
||||||
|
|
@ -465,45 +465,42 @@ if __name__ == "__main__":
|
||||||
if not (response.get("status") == 200 and response.get("code") == 0):
|
if not (response.get("status") == 200 and response.get("code") == 0):
|
||||||
raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")
|
raise RuntimeError("请求深圳快瞳通用文本识别接口发生异常")
|
||||||
|
|
||||||
boxes = []
|
blocks = []
|
||||||
for box in response["data"]:
|
for block in response["data"]:
|
||||||
# noinspection PyTypeChecker
|
# noinspection PyTypeChecker
|
||||||
boxes.append(
|
blocks.append(
|
||||||
[
|
[
|
||||||
numpy.float64(
|
int(block["itemPolygon"]["x"]), # 文本块左上角的X坐标
|
||||||
box["itemPolygon"]["x"]
|
int(block["itemPolygon"]["y"]), # 文本块左上角的Y坐标
|
||||||
), # 文本标注框左上角的X坐标
|
int(block["itemPolygon"]["height"]), # 文本块左上角的高度
|
||||||
numpy.float64(
|
block["value"], # 文本块的文本内容
|
||||||
box["itemPolygon"]["y"]
|
|
||||||
), # 文本标注框左上角的Y坐标
|
|
||||||
numpy.float64(
|
|
||||||
box["itemPolygon"]["height"]
|
|
||||||
), # 文本标注框左上角的高度
|
|
||||||
box["value"], # 文本标注框的文本
|
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
# 按照文本标注框的Y坐标升序(先从上到下)
|
# 使用俄罗斯方块方法整理文本块,先按照文本块的Y坐标升序(从上到下)
|
||||||
boxes.sort(key=lambda x: x[1])
|
blocks.sort(key=lambda x: x[1])
|
||||||
|
|
||||||
rows = []
|
lines = []
|
||||||
for idx, box in enumerate(boxes[1:]):
|
for idx, block in enumerate(blocks[1:]):
|
||||||
if idx == 0:
|
if idx == 0:
|
||||||
row = [boxes[0]]
|
line = [blocks[0]]
|
||||||
continue
|
continue
|
||||||
# 若文本标注框的Y坐标与当前行的最后一个文本标注框的Y坐标差值小于阈值则归为同一行,否则结束当前行(分行)
|
# 若当前文本块的Y坐标和当前文本行的平均Y坐标差值小于阈值则归为同一文本行,否则另起一文本行(分行)
|
||||||
if box[1] - row[-1][1] < row[-1][2] * 0.5:
|
if (
|
||||||
row.append(box)
|
block[1] - numpy.array([e[1] for e in line]).mean()
|
||||||
|
< numpy.array([e[2] for e in line]).mean()
|
||||||
|
):
|
||||||
|
line.append(block)
|
||||||
else:
|
else:
|
||||||
rows.append(row)
|
lines.append(line)
|
||||||
row = [box]
|
line = [block]
|
||||||
rows.append(row)
|
lines.append(line)
|
||||||
|
|
||||||
boxes = []
|
blocks = []
|
||||||
for row in rows:
|
for line in lines:
|
||||||
boxes.extend(
|
blocks.extend(
|
||||||
[re.sub(r"\s", "", x[3]) for x in sorted(row, key=lambda x: x[0])]
|
[re.sub(r"\s", "", x[3]) for x in sorted(line, key=lambda x: x[0])]
|
||||||
) # 按照文本标注框的X坐标升序(再从左到右)并去除所有空字符
|
) # 按照文本块的X坐标升序(从左到右)并去除文本块的文本内容中所有空字符
|
||||||
return "\n".join(boxes) # 整合
|
return "\n".join(blocks)
|
||||||
|
|
||||||
# TODO: 后续添加居民身份证(国徽面)和居民身份证(头像面)合并
|
# TODO: 后续添加居民身份证(国徽面)和居民身份证(头像面)合并
|
||||||
# noinspection PyShadowingNames
|
# noinspection PyShadowingNames
|
||||||
|
|
@ -608,9 +605,8 @@ if __name__ == "__main__":
|
||||||
:param image: 影像件
|
:param image: 影像件
|
||||||
:return: 空
|
:return: 空
|
||||||
"""
|
"""
|
||||||
# 方法1:先使用深圳快瞳通用文本识别再使用硅基流动中大语言模型结构化,可行但是需要请求二次
|
|
||||||
# 方法2:使用硅基流动中支持OCR的大语言模型
|
|
||||||
# 请求硅基流动的大语言模型接口
|
# 请求硅基流动的大语言模型接口
|
||||||
|
# noinspection PyTypeChecker
|
||||||
response = http_client.post(
|
response = http_client.post(
|
||||||
url="https://api.siliconflow.cn/v1/chat/completions",
|
url="https://api.siliconflow.cn/v1/chat/completions",
|
||||||
headers={
|
headers={
|
||||||
|
|
@ -618,18 +614,41 @@ if __name__ == "__main__":
|
||||||
"Content-Type": "application/json; charset=utf-8",
|
"Content-Type": "application/json; charset=utf-8",
|
||||||
},
|
},
|
||||||
json={
|
json={
|
||||||
"model": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", # 通过从DeepSeek-R1-0528模型蒸馏思维链接至Qwen3-8B-Base获得的模型
|
"model": (model := "THUDM/GLM-4.1V-9B-Thinking"),
|
||||||
"messages": [{"role": "user", "content": ""}],
|
"messages": [
|
||||||
"max_tokens": 10240, # 生成文本最大令牌数
|
{
|
||||||
"temperature": 0.2,
|
"role": "user",
|
||||||
"top_p": 0.5,
|
"content": [
|
||||||
"top_k": 20,
|
{
|
||||||
"frequency_penalty": 0.0,
|
"type": "text",
|
||||||
"thinking_budget": 1,
|
"text": (
|
||||||
|
text := "请以JSON字符串的形式输出识别结果"
|
||||||
|
),
|
||||||
},
|
},
|
||||||
guid=md5(prompt.encode("utf-8")).hexdigest().upper(),
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": f"data:image/{image["影像件格式"]};base64,{image["影像件BASE64编码"]}"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"max_tokens": 8192,
|
||||||
|
"temperature": 0.95,
|
||||||
|
"top_p": 0.7,
|
||||||
|
"top_k": 2,
|
||||||
|
"frequency_penalty": 1.1,
|
||||||
|
"thinking_budget": 8192,
|
||||||
|
},
|
||||||
|
guid=md5((model + text + image["影像件唯一标识"]).encode("utf-8"))
|
||||||
|
.hexdigest()
|
||||||
|
.upper(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
exit()
|
||||||
|
|
||||||
recognition = (
|
recognition = (
|
||||||
json.loads(match.group("json"))
|
json.loads(match.group("json"))
|
||||||
if (
|
if (
|
||||||
|
|
@ -648,6 +667,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# 根据保险分公司匹配结构化识别文本方法
|
# 根据保险分公司匹配结构化识别文本方法
|
||||||
match insurer_company:
|
match insurer_company:
|
||||||
|
# 中银保险有限公司
|
||||||
case _ if insurer_company.startswith("中银保险有限公司"):
|
case _ if insurer_company.startswith("中银保险有限公司"):
|
||||||
boc_application_recognize(image)
|
boc_application_recognize(image)
|
||||||
|
|
||||||
|
|
@ -1382,12 +1402,12 @@ if __name__ == "__main__":
|
||||||
)
|
)
|
||||||
|
|
||||||
# 影像件识别使能检查,若影像件不识别则跳过
|
# 影像件识别使能检查,若影像件不识别则跳过
|
||||||
if not recognition_enable.evaluate(
|
if not recognize_enable.evaluate(
|
||||||
{
|
{
|
||||||
"insurer_company": insurer_company,
|
"insurer_company": insurer_company,
|
||||||
"image_type": image["影像件类型"],
|
"image_type": image["影像件类型"],
|
||||||
}
|
}
|
||||||
)["result"]["recognition_enable"]:
|
)["result"]["recognize_enable"]:
|
||||||
return
|
return
|
||||||
|
|
||||||
# 根据影像件类型匹配影像件识别方法
|
# 根据影像件类型匹配影像件识别方法
|
||||||
|
|
|
||||||
|
|
@ -139,7 +139,7 @@
|
||||||
{
|
{
|
||||||
"id": "a2fc744f-930d-43e0-b5cf-824a5928c7f1",
|
"id": "a2fc744f-930d-43e0-b5cf-824a5928c7f1",
|
||||||
"name": "是否识别",
|
"name": "是否识别",
|
||||||
"field": "recognition_enable"
|
"field": "recognize_enable"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"hitPolicy": "first",
|
"hitPolicy": "first",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue