添加目录和文件

This commit is contained in:
marslbr 2025-10-29 12:54:32 +08:00
parent 286d818f34
commit eb1e20813b
7 changed files with 3428 additions and 0 deletions

8
.idea/Python.iml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.11 virtualenv at ~/private" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Python.iml" filepath="$PROJECT_DIR$/.idea/Python.iml" />
</modules>
</component>
</project>

471
utils/ocr.py Normal file
View File

@ -0,0 +1,471 @@
# -*- coding: utf-8 -*-
# 导入模块
import warnings
# 过滤使用提醒
warnings.filterwarnings(
"ignore",
category=UserWarning,
)
from fuzzywuzzy import fuzz
import re
import numpy
import cv2
from decimal import Decimal, ROUND_HALF_UP
from paddleocr import PaddleOCR
"""
Wrapper around Baidu PaddlePaddle's PaddleOCR.
"""
def fuzzy_match(
    target: str, components: list, specify_key: str, return_key: str
) -> str:
    """
    Fuzzy-match ``target`` against one field of every component and return
    another field of the best-scoring component.

    :param target: text to compare against.
    :param components: list of (possibly nested) dicts to search.
    :param specify_key: dotted key path of the value compared with ``target``.
    :param return_key: dotted key path of the value to return.
    :return: the ``return_key`` value of the component whose ``specify_key``
        value scores highest with ``fuzz.WRatio``; ``None`` when no component
        provides both values. On equal scores the earliest component wins.
    """

    def _lookup(node, path):
        """Walk ``path`` through nested dicts; ``None`` if any step is missing."""
        for name in path:
            if not isinstance(node, dict) or name not in node:
                return None
            node = node[name]
        return node

    # Split the dotted paths once instead of once per component
    specify_path = specify_key.split(".")
    return_path = return_key.split(".")

    best_value = None
    best_score = None
    for candidate in components:
        # Value to compare with the target
        compared = _lookup(candidate, specify_path)
        if compared is None:
            continue
        # Value to hand back if this candidate wins
        returned = _lookup(candidate, return_path)
        if returned is None:
            continue
        score = fuzz.WRatio(target, compared)  # weighted Levenshtein-based ratio
        # Strictly greater keeps the first candidate on ties
        if best_score is None or score > best_score:
            best_value, best_score = returned, score
    return best_value
class PPOCR:
    """OCR client built on PaddleOCR for resident ID cards and VAT invoices."""

    def __init__(self):
        # Initialise the PaddleOCR engine
        self.ocr_engine = PaddleOCR(
            ocr_version="PP-OCRv4",
            use_doc_orientation_classify=True,
            use_doc_unwarping=True,
            use_textline_orientation=True,
        )

    @staticmethod
    def _texts_sort(texts):
        """
        Order recognised texts top-to-bottom, then left-to-right within a row.

        :param texts: prediction results; only the first element is read and it
            must provide ``rec_texts`` and ``rec_polys`` (one polygon of
            ``(x, y)`` points per text). The result is assumed to describe a
            single image.
        :return: list of text strings in reading order (empty when nothing
            was recognised).
        """
        if not texts:
            return []
        boxes = []
        for content, polygon in zip(texts[0]["rec_texts"], texts[0]["rec_polys"]):
            polygon = numpy.asarray(polygon)
            left = numpy.min(polygon[:, 0])
            top = numpy.min(polygon[:, 1])
            height = numpy.max(polygon[:, 1]) - top
            boxes.append([left, top, height, content])
        if not boxes:
            return []
        # Sort by the top edge so rows are visited from top to bottom
        boxes.sort(key=lambda box: box[1])
        ordered = []
        # Seed the first row with the topmost box; every later box is then
        # placed, so none is skipped and a single box still yields a row
        row = [boxes[0]]
        for box in boxes[1:]:
            # Same row when the box starts less than half the row's mean box
            # height below the last box of the row
            if box[1] - row[-1][1] < numpy.mean([member[2] for member in row]) * 0.5:
                row.append(box)
            else:
                # Row finished: emit it left-to-right and start a new one
                ordered.extend(sorted(row, key=lambda member: member[0]))
                row = [box]
        # Emit the last row
        ordered.extend(sorted(row, key=lambda member: member[0]))
        return [box[3] for box in ordered]

    def _recognize(self, image_path, use_textline_orientation):
        """
        Read an image, run OCR on it and return the texts in reading order.

        :raises ValueError: when ``cv2.imread`` cannot load ``image_path``.
        """
        # cv2.imread yields a numpy.ndarray, or None when the file cannot be read
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Unable to read image: {image_path}")
        prediction = self.ocr_engine.predict(
            image,
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=use_textline_orientation,
            text_rec_score_thresh=0.5,
        )
        return self._texts_sort(prediction)

    @staticmethod
    def _parse_identity_card(texts):
        """Fill the resident ID card template from texts in reading order."""
        result = {
            "姓名": "",
            "性别": "",
            "民族": "",
            "出生": "",
            "住址": "",
            "公民身份号码": "",
            "有效期限": "",
            "签发机关": "",
        }
        # Texts containing any of these are never treated as address lines
        non_address_keywords = (
            "姓名",
            "性别",
            "民族",
            "出生",
            "公民身份号码",
            "中华人民共和国",
            "居民身份证",
            "签发机关",
            "有效期限",
        )
        for text in texts:  # assumes a single card (front and back)
            # True when the part before the first "-" is a yyyy.mm.dd date,
            # i.e. the text looks like a validity period without its label
            starts_with_date = re.fullmatch(
                r"^(\d{4}[.]\d{2}[.]\d{2})$", text.split("-")[0].strip()
            )
            # Name
            if not result["姓名"] and "姓名" in text:
                result["姓名"] = text.replace("姓名", "").strip()
            # Gender and ethnicity are often returned in the same text
            elif "性别" in text or "民族" in text:
                if not result["性别"] and "性别" in text:
                    result["性别"] = (
                        text.split("性别")[-1].strip().split("民族")[0].strip()
                    )
                if not result["民族"] and "民族" in text:
                    result["民族"] = text.split("民族")[-1].strip()
            # Date of birth
            elif not result["出生"] and "出生" in text:
                result["出生"] = text.replace("出生", "").strip()
            # Address: the labelled line, plus unlabelled continuation lines
            elif "住址" in text or (
                not any(keyword in text for keyword in non_address_keywords)
                and not starts_with_date
            ):
                if not result["住址"] and "住址" in text:
                    result["住址"] = text.replace("住址", "").strip()
                # Continuation lines are only appended once the address began
                if result["住址"] and "住址" not in text:
                    result["住址"] += text.strip()
            # Citizen ID number
            elif not result["公民身份号码"] and "公民身份号码" in text:
                result["公民身份号码"] = text.replace("公民身份号码", "").strip()
            # Validity period (labelled, or a bare date range)
            elif not result["有效期限"] and ("有效期限" in text or starts_with_date):
                result["有效期限"] = text.replace("有效期限", "").strip()
            # Issuing authority
            elif not result["签发机关"] and "签发机关" in text:
                result["签发机关"] = text.replace("签发机关", "").strip()
        return result

    @staticmethod
    def _parse_invoice(texts):
        """Fill the VAT invoice template from texts in reading order."""
        number = r"^-?\d+(\.\d+)?$"
        unsigned_number = r"^\d+(\.\d+)?$"

        def strip_label(text, label):
            """Remove the label and any full-width/ASCII colon around the value."""
            return (
                text.replace(label, "")
                .replace("\uff1a", "")  # full-width colon
                .replace(":", "")
                .strip()
            )

        def amount_of(text):
            """Return what follows the last yen sign (full-width sign normalised)."""
            return text.replace("\uffe5", "¥").split("¥")[-1].strip()

        result = {
            "票据类型": "",
            "票据号码": "",
            "票据代码": "",
            "开票日期": "",
            "票据金额": "",
            "校验码": "",
            "收款方": "",
            "付款方": "",
            "项目": [],
        }
        total = len(texts)
        for i, text in enumerate(texts):
            if not result["票据类型"] and "电子发票" in text:
                result["票据类型"] = "数电发票"
            elif not result["票据号码"] and "发票号码" in text:
                result["票据号码"] = strip_label(text, "发票号码")
            elif not result["开票日期"] and "开票日期" in text:
                result["开票日期"] = strip_label(text, "开票日期")
            elif not result["票据金额"] and "小写" in text:
                # The amount is either in the same text or in the next one
                for candidate in texts[i : i + 2]:
                    amount = amount_of(candidate)
                    if re.match(number, amount):
                        result["票据金额"] = amount
                        break
            elif "名称" in text and "项目名称" not in text:
                # The first party name found is the payer, later ones the payee
                if not result["付款方"]:
                    result["付款方"] = strip_label(text, "名称")
                else:
                    result["收款方"] = strip_label(text, "名称")

        # Line items
        items = []
        for i, text in enumerate(texts):
            # A leading asterisk marks the start of an item (its name)
            if not text.startswith("*"):
                continue
            item = {
                "名称": text.strip(),
                "规格": "",
                "单位": "",
                "数量": "",
                "单价": "",
                "金额": "",
                "税率": "",
                "税额": "",
            }
            # The text after the name is the specification unless it is numeric
            if i + 1 < total and not re.match(
                number, texts[i + 1].replace("%", "").strip()
            ):
                item["规格"] = texts[i + 1].strip()
            # The second text after the name is the unit unless it is numeric
            if i + 2 < total and not re.match(
                number, texts[i + 2].replace("%", "").strip()
            ):
                item["单位"] = texts[i + 2].strip()
            for j in range(i + 1, total):
                text_ = texts[j]
                # The next item or a subtotal/total line ends this item.
                # NOTE(review): the substring-of-label test is kept from the
                # original; it also matches a lone "小"/"计"/"合" box (and an
                # empty text) - confirm this is intended.
                if text_.startswith("*") or text_ in "小计" or text_ in "合计":
                    break
                # A percentage locates the tax rate; neighbours hold the rest
                if "%" in text_ and re.match(
                    unsigned_number, text_.replace("%", "").strip()
                ):
                    item["税率"] = text_.replace("%", "").strip() + "%"
                    # Tax amount follows the rate
                    if j + 1 < total and re.match(number, texts[j + 1].strip()):
                        item["税额"] = texts[j + 1].strip()
                    # Amount precedes the rate; look-behind never leaves the item
                    if j - 1 > i and re.match(number, texts[j - 1].strip()):
                        item["金额"] = texts[j - 1].strip()
                    # Only non-negative amounts carry a unit price and quantity
                    if "-" not in item["金额"]:
                        if j - 2 > i and re.match(
                            unsigned_number, texts[j - 2].strip()
                        ):
                            item["单价"] = texts[j - 2].strip()
                        if j - 3 > i and texts[j - 3].strip().isdigit():
                            item["数量"] = texts[j - 3].strip()
                # Non-numeric text past the unit slot continues a wrapped name
                elif j > i + 2 and not re.match(
                    number, text_.replace("%", "").strip()
                ):
                    item["名称"] += text_.strip()
            # Keep the item whether it ended on a terminator or at end of input
            items.append(item)

        # Derive a missing quantity from amount / unit price
        for item in items:
            if (
                not item["数量"]
                and item["金额"]
                and "-" not in item["金额"]
                and item["单价"]
            ):
                unit_price = Decimal(item["单价"])
                if unit_price == 0:
                    # Cannot divide by a zero unit price; leave quantity empty
                    continue
                quantity = int(
                    (Decimal(item["金额"]) / unit_price).quantize(
                        Decimal("0"), rounding=ROUND_HALF_UP
                    )
                )
                item["数量"] = "" if quantity == 0 else str(quantity)
        result["项目"] = items
        return result

    def identity_card_recognition(self, image_path: str) -> dict:
        """
        Recognise a resident ID card image.

        :param image_path: path of the image file.
        :return: dict with the keys 姓名, 性别, 民族, 出生, 住址, 公民身份号码,
            有效期限 and 签发机关; fields that were not found stay empty strings.
        :raises ValueError: when the image cannot be read.
        """
        texts = self._recognize(image_path, use_textline_orientation=True)
        return self._parse_identity_card(texts)

    def invoice_recognition(self, image_path: str) -> dict:
        """
        Recognise a VAT invoice image.

        :param image_path: path of the image file.
        :return: dict with the invoice header fields (empty strings when not
            found) and 项目, a list of line-item dicts.
        :raises ValueError: when the image cannot be read.
        """
        texts = self._recognize(image_path, use_textline_orientation=False)
        return self._parse_invoice(texts)

61
utils/operate.py Normal file
View File

@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
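"""
Script description: common helper functions built on top of MySQL, MongoDB, Requests, the Feishu API and similar services.
Notes:
    TODO: consider refactoring so that the scripts under utils stay as independent of one another as possible.
"""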
# 导入模块
import json
import pandas
import warnings
import numpy
from pydantic import BaseModel, ValidationError, AfterValidator, Field, HttpUrl
from typing import Optional, Union, Unpack, Literal, Dict, TypedDict, Annotated
from requests_toolbelt import MultipartEncoder
import cv2
from requests import Session, Response
from requests.adapters import HTTPAdapter
from urllib.parse import (
urlparse,
urlsplit,
urlunsplit,
parse_qs,
quote,
quote_plus,
unquote,
urlencode,
)
from urllib.request import Request as request, urlopen
# Retry lives in urllib3 (the HTTP library underneath requests); the
# standard-library urllib package has no `util` submodule, so the previous
# `urllib.util.retry` path could not be imported.
from urllib3.util.retry import Retry
from urllib.error import HTTPError
from pymongo import MongoClient
import os
import threading
import time
from functools import wraps

1489
utils/pandas_extension.py Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff