Python/票据理赔自动化/1.py

56 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv
import chardet
def _detect_encoding(path: str, sample_size: int = 10240) -> str:
    """Guess the text encoding of the file at *path* using chardet.

    :param path: file whose encoding should be detected
    :param sample_size: number of leading bytes handed to chardet
        (default 10 KiB)
    :return: the encoding name reported by chardet, or "gbk" when chardet
        reports no encoding at all
    """
    print(f"🔍 正在检测 {path} 的编码...")
    with open(path, "rb") as f:
        # Only a leading sample is inspected, not the whole file.
        raw_data = f.read(sample_size)
    result = chardet.detect(raw_data)
    encoding = result["encoding"]
    confidence = result["confidence"]
    print(f"✅ 检测到编码:{encoding}(置信度:{confidence:.2f}")
    if not encoding:
        # chardet could not decide; fall back to GBK, the original author's
        # choice of default for Chinese-language CSV files.
        encoding = "gbk"
        print(f"⚠️ 编码检测失败,兜底使用 {encoding}")
    return encoding


def convert_csv_to_utf8(input_csv: str, output_csv: str, encoding: str = None) -> bool:
    """Re-encode a CSV file as UTF-8 (with BOM).

    The input is parsed with ``csv.reader`` using the default dialect
    (comma-delimited) and rewritten with ``csv.writer``.

    :param input_csv: path of the CSV file to read
    :param output_csv: path of the UTF-8 CSV file to write; may be the same
        path as ``input_csv`` because the input is fully read and closed
        before the output is opened
    :param encoding: encoding of the input file (e.g. "gbk"); when None or
        empty, the encoding is detected with chardet
    :return: True when the file was converted; False when any step failed
        (detection, reading or writing), in which case the reason is printed
    """
    try:
        # Step 1: determine the source encoding unless the caller supplied it.
        # Detection runs inside the try so that a missing/unreadable input is
        # reported the same way (False) whether or not an encoding was given.
        if not encoding:
            encoding = _detect_encoding(input_csv)

        # Step 2: read with the source encoding.
        # errors="replace" substitutes U+FFFD for undecodable bytes instead of
        # raising. newline="" hands raw line endings to the csv module, so a
        # line break embedded in a quoted field is preserved exactly rather
        # than being rewritten by universal-newline translation.
        with open(
            input_csv, "r", encoding=encoding, errors="replace", newline=""
        ) as infile:
            # Default dialect is comma-delimited; pass delimiter="\t" to
            # csv.reader here for tab-separated input.
            rows = list(csv.reader(infile))

        # Step 3: write as UTF-8. "utf-8-sig" emits a BOM (the original author
        # notes this is for Excel); newline="" prevents the text layer from
        # adding extra blank lines on top of the csv writer's own terminators.
        with open(output_csv, "w", encoding="utf-8-sig", newline="") as outfile:
            csv.writer(outfile).writerows(rows)

        print(f"✅ 转换完成UTF-8编码文件已保存至{output_csv}")
        return True
    except Exception as e:
        # Deliberate catch-all: the function's contract is to report failure
        # through its return value rather than by raising.
        print(f"❌ 转换失败:{e}")
        return False
# ========== Example usage ==========
if __name__ == "__main__":
    # Replace with your actual file path. The same path is used as both
    # input and output, so the file is converted in place and the original
    # bytes are overwritten.
    csv_path = "转换后_UTF8.csv"
    # No encoding is passed, so the source encoding is auto-detected.
    convert_csv_to_utf8(csv_path, csv_path)