# Listing metadata from the code viewer: 56 lines, 2.3 KiB, Python
import csv
|
||
|
||
import chardet
|
||
|
||
|
||
def _detect_csv_encoding(input_csv: str, sample_size: int = 10240) -> str:
    """Guess the text encoding of *input_csv* from its leading bytes using chardet."""
    print(f"🔍 正在检测 {input_csv} 的编码...")
    with open(input_csv, "rb") as f:
        # A prefix of the file is enough for chardet; avoids loading huge files.
        raw_data = f.read(sample_size)
    result = chardet.detect(raw_data)
    encoding = result["encoding"]
    confidence = result["confidence"]
    print(f"✅ 检测到编码:{encoding}(置信度:{confidence:.2f})")

    # chardet returns None when it cannot decide (e.g. an empty file);
    # fall back to GBK, the most common legacy encoding for Chinese CSVs.
    if not encoding:
        encoding = "gbk"
        print(f"⚠️ 编码检测失败,兜底使用 {encoding}")
    elif encoding.lower() == "gb2312":
        # chardet frequently labels GBK/GB18030 files as GB2312. Decoding with
        # the strict GB2312 codec plus errors="replace" would silently turn
        # every character outside GB2312 into U+FFFD, so use the superset.
        encoding = "gb18030"
        print(f"ℹ️ 检测结果为 GB2312,改用其超集 {encoding} 解码")
    return encoding


def convert_csv_to_utf8(input_csv: str, output_csv: str, encoding: "str | None" = None) -> bool:
    """
    Convert a CSV file from a non-UTF-8 encoding to UTF-8 (with BOM).

    All rows are read into memory before the output file is opened, so
    ``input_csv`` and ``output_csv`` may be the same path (in-place conversion).
    Bytes that cannot be decoded are replaced with U+FFFD rather than aborting.

    :param input_csv: Path of the CSV file to read.
    :param output_csv: Path of the UTF-8 CSV file to write.
    :param encoding: Encoding of the input file (e.g. ``"gbk"``). When falsy,
        the encoding is auto-detected with chardet.
    :return: ``True`` on success; ``False`` if detection, reading or writing
        failed (the error is printed, not raised).
    """
    try:
        # Step 1: detect the source encoding unless the caller supplied one.
        # Done inside the try so a missing/unreadable file yields False in
        # both the auto-detect and the explicit-encoding case.
        if not encoding:
            encoding = _detect_csv_encoding(input_csv)

        # Step 2: read with the source encoding. newline="" is required by the
        # csv module so line breaks embedded in quoted fields are preserved
        # instead of being rewritten by universal-newline translation.
        with open(input_csv, "r", encoding=encoding, errors="replace", newline="") as infile:
            # Default comma delimiter; pass delimiter='\t' to csv.reader for TSV.
            rows = list(csv.reader(infile))

        # Step 3: write as UTF-8 with BOM ("utf-8-sig") so Excel recognises
        # the encoding; newline="" prevents blank lines between rows on Windows.
        with open(output_csv, "w", encoding="utf-8-sig", newline="") as outfile:
            csv.writer(outfile).writerows(rows)

        print(f"✅ 转换完成!UTF-8编码文件已保存至:{output_csv}")
        return True

    except Exception as e:
        # Deliberate catch-all: the function's contract is to report failure
        # through its boolean return value instead of raising.
        print(f"❌ 转换失败:{str(e)}")
        return False
|
||
|
||
|
||
# ========== Example usage ==========
if __name__ == "__main__":
    # Source and destination paths (replace with your real paths).
    # NOTE: both names point at the same file, so the conversion is in place.
    source_file = "转换后_UTF8.csv"
    target_file = "转换后_UTF8.csv"

    # No encoding is passed, so the source encoding is auto-detected.
    convert_csv_to_utf8(source_file, target_file)
|