import csv
from typing import List, Optional

try:
    import chardet
except ImportError:  # only needed for auto-detection; explicit encodings work without it
    chardet = None

# Number of bytes read per chunk while sampling the input for encoding detection.
_SAMPLE_SIZE = 10240


def _read_detection_sample(path: str, sample_size: int = _SAMPLE_SIZE) -> bytes:
    """Return a byte sample of *path* that contains non-ASCII data when the file has any.

    Pure-ASCII chunks carry no information about the real encoding, so they are
    skipped until the first chunk holding a non-ASCII byte; that chunk plus the
    following one is returned. A file that is ASCII throughout yields its last
    chunk, and an empty file yields ``b""``.
    """
    with open(path, "rb") as raw:
        sample = raw.read(sample_size)
        while sample and sample.isascii():
            following = raw.read(sample_size)
            if not following:
                return sample
            sample = following
        # Add one more chunk so the detector sees some context after the
        # first non-ASCII bytes.
        return sample + raw.read(sample_size)


def _detect_encoding(path: str) -> Optional[str]:
    """Return the encoding chardet reports for *path*, or None when unavailable."""
    if chardet is None:
        print("⚠️ 未安装 chardet,无法自动检测编码")
        return None
    print(f"🔍 正在检测 {path} 的编码...")
    result = chardet.detect(_read_detection_sample(path))
    encoding = result["encoding"]
    confidence = result["confidence"]
    print(f"✅ 检测到编码:{encoding}(置信度:{confidence:.2f})")
    return encoding


def _read_rows(path: str, encoding: str) -> List[List[str]]:
    """Parse the CSV at *path* into a list of rows, decoding with *encoding*."""
    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields reach the parser untouched; errors="replace" turns
    # undecodable bytes into U+FFFD instead of raising.
    with open(path, "r", encoding=encoding, errors="replace", newline="") as infile:
        # Skip a leading BOM that the chosen codec did not consume, otherwise
        # it ends up in the first cell and the utf-8-sig output gets two BOMs.
        if infile.read(1) != "\ufeff":
            infile.seek(0)
        # Default comma delimiter; pass delimiter="\t" here for tab-separated input.
        return list(csv.reader(infile))


def convert_csv_to_utf8(
    input_csv: str, output_csv: str, encoding: Optional[str] = None
) -> bool:
    """
    Convert a CSV file from a non-UTF-8 encoding to UTF-8 (with BOM).

    The input is read completely before the output is opened, so
    ``input_csv`` and ``output_csv`` may name the same file.

    :param input_csv: path of the CSV file to read
    :param output_csv: path of the UTF-8 CSV file to write
    :param encoding: encoding of the input file (e.g. "gbk"); when None or
        empty it is auto-detected with chardet, falling back to "gbk" if
        detection yields nothing
    :return: True on success; False if detection, reading or writing raised
        (the error message is printed)
    """
    try:
        # Step 1: determine the source encoding unless the caller supplied one.
        if not encoding:
            encoding = _detect_encoding(input_csv)
        # No usable detection result: fall back to gbk, a common encoding
        # for Chinese-language files.
        if not encoding:
            encoding = "gbk"
            print(f"⚠️ 编码检测失败,兜底使用 {encoding}")

        # Step 2: load every row first, then rewrite as UTF-8.
        rows = _read_rows(input_csv, encoding)

        # newline="" stops the csv writer's "\r\n" from being translated again;
        # utf-8-sig writes a BOM so Excel recognises the file as UTF-8.
        with open(output_csv, "w", encoding="utf-8-sig", newline="") as outfile:
            csv.writer(outfile).writerows(rows)

        print(f"✅ 转换完成!UTF-8编码文件已保存至:{output_csv}")
        return True
    except Exception as e:
        print(f"❌ 转换失败:{str(e)}")
        return False


# ========== Example usage ==========
if __name__ == "__main__":
    # Input/output file paths (replace with your actual paths).
    # NOTE: both paths are identical here, so the file is converted in place.
    input_path = "转换后_UTF8.csv"
    output_path = "转换后_UTF8.csv"

    # Auto-detect the encoding and convert.
    convert_csv_to_utf8(input_path, output_path)