# -*- coding: utf-8 -*-
"""
Pukang Health: consolidate customer-service conversation records.

Workflow of this script:

1. Read every ``*.csv`` export found in the ``客服会话记录`` directory.
2. Clean each session transcript and merge all sessions of one customer
   (newest first) into a single text, sessions separated by a blank line.
3. Parse the merged text back into structured conversations and messages.
4. Render the conversations with ``template.html`` and POST the HTML to the
   knowledge-base endpoint, one knowledge entry per customer.
5. Write the per-customer records (including the returned knowledge id) to
   ``results.json``.
"""

# Load modules
import json
import re
from pathlib import Path

import pandas
from jinja2 import Environment, FileSystemLoader

from utils.client import Authenticator, HTTPClient

# Columns read from every CSV export
COLUMNS = ["用户名称", "会话开始时间", "详情"]

# Navigation greeting emitted at the start of a session
GREETING = "您好,欢迎咨询小暖,猜您可能咨询以下问题"

# Send time searched (unanchored) in the text following the first space of a
# line. The named group is required by ``match.group("started_at")`` below.
SENT_AT_PATTERN = re.compile(
    r"(?P<started_at>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
)

# A following line terminates a message when the text after its first space
# is exactly a send time.
NEXT_HEADER_PATTERN = re.compile(r"(^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})$")

# Substrings that mark a sender as staff (bot or human agent)
STAFF_MARKERS = ("机器人", "kefu")


def _clean_detail(detail: str) -> str:
    """Remove the navigation greeting, the closing marker and blank lines."""
    if GREETING in detail:
        # Keep only what follows the first blank line after the greeting.
        detail = detail.split("\n\n", maxsplit=1)[-1]
    return detail.replace("对话结束 >>\n", "").replace("\n\n", "")


def _collect_content(following_lines: list) -> str:
    """Join the non-empty lines that precede the next send-time header line."""
    parts = []
    for text in following_lines:
        if NEXT_HEADER_PATTERN.search(text.split(" ", maxsplit=1)[-1]):
            break
        if text:
            parts.append(text)
    # Lines are concatenated without a separator.
    return "".join(parts)


def _parse_conversation(text: str, customer_name: str) -> dict:
    """Parse one session transcript into ``{"messages": [...], "started_at": str}``.

    A line whose text after the first space contains a send time opens a
    message; the sender is the text before that first space. Senders
    containing ``customer_name`` are customers, senders containing one of
    ``STAFF_MARKERS`` are staff, any other sender is ignored. The first send
    time found becomes the conversation start time.
    """
    conversation = {"messages": [], "started_at": ""}
    lines = text.split("\n")
    for index, line in enumerate(lines):
        match = SENT_AT_PATTERN.search(line.split(" ", maxsplit=1)[-1])
        if not match:
            continue

        # Record the conversation start time once
        if not conversation["started_at"]:
            conversation["started_at"] = match.group("started_at")

        sender = line.split(" ", maxsplit=1)[0]
        if customer_name in sender:
            # Customer message
            message = {"sender": "客户", "sender_type": "customer", "content": ""}
        elif any(marker in sender for marker in STAFF_MARKERS):
            # Staff message
            message = {"sender": sender, "sender_type": "staff", "content": ""}
        else:
            continue

        # The content runs from the next line up to the next header line
        message["content"] = _collect_content(lines[index + 1 :])
        conversation["messages"].append(message)
    return conversation


# Directory holding the exported CSV files
directory_path = Path("客服会话记录")

# Read all local CSV files, then concatenate once (avoids repeated copying)
frames = [
    pandas.read_csv(
        filepath_or_buffer=file_path,
        usecols=COLUMNS,
        dtype=str,
        encoding="gbk",
        encoding_errors="ignore",
    )
    for file_path in directory_path.glob("*.csv")
]
dataframe = (
    pandas.concat(frames, ignore_index=True)
    if frames
    else pandas.DataFrame(columns=COLUMNS)
)

# Missing cells stay NaN even with dtype=str and cannot be cleaned or grouped
dataframe = dataframe.dropna(subset=["用户名称", "详情"])
if dataframe.empty:
    raise SystemExit(f"No usable conversation records found in {directory_path}")

dataframe = (
    dataframe.assign(
        会话开始时间=pandas.to_datetime(dataframe["会话开始时间"], errors="coerce"),
        # Remove the navigation greeting, the closing marker and blank lines
        详情=dataframe["详情"].apply(_clean_detail),
    )
    # Sort by session start time, newest first
    .sort_values(by="会话开始时间", ascending=False)
    # Group by user name and join the details with a blank line
    .groupby(by="用户名称", as_index=False)
    .agg(详情=("详情", "\n\n".join))
)

# Long text is best saved as JSON

template = Environment(loader=FileSystemLoader(".")).get_template("template.html")

# Initialise the authenticator (a distinct name keeps the class usable)
authenticator = Authenticator()
# Initialise the HTTP client
http_client = HTTPClient()

rows = []
for _, row in dataframe.iterrows():
    customer = row["用户名称"]

    # Split on blank lines; every part is one conversation
    conversations = [
        _parse_conversation(session_text, customer)
        for session_text in row["详情"].split("\n\n")
    ]

    # Obtain the access token and assemble the request headers
    token = authenticator.get_token(servicer="hlyj")
    response = http_client.post(
        url="https://kms.7x24cc.com/api/v1/knowledges/",
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json; charset=utf-8",
        },
        json={
            "knowledge_base_id": "143bfe7f-fd79-49f2-8359-c123aba944c2",
            "title": customer if "_" in customer else f"客户{customer}",
            # Render the HTML from the template
            "content": template.render({"conversations": conversations}),
        },
    )
    print(response)

    # Store the knowledge id from the response with the row. A plain dict is
    # used because a pandas Series is not JSON serializable.
    record = row.to_dict()
    record["新增结果"] = response.get("knowledge_id", "")
    rows.append(record)

with open("results.json", "w", encoding="utf-8") as file:
    json.dump(rows, file, ensure_ascii=False)