173 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			173 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
	
| # -*- coding: utf-8 -*-
 | |
| 
 | |
| """
 | |
| 普康健康_客服会话记录整合
 | |
| """
 | |
| 
 | |
| # 加载模块
 | |
| 
 | |
| import json
 | |
| import re
 | |
| from pathlib import Path
 | |
| 
 | |
| import pandas
 | |
| from jinja2 import Environment, FileSystemLoader
 | |
| 
 | |
| from utils.client import Authenticator, HTTPClient
 | |
| 
 | |
# Directory that holds the exported customer-service conversation CSV files
directory_path = Path("客服会话记录")

# Only these columns are read from each CSV
columns = ["用户名称", "会话开始时间", "详情"]

# Read every CSV first and concatenate once afterwards: concatenating inside
# the loop re-copies the accumulated frame for each file and starts by
# concatenating an empty frame.
frames = [
    pandas.read_csv(
        filepath_or_buffer=file_path,
        usecols=columns,
        dtype=str,
        encoding="gbk",
        # Undecodable bytes are dropped silently rather than raising
        encoding_errors="ignore",
    )
    for file_path in directory_path.glob("*.csv")
]

# Initialise the data body. When no CSV file is found, fall back to an empty
# frame that still carries the expected columns, so later column lookups do
# not raise KeyError.
dataframe = (
    pandas.concat(frames, ignore_index=True)
    if frames
    else pandas.DataFrame(columns=columns, dtype=str)
)
 | |
| 
 | |
def _clean_detail(detail):
    """Strip the greeting menu, the end-of-conversation marker and blank lines.

    When the transcript contains the robot's greeting sentence, everything up
    to and including the first double line break is discarded. The
    "对话结束 >>" marker line and all remaining double line breaks are then
    removed, so that a double line break can later be used as the separator
    between conversations.
    """
    if "您好,欢迎咨询小暖,猜您可能咨询以下问题" in detail:
        detail = detail.split("\n\n", maxsplit=1)[-1]
    return detail.replace("对话结束 >>\n", "").replace("\n\n", "")


dataframe = (
    # Rows without a transcript are NaN (the CSVs are read with dtype=str);
    # drop them, otherwise the substring test in _clean_detail raises
    # TypeError on a float.
    dataframe.dropna(subset=["详情"])
    .assign(
        会话开始时间=lambda frame: pandas.to_datetime(
            frame["会话开始时间"], errors="coerce"
        ),
        详情=lambda frame: frame["详情"].apply(_clean_detail),
    )
    # Sort by conversation start time, newest first
    .sort_values(by="会话开始时间", ascending=False)
    # Group by user name and join that user's transcripts with a double line break
    .groupby(by="用户名称", as_index=False)
    .agg(详情=("详情", "\n\n".join))
)
 | |
| 
 | |
# Long text is recommended to be saved as JSON

# Load the HTML template from the current working directory
template_loader = FileSystemLoader(".")
template_environment = Environment(loader=template_loader)
template = template_environment.get_template("template.html")

# Initialise the authenticator
# NOTE(review): this rebinds the name `Authenticator` from the imported class
# to an instance; the name is used as an instance further down, so a rename
# has to be done in both places at once.
Authenticator = Authenticator()

# Initialise the HTTP client
http_client = HTTPClient()
 | |
| 
 | |
# Timestamp expected after the sender name on a message header line
_HEADER_TIME_PATTERN = re.compile(
    r"(?P<started_at>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
)
# Stricter form used to detect where the next message starts
_BOUNDARY_TIME_PATTERN = re.compile(r"(^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})$")
# Substrings that mark a sender as staff (robot or human agent)
_STAFF_NAME_MARKERS = ("机器人", "kefu")


def _collect_content(following_lines):
    """Concatenate the lines that follow a header, up to the next header line.

    Lines are joined without a separator; empty lines contribute nothing.
    """
    parts = []
    for following_line in following_lines:
        if _BOUNDARY_TIME_PATTERN.search(following_line.split(" ", maxsplit=1)[-1]):
            break
        parts.append(following_line)
    return "".join(parts)


def _parse_conversation(text, customer_name):
    """Parse one conversation transcript into a dict for the HTML template.

    Returns ``{"messages": [...], "started_at": "..."}``. ``started_at`` is
    the first timestamp found in the transcript, or "" when there is none.
    Each message is ``{"sender", "sender_type", "content"}``. Header lines
    whose sender is neither the customer nor staff are skipped.
    """
    conversation = {"messages": [], "started_at": ""}
    lines = text.split("\n")

    for index, line in enumerate(lines):
        pieces = line.split(" ", maxsplit=1)
        name, sent_at = pieces[0], pieces[-1]

        # Only lines that carry a send time are message headers
        match = _HEADER_TIME_PATTERN.search(sent_at)
        if not match:
            continue

        # The first timestamp seen becomes the conversation start time
        if not conversation["started_at"]:
            conversation["started_at"] = match.group("started_at")

        if customer_name in name:
            message = {"sender": "客户", "sender_type": "customer", "content": ""}
        elif any(marker in name for marker in _STAFF_NAME_MARKERS):
            message = {"sender": name, "sender_type": "staff", "content": ""}
        else:
            continue

        # The content is everything between this header and the next one
        message["content"] = _collect_content(lines[index + 1 :])
        conversation["messages"].append(message)

    return conversation


rows = []

for _, row in dataframe.iterrows():

    customer = row["用户名称"]

    # Each part between double line breaks is one conversation
    conversations = [
        _parse_conversation(text, customer) for text in row["详情"].split("\n\n")
    ]

    response = http_client.post(
        url="https://kms.7x24cc.com/api/v1/knowledges/",
        headers={
            # Fetch an access token and put it into the request header
            "Authorization": f"Bearer {Authenticator.get_token(servicer='hlyj')}",
            "Content-Type": "application/json; charset=utf-8",
        },
        json={
            "knowledge_base_id": "143bfe7f-fd79-49f2-8359-c123aba944c2",
            "title": customer if "_" in customer else f"客户{customer}",
            # Render the HTML from the template
            "content": template.render({"conversations": conversations}),
        },
    )

    print(response)

    # Record the knowledge entry id from the response. A plain dict is stored
    # because iterrows() yields pandas Series, which json.dump cannot
    # serialise.
    rows.append({**row.to_dict(), "新增结果": response.get("knowledge_id", "")})
 | |
| 
 | |
# Write the collected rows to results.json as UTF-8, keeping non-ASCII text unescaped
with open("results.json", mode="w", encoding="utf-8") as results_file:
    json.dump(obj=rows, fp=results_file, ensure_ascii=False)
 |