# Python/普康健康客服会话记录整合/main.py

# 173 lines
# 5.5 KiB
# Python

# -*- coding: utf-8 -*-
"""
普康健康_客服会话记录整合
"""
# 加载模块
import json
import re
from pathlib import Path
import pandas
from jinja2 import Environment, FileSystemLoader
from utils.client import Authenticator, HTTPClient
# Directory that holds the exported customer-service conversation CSV files.
directory_path = Path("客服会话记录")
# Read every export first and concatenate once afterwards: growing a DataFrame
# with pandas.concat inside the loop re-copies all previously loaded rows for
# each additional file.
frames = [
    pandas.read_csv(
        filepath_or_buffer=file_path,
        usecols=["用户名称", "会话开始时间", "详情"],
        dtype=str,
        encoding="gbk",
        # Bytes that cannot be decoded as GBK are dropped instead of aborting.
        encoding_errors="ignore",
    )
    for file_path in directory_path.glob("*.csv")
]
# pandas.concat rejects an empty list, so fall back to an empty frame that
# still carries the expected columns; later steps index those columns by name.
dataframe = (
    pandas.concat(frames, ignore_index=True)
    if frames
    else pandas.DataFrame(columns=["用户名称", "会话开始时间", "详情"], dtype=str)
)
def _strip_boilerplate(detail):
    """Return one session transcript without navigation text and blank lines.

    If the transcript contains the bot's navigation greeting, everything up to
    and including the first blank line is discarded. The closing marker line
    and all remaining double line breaks are then removed, so that a double
    line break can later serve as the separator between sessions.
    """
    if "您好,欢迎咨询小暖,猜您可能咨询以下问题" in detail:
        detail = detail.split("\n\n", maxsplit=1)[-1]
    return detail.replace("对话结束 >>\n", "").replace("\n\n", "")


# The CSV is read with dtype=str, which leaves empty cells as NaN (a float).
# Such rows carry no transcript and would make the substring test in
# _strip_boilerplate raise TypeError, so they are dropped up front.
dataframe = dataframe.dropna(subset=["详情"])
dataframe = (
    dataframe.assign(
        # Unparseable start times become NaT rather than raising.
        会话开始时间=pandas.to_datetime(dataframe["会话开始时间"], errors="coerce"),
        # Remove the navigation greeting, the closing marker and blank lines.
        详情=dataframe["详情"].map(_strip_boilerplate),
    )
    # Newest session first.
    .sort_values(by="会话开始时间", ascending=False)
    # One row per user; that user's sessions are joined with a blank line.
    .groupby(by="用户名称", as_index=False)
    .agg(详情=("详情", "\n\n".join))
)
# Finds a "YYYY-MM-DD HH:MM:SS" timestamp anywhere in the searched text; used
# to recognise message header lines of the form "<sender> <timestamp>".
_TIMESTAMP_PATTERN = re.compile(
    r"(?P<started_at>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
)
# Matches text that consists of a timestamp and nothing else; used to detect
# the next header line while collecting the body of a message.
_HEADER_TAIL_PATTERN = re.compile(r"(^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})$")
# Fragments of a sender name that mark the sender as bot or service agent.
_STAFF_MARKERS = ("机器人", "kefu")


def _collect_content(following_lines):
    """Return the body of one message.

    Consumes *following_lines* until a line whose part after the first space
    is exactly a timestamp (the next message header). Empty lines are skipped
    and the remaining lines are concatenated without a separator.
    """
    parts = []
    for text in following_lines:
        if _HEADER_TAIL_PATTERN.search(text.split(" ", maxsplit=1)[-1]):
            break
        if text:
            parts.append(text)
    return "".join(parts)


def _parse_conversation(text, customer_name):
    """Parse one session transcript into a conversation dict.

    Returns ``{"messages": [...], "started_at": str}``. ``started_at`` is the
    first timestamp found in the transcript, or "" when there is none. Header
    lines whose sender is neither the customer nor staff yield no message.
    """
    lines = text.split("\n")
    conversation = {"messages": [], "started_at": ""}
    for index, line in enumerate(lines):
        pieces = line.split(" ", maxsplit=1)
        # Without a space both values are the whole line.
        sender, tail = pieces[0], pieces[-1]
        match = _TIMESTAMP_PATTERN.search(tail)
        if not match:
            # Not a header line; bodies are picked up by _collect_content.
            continue
        if not conversation["started_at"]:
            conversation["started_at"] = match.group("started_at")
        if customer_name in sender:
            message = {"sender": "客户", "sender_type": "customer", "content": ""}
        elif any(marker in sender for marker in _STAFF_MARKERS):
            message = {"sender": sender, "sender_type": "staff", "content": ""}
        else:
            continue
        message["content"] = _collect_content(lines[index + 1 :])
        conversation["messages"].append(message)
    return conversation


# HTML template used to render the conversations of one user.
template = Environment(loader=FileSystemLoader(".")).get_template("template.html")
# Authenticator instance; named in lower case so that it does not shadow the
# imported Authenticator class.
authenticator = Authenticator()
# HTTP client instance.
http_client = HTTPClient()
rows = []
for _, row in dataframe.iterrows():
    customer = row["用户名称"]
    # Sessions of one user are separated by a blank line.
    conversations = [
        _parse_conversation(text, customer) for text in row["详情"].split("\n\n")
    ]
    response = http_client.post(
        url="https://kms.7x24cc.com/api/v1/knowledges/",
        headers={
            # Fetch an access token and place it in the request header.
            "Authorization": "Bearer {}".format(
                authenticator.get_token(servicer="hlyj")
            ),
            "Content-Type": "application/json; charset=utf-8",
        },
        json={
            "knowledge_base_id": "143bfe7f-fd79-49f2-8359-c123aba944c2",
            "title": customer if "_" in customer else f"客户{customer}",
            # Render the HTML from the template.
            "content": template.render({"conversations": conversations}),
        },
    )
    print(response)
    # Keep the knowledge entry id from the response next to the row data.
    # NOTE(review): assumes http_client.post returns a mapping - confirm.
    # Rows are stored as plain dicts: a pandas Series is not JSON serializable
    # and would make json.dump below raise TypeError.
    rows.append({**row.to_dict(), "新增结果": response.get("knowledge_id", "")})
# Long text is best persisted as JSON.
with open("results.json", "w", encoding="utf-8") as file:
    json.dump(rows, file, ensure_ascii=False)