173 lines
5.5 KiB
Python
173 lines
5.5 KiB
Python
# -*- coding: utf-8 -*-
"""
Pukang Health: consolidation of customer-service conversation records.

(Original title: 普康健康_客服会话记录整合)
"""
|
|
|
|
# 加载模块
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pandas
|
|
from jinja2 import Environment, FileSystemLoader
|
|
|
|
from utils.client import Authenticator, HTTPClient
|
|
|
|
# Directory that holds the exported customer-service conversation CSV files
directory_path = Path("客服会话记录")

# The only CSV columns this script reads; any other column is ignored
RECORD_COLUMNS = ["用户名称", "会话开始时间", "详情"]


def load_records(directory):
    """Load every ``*.csv`` file directly under *directory* into one DataFrame.

    Each file is read as GBK text (undecodable bytes are dropped), keeping
    only ``RECORD_COLUMNS`` and requesting ``str`` for every column.

    Args:
        directory: ``pathlib.Path`` of the folder to scan (non-recursive).

    Returns:
        A DataFrame with a fresh ``0..n-1`` index holding the rows of all
        files. When no CSV file is found, an empty DataFrame that still
        carries ``RECORD_COLUMNS`` so later column lookups do not fail.
    """
    # Read all files first and concatenate once: calling ``concat`` inside
    # the loop re-copies the accumulated rows for every additional file.
    frames = [
        pandas.read_csv(
            filepath_or_buffer=file_path,
            usecols=RECORD_COLUMNS,
            dtype=str,
            encoding="gbk",
            encoding_errors="ignore",
        )
        for file_path in directory.glob("*.csv")
    ]

    if not frames:
        # ``pandas.concat([])`` raises, and a column-less frame would break
        # the column lookups performed right after loading.
        return pandas.DataFrame(columns=RECORD_COLUMNS, dtype=str)

    return pandas.concat(frames, ignore_index=True)


# Initialise the data body with every exported session
dataframe = load_records(directory_path)
|
|
|
|
def _strip_boilerplate(detail):
    """Remove the navigation greeting, the closing marker and blank lines.

    Args:
        detail: Raw transcript text of one session.

    Returns:
        The transcript with everything up to the first blank line removed
        when it contains the welcome/navigation message, and with every
        "对话结束 >>" line marker and every double line break removed.
    """
    if "您好,欢迎咨询小暖,猜您可能咨询以下问题" in detail:
        # The greeting block ends at the first blank line; keep what follows
        detail = detail.split("\n\n", maxsplit=1)[-1]
    # Double line breaks are removed here because they are used below as the
    # separator between the sessions of one user.
    return detail.replace("对话结束 >>\n", "").replace("\n\n", "")


# Empty CSV cells are loaded as NaN (a float) even though ``dtype=str`` is
# requested, and the string operations below would raise TypeError on them.
# Rows without a user name are ignored by the group-by anyway.
dataframe = dataframe.dropna(subset=["用户名称", "详情"])

dataframe = (
    dataframe.assign(
        # Unparseable start times become NaT instead of raising
        会话开始时间=pandas.to_datetime(dataframe["会话开始时间"], errors="coerce"),
        # Drop the agent navigation greeting, the closing marker and double line breaks
        详情=dataframe["详情"].apply(_strip_boilerplate),
    )
    # Sort by session start time, newest first
    .sort_values(by="会话开始时间", ascending=False)
    # Group by user name and join that user's transcripts with a blank line
    .groupby(by="用户名称", as_index=False)
    .agg(详情=("详情", "\n\n".join))
)
|
|
|
|
# Long text is better saved as JSON

# Template lookup is rooted at the current working directory.
# NOTE(review): the environment is created without ``autoescape`` (off by
# default in Jinja2), so conversation text is presumably inserted into the
# HTML unescaped unless template.html escapes it itself — confirm.
environment = Environment(loader=FileSystemLoader("."))
template = environment.get_template("template.html")

# Initialise the authenticator.
# NOTE(review): this rebinds the imported class name ``Authenticator`` to an
# instance, so the class itself is no longer reachable under that name. The
# name is kept because later code calls ``Authenticator.get_token``.
Authenticator = Authenticator()

# Initialise the HTTP client
http_client = HTTPClient()
|
|
|
|
# Finds a "YYYY-MM-DD HH:MM:SS" timestamp anywhere in the searched text;
# used to recognise the header line that starts a message.
_SENT_AT_PATTERN = re.compile(r"(?P<started_at>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")
# Matches only when the searched text consists solely of a timestamp;
# used to detect where the content of the current message ends.
_NEXT_HEADER_PATTERN = re.compile(r"(^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})$")
# Sender-name fragments that are treated as staff (机器人 = bot, kefu = agent)
_STAFF_MARKERS = ("机器人", "kefu")


def _split_header(line):
    """Return ``(sender, remainder)``: the text before and after the first space.

    When *line* contains no space, both values are the whole line.
    """
    fields = line.split(" ", maxsplit=1)
    return fields[0], fields[-1]


def _collect_content(following_lines):
    """Concatenate *following_lines* up to the next message header line.

    Lines are joined without a separator; the header line that ends the
    message is not included.
    """
    parts = []
    for candidate in following_lines:
        if _NEXT_HEADER_PATTERN.search(_split_header(candidate)[1]):
            break
        parts.append(candidate)
    return "".join(parts)


def _parse_conversation(transcript, customer_name):
    """Parse the transcript of one session into a conversation dict.

    Args:
        transcript: Text of one session, one line per header or content line.
        customer_name: User name; a header whose sender contains it is
            attributed to the customer.

    Returns:
        ``{"messages": [...], "started_at": str}`` where ``started_at`` is the
        first timestamp found (``""`` when there is none) and each message is
        ``{"sender", "sender_type", "content"}``. Header lines whose sender is
        neither the customer nor staff produce no message.
    """
    lines = transcript.split("\n")
    conversation = {"messages": [], "started_at": ""}

    for index, line in enumerate(lines):
        sender, remainder = _split_header(line)
        match = _SENT_AT_PATTERN.search(remainder)
        if not match:
            continue

        # The first timestamp seen is the start time of the conversation
        if not conversation["started_at"]:
            conversation["started_at"] = match.group("started_at")

        if customer_name in sender:
            label, sender_type = "客户", "customer"
        elif any(marker in sender for marker in _STAFF_MARKERS):
            label, sender_type = sender, "staff"
        else:
            continue

        # Everything between this header and the next one is the content
        conversation["messages"].append(
            {
                "sender": label,
                "sender_type": sender_type,
                "content": _collect_content(lines[index + 1 :]),
            }
        )

    return conversation


# One result record per user, written to disk after the loop
rows = []

for _, row in dataframe.iterrows():
    customer = row["用户名称"]

    # Sessions of one user are separated by a blank line; each is one conversation
    conversations = [
        _parse_conversation(transcript, customer)
        for transcript in row["详情"].split("\n\n")
    ]

    # Fetch the access token and put it into the request header
    token = Authenticator.get_token(servicer="hlyj")

    response = http_client.post(
        url="https://kms.7x24cc.com/api/v1/knowledges/",
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json; charset=utf-8",
        },
        json={
            "knowledge_base_id": "143bfe7f-fd79-49f2-8359-c123aba944c2",
            # User names without an underscore get the "客户" prefix
            "title": customer if "_" in customer else f"客户{customer}",
            # Render the HTML body from the template
            "content": template.render({"conversations": conversations}),
        },
    )

    print(response)

    # Store a plain dict rather than the pandas Series: the json module
    # cannot serialise a Series when the results are dumped.
    record = row.to_dict()
    # Record the knowledge entry id returned by the service ("" when absent)
    record["新增结果"] = response.get("knowledge_id", "")
    rows.append(record)
|
|
|
|
# Persist the collected rows as UTF-8 JSON, keeping non-ASCII text readable
with open("results.json", "w", encoding="utf-8") as output_file:
    output_file.write(json.dumps(rows, ensure_ascii=False))
|