# Python/普康健康客服会话记录整合/main.py

# -*- coding: utf-8 -*-

"""
普康健康_客服会话记录整合
"""

# 加载模块

import json
import re
from pathlib import Path

import pandas
from jinja2 import Environment, FileSystemLoader

from utils.client import Authenticator, HTTPClient

# Directory that is scanned for the exported session CSV files
directory_path = Path("客服会话记录")

# The only columns read from each CSV export
_SESSION_COLUMNS = ["用户名称", "会话开始时间", "详情"]

# Read every CSV first and concatenate once. Re-concatenating onto a growing
# frame inside the loop copies all previous rows on each iteration, and
# seeding the concat with a column-less empty frame is unnecessary.
_frames = [
    pandas.read_csv(
        filepath_or_buffer=csv_path,
        usecols=_SESSION_COLUMNS,
        dtype=str,
        encoding="gbk",
        encoding_errors="ignore",
    )
    for csv_path in directory_path.glob("*.csv")
]

# With no CSV files, fall back to an empty frame that still carries the
# expected columns so later column lookups do not fail with KeyError.
dataframe = (
    pandas.concat(_frames, ignore_index=True)
    if _frames
    else pandas.DataFrame(columns=_SESSION_COLUMNS, dtype=str)
)

# Marker text of the navigation greeting that may open a session's detail text
_NAVIGATION_GREETING = "您好，欢迎咨询小暖，猜您可能咨询以下问题"


def _clean_detail(detail: str) -> str:
    """Strip the navigation greeting, end-of-session marker and double newlines.

    Args:
        detail: Raw detail text of one session.

    Returns:
        The text after the first double newline when the navigation greeting
        is present (otherwise the whole text), with every "对话结束 >>\\n"
        marker and every double newline removed.
    """
    if _NAVIGATION_GREETING in detail:
        detail = detail.split("\n\n", maxsplit=1)[-1]
    return detail.replace("对话结束 >>\n", "").replace("\n\n", "")


# Cells that are empty in the CSV are read as NaN (a float) even with
# dtype=str, which would make the string operations above raise TypeError;
# sessions without any detail text are dropped instead.
dataframe = dataframe.dropna(subset=["详情"])

dataframe = (
    dataframe.assign(
        # Unparseable start times become NaT instead of raising
        会话开始时间=pandas.to_datetime(dataframe["会话开始时间"], errors="coerce"),
        详情=dataframe["详情"].apply(_clean_detail),
    )
    # Newest sessions first
    .sort_values(by="会话开始时间", ascending=False)
    # One row per user name; that user's sessions are joined with a double
    # newline, which is why double newlines were removed from each session
    .groupby(by="用户名称", as_index=False)
    .agg(详情=("详情", "\n\n".join))
)

# Original author's note: long text is better persisted as JSON

# Jinja2 template (template.html in the working directory) used to render
# one user's conversations as HTML
template = Environment(loader=FileSystemLoader(".")).get_template("template.html")

# Authenticator instance; lower-case name so the imported class is not shadowed
authenticator = Authenticator()

# HTTP client instance
http_client = HTTPClient()

# A line opens a message when a timestamp occurs anywhere after its first space
_STARTED_AT_PATTERN = re.compile(
    r"(?P<started_at>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
)
# A line ends the current message body when the text after its first space
# is exactly a timestamp
_HEADER_PATTERN = re.compile(r"(^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})$")
# Substrings of the sender token that identify a staff-side sender
_STAFF_MARKERS = ("机器人", "kefu")


def _collect_content(following_lines: list) -> str:
    """Concatenate message body lines up to the next header line.

    Args:
        following_lines: The lines that come after a message header.

    Returns:
        The non-empty lines joined without any separator, stopping at the
        first line whose text after the first space is exactly a timestamp.
    """
    parts = []
    for text in following_lines:
        if _HEADER_PATTERN.search(text.split(" ", maxsplit=1)[-1]):
            break
        if text:
            parts.append(text)
    return "".join(parts)


def _parse_conversation(block: str, customer_name: str) -> dict:
    """Parse the text of one session into a conversation dict.

    Args:
        block: Text of a single session, one line per "\\n".
        customer_name: User name; a header whose sender token contains it is
            treated as a customer message.

    Returns:
        A dict with "messages" (list of dicts with "sender", "sender_type"
        and "content") and "started_at" (timestamp of the first line that
        contains one, or "" when no line does).
    """
    conversation = {"messages": [], "started_at": ""}
    lines = block.split("\n")

    for index, line in enumerate(lines):
        tokens = line.split(" ", maxsplit=1)
        # tokens[-1] is the whole line when it contains no space
        sender_token, remainder = tokens[0], tokens[-1]

        match = _STARTED_AT_PATTERN.search(remainder)
        if not match:
            continue

        # The first timestamp seen becomes the conversation start time
        if not conversation["started_at"]:
            conversation["started_at"] = match.group("started_at")

        # The customer check runs first, so it wins when both would match
        if customer_name in sender_token:
            sender, sender_type = "客户", "customer"
        elif any(marker in sender_token for marker in _STAFF_MARKERS):
            sender, sender_type = sender_token, "staff"
        else:
            # Headers from any other sender produce no message
            continue

        conversation["messages"].append(
            {
                "sender": sender,
                "sender_type": sender_type,
                "content": _collect_content(lines[index + 1 :]),
            }
        )

    return conversation


# One entry per user: the grouped row plus the result of the upload
rows = []

for _, row in dataframe.iterrows():
    customer = row["用户名称"]

    # Sessions of one user are separated by a double newline
    conversations = [
        _parse_conversation(block, customer) for block in row["详情"].split("\n\n")
    ]

    response = http_client.post(
        url="https://kms.7x24cc.com/api/v1/knowledges/",
        headers={
            # Access token obtained from the authenticator for every request
            "Authorization": "Bearer {}".format(
                authenticator.get_token(servicer="hlyj")
            ),
            "Content-Type": "application/json; charset=utf-8",
        },
        json={
            "knowledge_base_id": "143bfe7f-fd79-49f2-8359-c123aba944c2",
            # User names without an underscore get the "客户" prefix
            "title": customer if "_" in customer else f"客户{customer}",
            # Render the conversations to HTML with the template
            "content": template.render({"conversations": conversations}),
        },
    )

    print(response)

    # Store the knowledge entry id from the response on the row
    # NOTE(review): assumes the client returns a dict-like object - confirm
    row["新增结果"] = response.get("knowledge_id", "")

    rows.append(row)

# The entries of ``rows`` come from ``DataFrame.iterrows()`` and are pandas
# Series, which ``json.dump`` cannot serialize (TypeError). Convert them to
# plain dicts first, and do it before opening the file so a failure cannot
# leave a truncated results.json behind.
records = [dict(entry) for entry in rows]

with open("results.json", "w", encoding="utf-8") as file:
    json.dump(records, file, ensure_ascii=False)