364 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			364 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
| # -*- coding: utf-8 -*-
 | ||
| 
 | ||
| 
 | ||
| """
 | ||
| 基于RFM模型生成数据分析报告
 | ||
| """
 | ||
| 
 | ||
| # 导入模块
 | ||
| 
 | ||
| import statistics
 | ||
| from datetime import datetime
 | ||
| from decimal import Decimal, ROUND_HALF_UP
 | ||
| 
 | ||
| import pandas
 | ||
| from jinja2 import Environment, FileSystemLoader
 | ||
| 
 | ||
| from utils.client import MySQLClient
 | ||
| from utils.pandas_extension import DrawAsHTML
 | ||
| 
 | ||
| 
 | ||
# Lookup table from the concatenated R/F/M level flags (in R, F, M order)
# to the customer segment label used throughout the report.
_CLASSIFICATIONS = {
    "000": "流失客户",
    "010": "一般维持客户",
    "100": "新客户",
    "110": "潜力客户",
    "001": "重要挽留客户",
    "101": "重要深耕客户",
    "011": "重要唤回客户",
    "111": "重要价值客户",
}


def map_classification(r_encoded, f_encoded, m_encoded):
    """Map the R, F and M level flags to a customer segment label.

    The three flags are formatted into a three-character code (R first,
    then F, then M) and looked up in ``_CLASSIFICATIONS``. The caller in
    this script passes 0 when a metric is at or below its mean and 1 when
    it is above.

    Args:
        r_encoded: Recency flag; expected to format as "0" or "1".
        f_encoded: Frequency flag; expected to format as "0" or "1".
        m_encoded: Monetary flag; expected to format as "0" or "1".

    Returns:
        The segment label (a Chinese string) for the flag combination.

    Raises:
        ValueError: If the combined code is not one of the eight known
            combinations. Previously this surfaced as an
            UnboundLocalError from the unmatched ``match`` statement.
    """
    code = f"{r_encoded}{f_encoded}{m_encoded}"
    try:
        return _CLASSIFICATIONS[code]
    except KeyError:
        raise ValueError(f"unknown RFM code: {code!r}") from None
 | ||
| 
 | ||
| 
 | ||
# Step 1: pull the raw transactions out of MySQL.
print("1 加载数据集...", end="")

client = MySQLClient(database="data_analysis")

# Per the original author's note, the selected columns are:
#   客户ID   (customer id)  - string
#   交易日期 (trade date)   - datetime.date
#   交易金额 (trade amount) - Decimal
dataset_query = "select 客户ID, 交易金额, 交易日期 from rfm_dataset"
dataframe = client.execute_query(sql=dataset_query)

print("已完成")
 | ||
| 
 | ||
# Step 2: clean the transactions and restrict them to the analysis years.
print("2 预处理,删除包含缺失值的样本和重复样本...", end="")

# Drop rows that contain any missing value (in place).
dataframe.dropna(inplace=True)

# Drop exact duplicate rows, keeping the first occurrence and renumbering
# the index.
dataframe = dataframe.drop_duplicates(ignore_index=True, inplace=False)

# Keep only trades dated in 2012 or 2013. The original author notes that the
# column holds datetime.date objects rather than datetime64 values, so the
# year is read per element instead of through Series.dt.year.
traded_in_scope = dataframe["交易日期"].apply(
    lambda trade_date: trade_date.year in [2012, 2013]
)
dataframe = dataframe[traded_in_scope].reset_index(drop=True)

# Number of transactions left after cleaning, as a whole-number Decimal.
sample_size = Decimal(dataframe.shape[0]).quantize(Decimal("0"))

print("已完成")
 | ||
| 
 | ||
# Step 3: aggregate the transactions into one R/F/M row per customer.
print("3 构建RFM...", end="")

# Earliest trade date in the cleaned sample; recency is measured from it.
min_trade_date = dataframe["交易日期"].min()


def _recency_days(trade_dates):
    """Return days from the sample's earliest trade to this customer's latest trade."""
    elapsed = trade_dates.max() - min_trade_date
    return Decimal(elapsed.days).quantize(Decimal("0"), rounding=ROUND_HALF_UP)


def _trade_count(customer_ids):
    """Return the number of transactions in the group as a whole-number Decimal."""
    return Decimal(len(customer_ids)).quantize(Decimal("0"), rounding=ROUND_HALF_UP)


def _trade_total(amounts):
    """Return the summed trade amount of the group, rounded to two places."""
    # The sum starts from Decimal("0") rather than the built-in default int 0.
    return sum(amounts, Decimal("0")).quantize(
        Decimal("0.00"), rounding=ROUND_HALF_UP
    )


# R: days between the sample's earliest trade date and the customer's most
#    recent trade; F: number of trades; M: total trade amount. All three are
#    Decimals, and the original author notes they are positively oriented.
rfm = (
    dataframe.groupby(by="客户ID")
    .agg(
        R=("交易日期", _recency_days),
        F=("客户ID", _trade_count),
        M=("交易金额", _trade_total),
    )
    .reset_index()
)

# Number of distinct customers (one RFM row each).
customer_counts = Decimal(rfm.shape[0]).quantize(Decimal("0"))

# Total trade amount across all customers.
trade_amounts = sum(rfm["M"], Decimal("0.00")).quantize(Decimal("0.00"))

print("已完成")
 | ||
| 
 | ||
# Step 4: split each metric at its mean and label every customer.
print("4 基于平均数将R、F和M分为低、高两个等级并组合为八种客户分类...", end="")

# Mean of each metric, computed with statistics.mean on the Decimal values
# (the original author chose it for precision) and rounded to two places.
# noinspection PyUnresolvedReferences
means = {
    metric: statistics.mean(rfm[metric]).quantize(
        Decimal("0.00"), rounding=ROUND_HALF_UP
    )
    for metric in ("R", "F", "M")
}


def _classify_customer(row):
    """Return the segment label for one RFM row.

    A metric is flagged 0 when it is at or below its mean and 1 otherwise.
    """
    levels = {
        metric: (0 if row[metric] <= means[metric] else 1)
        for metric in ("R", "F", "M")
    }
    return map_classification(
        r_encoded=levels["R"],
        f_encoded=levels["F"],
        m_encoded=levels["M"],
    )


rfm = rfm.assign(客户分类=rfm.apply(_classify_customer, axis="columns"))

# Attach each customer's segment label to every one of their transactions.
dataframe = dataframe.merge(right=rfm[["客户ID", "客户分类"]], on="客户ID", how="left")

print("已完成")
 | ||
| 
 | ||
# Step 5: render the report artefacts.
print("5 生成分析报告...", end="")

draw = DrawAsHTML()

# Data preview: five randomly sampled transactions rendered as a table.
preview_rows = dataframe.sample(5)
draw.table(dataframe=preview_rows, file_name="数据预览.html")
 | ||
| 
 | ||
def _mean_2dp(values):
    """Return the mean of the group's Decimal values, rounded to two places."""
    return statistics.mean(values).quantize(Decimal("0.00"), rounding=ROUND_HALF_UP)


def _customer_share(members):
    """Return the group's share of all customers, as a percentage to two places."""
    share = Decimal(len(members)) / customer_counts * Decimal("100")
    return share.quantize(Decimal("0.00"), rounding=ROUND_HALF_UP)


def _amount_share(amounts):
    """Return the group's share of the total trade amount, as a percentage to two places."""
    share = sum(amounts, Decimal("0.00")) / trade_amounts * Decimal("100")
    return share.quantize(Decimal("0.00"), rounding=ROUND_HALF_UP)


# Per-segment summary: mean R/F/M plus each segment's share of customers
# and of the total trade amount.
customer_types = (
    rfm.groupby(by="客户分类")
    .agg(
        R=("R", _mean_2dp),
        F=("F", _mean_2dp),
        M=("M", _mean_2dp),
        客户占比=("客户分类", _customer_share),
        交易金额占比=("M", _amount_share),
    )
    .reset_index()
)
 | ||
| 
 | ||
# Segment distribution: one point per segment from its mean R, F and M.
segment_means = customer_types[["客户分类", "R", "F", "M"]]
draw.scatter(
    dataframe=segment_means,
    xaxis_opts_min=475,
    xaxis_opts_max=750,
    file_name="客户分类分布.html",
)

# Share of customers per segment, largest first.
customer_share = customer_types[["客户分类", "客户占比"]].sort_values(
    by="客户占比", ascending=False
)
draw.pie(dataframe=customer_share, file_name="客户占比.html")

# Share of trade amount per segment, largest first.
amount_share = customer_types[["客户分类", "交易金额占比"]].sort_values(
    by="交易金额占比", ascending=False
)
draw.pie(dataframe=amount_share, file_name="交易金额占比.html")
 | ||
| 
 | ||
# Backward-looking report: for every customer segment and every month of
# 2013, count the distinct customers who traded in the twelve-month window
# ending in that month (the month itself included).
#
# Changes from the previous version:
#   * the per-window count no longer rebinds the module-level
#     `customer_counts` (the total number of customers), so that name keeps
#     its meaning for the rest of the script;
#   * the window bounds do not depend on the segment, so they are computed
#     once instead of once per segment;
#   * rows are collected in a list and the frame is built in one go instead
#     of being enlarged one row at a time.

# (label, first day, last day) for each window, 2013-01 through 2013-12.
trailing_windows = []
for month in range(1, 13):
    window_month = pandas.Period(value=f"2013-{month:02d}", freq="M")
    trailing_windows.append(
        (
            f"2013-{month:02d}",
            # Step back eleven months so the window spans twelve in total.
            (window_month - 11).start_time.date(),
            window_month.end_time.date(),
        )
    )

backward_rows = []
for customer_type in customer_types["客户分类"]:
    in_segment = dataframe["客户分类"] == customer_type

    for period, period_start, period_end in trailing_windows:
        in_window = (dataframe["交易日期"] >= period_start) & (
            dataframe["交易日期"] <= period_end
        )

        # Distinct customers of this segment who traded inside the window.
        window_customers = dataframe.loc[in_segment & in_window, "客户ID"].nunique()

        backward_rows.append([customer_type, period, window_customers])

report_backward = pandas.DataFrame(
    data=backward_rows, columns=["客户分类", "窗口期", "客户数"]
)
 | ||
| 
 | ||
# Trailing twelve-month customer trend: sum the per-segment customer counts
# for each window and draw them as an area chart.
customers_per_window = report_backward.groupby(by="窗口期", as_index=False).agg(
    客户数=("客户数", "sum")
)
draw.area(
    dataframe=customers_per_window,
    file_name="近十二个自然月客户数趋势.html",
    yaxis_opts_min=1350,
)
 | ||
| 
 | ||
def _window_share(row):
    """Return the row's customer count as a percentage of its window total.

    Both operands are converted to Decimal so the division and rounding are
    done in decimal arithmetic.
    """
    share = Decimal(row["客户数"]) / Decimal(row["总客户数"]) * Decimal("100")
    return share.quantize(Decimal("0.00"), rounding=ROUND_HALF_UP)


# Keep only the new, churned and high-value segments.
focus_segments = report_backward.loc[
    report_backward["客户分类"].isin(["新客户", "流失客户", "重要价值客户"])
]

# Window total, repeated on every row of that window. The filter above has
# already been applied, so this is the total across the three kept segments.
focus_segments = focus_segments.assign(
    总客户数=focus_segments.groupby(by="窗口期")["客户数"].transform("sum")
)

report_backward = focus_segments.assign(
    客户占比=focus_segments.apply(_window_share, axis="columns")
)

# Trailing twelve-month share trend for the three kept segments, drawn as a
# stacked bar chart from the segment, window and share columns only.
draw.bar(
    dataframe=report_backward[["客户分类", "窗口期", "客户占比"]],
    file_name="近十二个自然月客户占比趋势.html",
    stack=True,
)
 | ||
| 
 | ||
def _retention_rate(row):
    """Return the row's customer count as a percentage of the segment baseline."""
    rate = Decimal(row["客户数"]) / Decimal(row["基准客户数"]) * Decimal("100")
    return rate.quantize(Decimal("0.00"), rounding=ROUND_HALF_UP)


# Forward-looking (cohort) report.

# Each customer's earliest trade date, repeated on every one of their rows.
cohort_trades = dataframe.assign(
    最早交易日期=dataframe.groupby(by="客户ID")["交易日期"].transform("min")
)

# Period = whole months between a trade and the customer's earliest trade,
# formatted as "M+NN".
year_gap = cohort_trades["交易日期"].apply(lambda d: d.year) - cohort_trades[
    "最早交易日期"
].apply(lambda d: d.year)
month_gap = cohort_trades["交易日期"].apply(lambda d: d.month) - cohort_trades[
    "最早交易日期"
].apply(lambda d: d.month)
cohort_trades = cohort_trades.assign(周期=year_gap * 12 + month_gap)
cohort_trades = cohort_trades.assign(
    周期=cohort_trades["周期"].apply(lambda months: f"M+{months:02d}")
)

# Cohort = year and month of the customer's earliest trade.
cohort_trades = cohort_trades.assign(
    群组=cohort_trades["最早交易日期"].apply(lambda d: f"{d.year}-{d.month:02d}")
)

# Cohort-by-period matrix: distinct customers per segment, cohort and period.
cohort_matrix = cohort_trades.groupby(
    by=["客户分类", "群组", "周期"], as_index=False
).agg(客户数=("客户ID", "nunique"))

# Collapse the cohorts: customers per segment and period.
retention = cohort_matrix.groupby(by=["客户分类", "周期"], as_index=False).agg(
    客户数=("客户数", "sum")
)

# Baseline = the first row of each segment in this period-sorted frame.
retention = retention.assign(
    基准客户数=retention.groupby(by=["客户分类"])["客户数"].transform("first")
)
retention = retention.assign(
    留存率=retention.apply(_retention_rate, axis="columns")
)

# Keep the new, churned and high-value segments for periods M+01..M+12.
tracked_periods = [f"M+{offset:02d}" for offset in range(1, 13)]
is_tracked = retention["客户分类"].isin(
    ["新客户", "流失客户", "重要价值客户"]
) & retention["周期"].isin(tracked_periods)
report_forward = retention.loc[is_tracked].reset_index(drop=True)

# Retention-rate trend as a line chart.
draw.line(
    dataframe=report_forward[["客户分类", "周期", "留存率"]],
    file_name="近十二个自然月留存率趋势.html",
)
 | ||
| 
 | ||
# Load the report template from the current working directory.
template_env = Environment(loader=FileSystemLoader("."))
template = template_env.get_template("template.html")

# Render the template with the report date (today, YYYY-MM-DD) and the
# number of transactions analysed.
rfm_report = template.render(
    report_date=datetime.now().strftime("%Y-%m-%d"),
    sample_size=sample_size,
)

# Write the rendered HTML next to the script's working directory.
with open("rfm_report.html", "w", encoding="utf8") as report_file:
    report_file.write(rfm_report)

print("已完成")
 |