# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
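Script description:
Recommender system. The recall stage uses user-based and item-based collaborative filtering to generate the candidate item list; the ranking stage uses sequential modelling and a multi-head attention mechanism to re-rank the candidates into the final recommendation list.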
|
||
"""
|
||
|
||
import heapq
|
||
import time
|
||
from collections import defaultdict, deque
|
||
from itertools import islice
|
||
from operator import itemgetter
|
||
from typing import Literal
|
||
|
||
import numpy
|
||
from pyLSHash import LSHash
|
||
# 导入模块
|
||
from pydantic import BaseModel, Field, model_validator
|
||
|
||
"""
|
||
|
||
编码区域
|
||
|
||
"""
|
||
|
||
|
||
class InitializationArguments(BaseModel):
    """Validated constructor arguments of the recommender system."""

    # Sliding time window in days (5 to 360, default 30); trades freshness
    # of the data against computational cost.
    time_window: int = Field(30, ge=5, le=360)

    # Decay factor lambda (0 to 10, default 0); controls how fast an
    # interest score decays.
    decay_lambda: float = Field(0, ge=0.00, le=10)

    # Number of dimensions of a user feature vector (2 to 200, default 10).
    attributes_dimensions: int = Field(10, ge=2.00, le=200)
|
||
|
||
|
||
class Attributes(BaseModel):
    """User attributes; every field is required."""

    # Age, 1 to 99 inclusive.
    age: int = Field(..., ge=1, le=99)

    # Gender, one of the two literal values.
    gender: Literal["male", "female"] = Field(...)

    # City.
    city: str = Field(...)

    # Occupation.
    occupation: str = Field(...)
|
||
|
||
|
||
class Record(BaseModel):
    """A user-attribute record: a user identifier plus that user's attributes."""

    # User identifier, exactly 6 characters long.
    user: str = Field(..., min_length=6, max_length=6)

    # The user's attributes, validated by the Attributes model.
    attributes: Attributes = Field(...)
|
||
|
||
|
||
class BehaviorRecord(BaseModel):
    """Data model of a single user-behaviour record."""

    # User identifier, exactly 6 characters long.
    user: str = Field(default=..., min_length=6, max_length=6)

    # Timestamp of the behaviour. The recommender subtracts it from
    # time.time(), so it is expected to be in seconds since the epoch.
    timestamp: int = Field(default=...)

    # Behaviour type; "rating" is the only value accepted.
    type: Literal["rating"] = Field(default=...)

    # Item identifier, exactly 6 characters long.
    item: str = Field(default=..., min_length=6, max_length=6)

    # Rating, 1 to 5 inclusive; required when the behaviour type is "rating".
    rating: int | None = Field(default=None, ge=1, le=5)

    @model_validator(mode="after")
    def _validate_rating(self):
        """Reject a "rating" behaviour that carries no rating value.

        Raises:
            ValueError: If ``type`` is "rating" and ``rating`` is missing.
        """
        # Explicit None test: the field constraint already excludes 0, so
        # "missing" is the only case this validator has to catch.
        if self.type == "rating" and self.rating is None:
            raise ValueError("若行为类型为评分则评分必填")
        return self
|
||
|
||
|
||
class RecommenderSystem:
    """Recommender system based on collaborative filtering.

    Candidates are recalled with an item-based and a user-based collaborative
    filtering pass, normalised, and blended into one ranked result.

    State kept per instance:
        behaviors: per-user state, see ``_new_user_state``.
        items_similarity_numerator: ``{(item_a, item_b): float}`` with the
            pair sorted; numerator of the item-item cosine similarity.
        items_similarity_denominator_square: ``{item: float}``; squared norm
            used for the denominator of the cosine similarity.
        items_inversion: inverted index ``{item: [user, ...]}``.
        attributes: ``{user: numpy vector}`` of user features.
        attributes_indexer: LSHash index over the user feature vectors.
    """

    def __init__(self, **arguments):
        """Initialise the recommender system.

        Args:
            **arguments: Optional ``time_window``, ``decay_lambda`` and
                ``attributes_dimensions``; validated by
                ``InitializationArguments``.
        """
        # Per behaviour type: (base interest score, decay weight). The base
        # score of a rating is derived from the rating itself, hence None.
        self.behavior_arguments = {"rating": (None, 1)}

        # Validate and unpack the initialisation arguments.
        arguments = InitializationArguments(**arguments).model_dump()
        self.time_window = arguments.get("time_window")
        self.decay_lambda = arguments.get("decay_lambda")
        self.attributes_dimensions = arguments.get("attributes_dimensions")

        # Per-user behaviour state.
        self.behaviors = defaultdict(self._new_user_state)

        # The item-item cosine similarity is split into a numerator and a
        # squared denominator so both can be updated incrementally whenever
        # a behaviour is added, changed or expired.
        self.items_similarity_numerator = defaultdict(float)
        self.items_similarity_denominator_square = defaultdict(float)

        # Inverted index {item: [user, ...]}.
        self.items_inversion = defaultdict(list)

        # User feature vectors (core data of the user-based pass).
        self.attributes = defaultdict(lambda: numpy.zeros(self.attributes_dimensions))

        # LSHash index over the user feature vectors (8 hash bits, 2 tables).
        self.attributes_indexer = LSHash(
            hash_size=8, input_dim=self.attributes_dimensions, num_hashtables=2
        )

    @staticmethod
    def _new_user_state() -> dict:
        """Return the empty state stored for one user."""
        return {
            # Latest interest score per item inside the time window.
            "scores": defaultdict(float),
            # Timestamp that produced the current score of each item; used to
            # recognise stale entries in the heap below.
            "timestamps": {},
            # Min-heap of (timestamp, item) tuples.
            "timestamps_heap": [],
            # At most 200 distinct items, oldest first.
            "items_history": deque(maxlen=200),
            # Raw user attributes written by process_attribute_record.
            "attributes": {},
        }

    def process_attribute_record(self, attribute_record: dict) -> bool:
        """Validate a user-attribute record and store its non-empty attributes.

        Args:
            attribute_record: Mapping accepted by the ``Record`` model.

        Returns:
            Always ``True`` (validation errors propagate as exceptions).
        """
        record = Record(**attribute_record).model_dump()
        # setdefault keeps this working for per-user dicts created elsewhere.
        stored = self.behaviors[record["user"]].setdefault("attributes", {})

        for key, value in record["attributes"].items():
            # Only non-empty attribute values overwrite what is stored.
            if value:
                stored[key] = value

        # NOTE(review): neither self.attributes (feature vectors) nor the LSH
        # index is updated here, so the user-based pass finds nothing yet.
        return True

    def process_behavior_record(self, behavior_record: dict) -> None:
        """Validate a behaviour record and fold it into the model state.

        Args:
            behavior_record: Mapping accepted by the ``BehaviorRecord`` model.
        """
        record = BehaviorRecord(**behavior_record).model_dump()
        self._apply_behavior(
            user=record["user"],
            timestamp=record["timestamp"],
            behavior_type=record["type"],
            item=record["item"],
            rating=record["rating"],
        )

    def _apply_behavior(self, user, timestamp, behavior_type, item, rating) -> None:
        """Apply one already-validated behaviour to the model state.

        The state only changes when the new interest score is strictly
        greater than the score currently stored for ``(user, item)``.
        """
        # Drop expired data of this user first.
        self._reorganize_behaviors(user=user)

        state = self.behaviors[user]
        scores = state["scores"]

        # .get avoids inserting a zero entry for an item never scored.
        score_past = scores.get(item, 0.0)
        score = self._calculate_score(
            timestamp=timestamp, type=behavior_type, rating=rating
        )
        if not score > score_past:
            return

        history = state["items_history"]
        if item in history:
            # Re-scored item: move it to the newest position instead of
            # storing it twice (duplicates would be double-counted).
            history.remove(item)
        elif history.maxlen is not None and len(history) >= history.maxlen:
            # History is full: retire the oldest item explicitly so its
            # score, similarity contributions and index entry are undone.
            self._forget_item(user=user, item=history[0])

        scores[item] = score
        state["timestamps"][item] = timestamp
        heapq.heappush(state["timestamps_heap"], (timestamp, item))
        history.append(item)

        self._update_items_similarity_components(
            user=user, item=item, score_old=score_past, score_new=score
        )

        if user not in self.items_inversion[item]:
            self.items_inversion[item].append(user)

    def _reorganize_behaviors(self, user) -> None:
        """Remove the behaviours of ``user`` that fell out of the time window."""
        state = self.behaviors[user]
        timestamps_heap = state["timestamps_heap"]
        now = time.time()

        while (
            timestamps_heap
            and (now - timestamps_heap[0][0]) / 86400 > self.time_window
        ):
            timestamp_expired, item_expired = heapq.heappop(timestamps_heap)

            # A heap entry is stale when the item has since been re-scored
            # with another timestamp or was already removed; skip it.
            if state["timestamps"].get(item_expired) != timestamp_expired:
                continue

            self._forget_item(user=user, item=item_expired)

    def _forget_item(self, user, item) -> None:
        """Undo every trace of ``item`` in the state of ``user``."""
        state = self.behaviors[user]
        score_expired = state["scores"].pop(item, 0.0)
        state["timestamps"].pop(item, None)

        history = state["items_history"]
        if item in history:
            history.remove(item)

        if score_expired:
            self._update_items_similarity_components(
                user=user, item=item, score_old=score_expired, score_new=0
            )

        users = self.items_inversion.get(item)
        if users is not None:
            if user in users:
                users.remove(user)
            # Drop the index entry once no user references the item.
            if not users:
                del self.items_inversion[item]

    def _calculate_score(self, timestamp, type, rating):
        """Return the interest score of one behaviour.

        Args:
            timestamp: Time of the behaviour in seconds since the epoch.
            type: Behaviour type, a key of ``self.behavior_arguments``.
            rating: Rating value; used when ``type`` is "rating".

        Returns:
            ``0`` when the behaviour is older than the time window, otherwise
            the base score multiplied by an exponential decay over the age
            in days.
        """
        # Age of the behaviour in days.
        time_interval = (time.time() - timestamp) / 86400
        if time_interval > self.time_window:
            return 0

        # A timestamp in the future (e.g. clock skew) must not inflate the
        # score through a positive exponent.
        time_interval = max(time_interval, 0.0)

        score_base, weight = self.behavior_arguments.get(type)
        if type == "rating":
            # Sigmoid centred on rating 3, mapped into the interval (0.2, 0.8).
            score_base = 0.2 + 0.6 * (1 / (1 + numpy.exp(3 - rating)))

        decay_lambda_weighted = self.decay_lambda * weight
        return score_base * numpy.exp(0 - decay_lambda_weighted * time_interval)

    def _update_items_similarity_components(self, user, item, score_old, score_new) -> None:
        """Incrementally update the cosine-similarity numerator and denominator.

        Called after the score of ``(user, item)`` changed from ``score_old``
        to ``score_new``; every pair formed by ``item`` and another item in
        the user's history is adjusted by the score difference.
        """
        state = self.behaviors[user]
        scores = state["scores"]
        delta = score_new - score_old

        for item_history in state["items_history"]:
            if item_history != item:
                # Sorted so (a, b) and (b, a) share one key.
                pair = tuple(sorted((item_history, item)))
                self.items_similarity_numerator[pair] += delta * scores.get(
                    item_history, 0.0
                )

        self.items_similarity_denominator_square[item] += score_new**2 - score_old**2

    def generate_recommendations(self, user, k=10) -> dict:
        """Return up to ``k`` recommended items for ``user``.

        Args:
            user: User identifier.
            k: Maximum number of recommendations.

        Returns:
            ``{item: blended score}`` ordered by descending score.
        """
        self._reorganize_behaviors(user=user)

        # Each recall pass may return up to ten times the requested amount.
        maximum_candidates = k * 10
        items_history = self.behaviors[user]["items_history"]

        # Weight of the item-based pass; computed from the history length.
        alpha_weight = 0.2 / (1 + numpy.exp(0.05 * len(items_history) - 1.2)) + 0.65

        candidates_items = self._generate_items_candidates(
            user=user, maximum_candidates=maximum_candidates
        )
        candidates_users = self._generate_users_candidates(
            user=user, maximum_candidates=maximum_candidates
        )

        # An item found by only one pass counts as 0 in the other pass.
        items_candidates = {
            item_candidate: candidates_items.get(item_candidate, 0.0) * alpha_weight
            + candidates_users.get(item_candidate, 0.0) * (1 - alpha_weight)
            for item_candidate in candidates_items.keys() | candidates_users.keys()
        }

        # Ties are broken by item identifier so the order does not depend on
        # set iteration order.
        ranked = sorted(items_candidates.items(), key=lambda pair: (-pair[1], pair[0]))
        return dict(islice(ranked, k))

    def _generate_items_candidates(self, user, maximum_candidates) -> dict:
        """Recall candidates with item-based collaborative filtering.

        Items in the user's history act as seeds. Items in the histories of
        other users who share a seed are recalled, and each recalled item is
        scored by the user's seed score, the item-item cosine similarity and
        a popularity-suppression factor.
        """
        items_recall = defaultdict(lambda: {"counts": 0, "scores": 0})

        behaviors = self.behaviors[user]
        items_heuristic = behaviors["items_history"]
        items_known = set(items_heuristic)

        # Seed item -> users who share it -> their histories.
        for item_heuristic in items_heuristic:
            for user_heuristic in self.items_inversion.get(item_heuristic, ()):
                if user_heuristic == user:
                    continue
                for item_recall in self.behaviors[user_heuristic]["items_history"]:
                    if item_recall not in items_known:
                        items_recall[item_recall]["counts"] += 1

        denominators = self.items_similarity_denominator_square
        for item_heuristic in items_heuristic:
            users_heuristic = set(self.items_inversion.get(item_heuristic, ()))
            square_heuristic = denominators.get(item_heuristic, 0.0)
            score_heuristic = behaviors["scores"].get(item_heuristic, 0.0)

            for item_recall, entry in items_recall.items():
                users_recall = set(self.items_inversion.get(item_recall, ()))

                # Accumulated rounding can leave a tiny negative square, so
                # only a strictly positive product gets a square root.
                square_product = square_heuristic * denominators.get(item_recall, 0.0)
                if square_product > 0:
                    pair = tuple(sorted((item_heuristic, item_recall)))
                    similarity = self.items_similarity_numerator.get(
                        pair, 0.0
                    ) / numpy.sqrt(square_product)
                else:
                    similarity = 0

                # Popularity suppression: shared users over the geometric
                # mean of both user counts; 0 when either item has no users.
                users_product = len(users_heuristic) * len(users_recall)
                if users_product:
                    popularity_suppressed = len(
                        users_heuristic & users_recall
                    ) / numpy.sqrt(users_product)
                else:
                    popularity_suppressed = 0

                entry["scores"] += score_heuristic * similarity * popularity_suppressed

        return self._normalize_candidates(
            items_recall=items_recall, maximum_candidates=maximum_candidates
        )

    def _generate_users_candidates(self, user, maximum_candidates) -> dict:
        """Recall candidates with user-based collaborative filtering.

        Users with a similar feature vector are looked up in the LSH index
        and the items of their histories that ``user`` has not seen are
        recalled.
        """
        items_recall = defaultdict(lambda: {"counts": 0, "scores": 0})

        # .get does not create a zero vector for an unknown user.
        attributes = self.attributes.get(user)

        # Skip users whose vector is missing or still all zeros. "any" is
        # required here: a one-hot vector never has all components non-zero.
        if attributes is not None and numpy.any(attributes != 0):
            items_known = set(self.behaviors[user]["items_history"])

            # NOTE(review): the three-value unpacking and the use of the
            # middle value as a similarity are kept from the original;
            # confirm both against the result format of LSHash.query.
            for _, similarity, user_recall in self.attributes_indexer.query(
                query_vector=attributes,
                num_results=maximum_candidates,
                dist_func="cosine",
            ):
                behaviors_recall = self.behaviors[user_recall]

                for item_recall in behaviors_recall["items_history"]:
                    if item_recall not in items_known:
                        items_recall[item_recall]["counts"] += 1
                        items_recall[item_recall]["scores"] += (
                            behaviors_recall["scores"][item_recall] * similarity
                        )

        return self._normalize_candidates(
            items_recall=items_recall, maximum_candidates=maximum_candidates
        )

    def _normalize_candidates(self, items_recall, maximum_candidates) -> dict:
        """Min-max normalise recalled scores into [0.2, 0.8] and truncate.

        Args:
            items_recall: ``{item: {"counts": int, "scores": number}}``.
            maximum_candidates: Maximum number of candidates returned.

        Returns:
            ``{item: normalised score}`` ordered by descending score. Every
            item gets 0.8 when all raw scores are equal; an empty input
            yields an empty dict.
        """
        if not items_recall:
            return {}

        scores_raw = {item: entry["scores"] for item, entry in items_recall.items()}
        scores_minimum = min(scores_raw.values())
        scores_range = max(scores_raw.values()) - scores_minimum

        if scores_range != 0:
            candidates = {
                item: ((score - scores_minimum) / scores_range) * 0.6 + 0.2
                for item, score in scores_raw.items()
            }
        else:
            # No spread to normalise over: every candidate gets 0.8.
            candidates = dict.fromkeys(scores_raw, 0.8)

        return dict(
            islice(
                sorted(candidates.items(), key=itemgetter(1), reverse=True),
                maximum_candidates,
            )
        )
|
||
|
||
|
||
if __name__ == "__main__":

    # Demo run: feed a handful of ratings and print the recommendations
    # generated for user "cccccc".
    recommender = RecommenderSystem()

    # (user, item, age of the rating in seconds, rating)
    demo_ratings = (
        ("aaaaaa", "111111", 3600, 4),
        ("aaaaaa", "333333", 3200, 4),
        ("bbbbbb", "333333", 3200, 4),
        ("cccccc", "111111", 3200, 5),
        ("cccccc", "222222", 3200, 5),
        ("cccccc", "333333", 3200, 3),
    )

    for user_id, item_id, seconds_ago, stars in demo_ratings:
        recommender.process_behavior_record(
            behavior_record={
                "user": user_id,
                "item": item_id,
                "type": "rating",
                "timestamp": int(time.time() - seconds_ago),
                "rating": stars,
            }
        )

    print(recommender.generate_recommendations(user="cccccc"))

    # Stop here; what follows in the file is an old script kept inside a
    # string literal.
    exit()
|
||
|
||
"""
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
import numpy
|
||
|
||
import pandas
|
||
|
||
import zipcodes
|
||
|
||
import re
|
||
|
||
from collections import Counter
|
||
|
||
from scipy.stats import chisquare
|
||
|
||
from scipy.stats.contingency import association
|
||
|
||
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
||
|
||
import math
|
||
|
||
from minisom import MiniSom
|
||
|
||
from sklearn.cluster import KMeans
|
||
|
||
import warnings
|
||
|
||
#忽略警告
|
||
warnings.simplefilter('ignore')
|
||
|
||
import sys
|
||
|
||
sys.path.append('..')
|
||
|
||
from utils.pandas2chart import Pandas2chart
|
||
|
||
from utils.algorithms import OptimalClusters
|
||
|
||
#本脚本中所调用的函数
|
||
|
||
#提取性别特征时将特征值“M”映射为“male”,“F”映射为“female”
|
||
def Gender(element):
|
||
|
||
match element:
|
||
|
||
case 'M':
|
||
|
||
return 'male'
|
||
|
||
case 'F':
|
||
|
||
return 'female'
|
||
|
||
case default:
|
||
|
||
return numpy.nan
|
||
|
||
#提取年龄特征时将小于18岁映射为“under18”,大于等于18岁且小于等于24岁映射为“18~24”,大于等于25岁且小于等于34岁映射为“25~34”,大于等于35岁且小于等于44岁映射为“35~44”,大于等于45岁且小于等于54岁映射为“45~54”,大于54岁映射为“above54””
|
||
def Age(element):
|
||
|
||
match element:
|
||
|
||
case age if age > 54:
|
||
|
||
return 'above54'
|
||
|
||
case age if age >= 45:
|
||
|
||
return '45~54'
|
||
|
||
case age if age >= 35:
|
||
|
||
return '35~44'
|
||
|
||
case age if age >= 25:
|
||
|
||
return '25~34'
|
||
|
||
case age if age >= 18:
|
||
|
||
return '18~24'
|
||
|
||
case age if age < 18:
|
||
|
||
return 'under18'
|
||
|
||
case default:
|
||
|
||
return numpy.nan
|
||
|
||
#提取职业特征时根据映射表映射
|
||
def Occupation(element):
|
||
|
||
match element:
|
||
|
||
case 0:
|
||
|
||
return 'other'
|
||
|
||
case 1:
|
||
|
||
return 'academic/educator'
|
||
|
||
case 2:
|
||
|
||
return 'artist'
|
||
|
||
case 3:
|
||
|
||
return 'clerical/admin'
|
||
|
||
case 4:
|
||
|
||
return 'college/grad student'
|
||
|
||
case 5:
|
||
|
||
return 'customer service'
|
||
|
||
case 6:
|
||
|
||
return 'doctor/health care'
|
||
|
||
case 7:
|
||
|
||
return 'executive/managerial'
|
||
|
||
case 8:
|
||
|
||
return 'farmer'
|
||
|
||
case 9:
|
||
|
||
return 'homemaker'
|
||
|
||
case 10:
|
||
|
||
return 'k-12 student'
|
||
|
||
case 11:
|
||
|
||
return 'lawyer'
|
||
|
||
case 12:
|
||
|
||
return 'programmer'
|
||
|
||
case 13:
|
||
|
||
return 'retired'
|
||
|
||
case 14:
|
||
|
||
return 'sales/marketing'
|
||
|
||
case 15:
|
||
|
||
return 'scientist'
|
||
|
||
case 16:
|
||
|
||
return 'self-employed'
|
||
|
||
case 17:
|
||
|
||
return 'technician/engineer'
|
||
|
||
case 18:
|
||
|
||
return 'tradesman/craftsman'
|
||
|
||
case 19:
|
||
|
||
return 'unemployed'
|
||
|
||
case 20:
|
||
|
||
return 'writer'
|
||
|
||
case default:
|
||
|
||
return numpy.nan
|
||
|
||
#提取州级行政区特征时,根据邮政编码模糊查询州级行政区,若为空或多个则映射为“null”,否则为查询结果
|
||
def State(element):
|
||
|
||
#校验邮政编码格式
|
||
if not re.match(r'^\d{4,5}$|^\d{5}-\d{4}$', element):
|
||
|
||
#若邮政编码由9位数字组成,修改邮政编码格式,否则为“null”
|
||
if re.match(r'^\d{9}$', element):
|
||
|
||
element = element[: 5] + '-' + element[-4: ]
|
||
|
||
else:
|
||
|
||
return numpy.nan
|
||
|
||
#根据邮政编码模糊查询并解析州级行政区
|
||
states = [element.get('state') for element in zipcodes.similar_to(element)]
|
||
|
||
#若州级行政区数为1则将查询结果定义为州级行政区,否则为“null”
|
||
if len(set(states)) == 1:
|
||
|
||
return states[0]
|
||
|
||
else:
|
||
|
||
return numpy.nan
|
||
|
||
#提取最近最喜欢的电影体裁特征
|
||
def Genres(series):
|
||
|
||
#合并每位用户评价过的电影体裁并切割为单个
|
||
genres = series.str.cat(sep = '|').split('|')
|
||
|
||
#查询数量最多的电影体裁并返回
|
||
return str.lower(Counter(genres).most_common(1)[0][0])
|
||
|
||
#独热编码特征名组合器
|
||
def Combiner(feature, category):
|
||
|
||
return str(feature) + ':' + str(category)
|
||
|
||
#若本脚本被调用报错
|
||
if __name__ != '__main__':
|
||
|
||
print('本脚本不允许被调用')
|
||
print()
|
||
|
||
exit()
|
||
|
||
print('1 打开本地数据文件,读取数据集...', end = '')
|
||
|
||
try:
|
||
|
||
dataset_users = pandas.read_csv(filepath_or_buffer = './MovieLens10K/users.csv', low_memory = False)
|
||
|
||
dataset_movies = pandas.read_csv(filepath_or_buffer = './MovieLens10K/movies.csv', low_memory = False)
|
||
|
||
dataset_ratings = pandas.read_csv(filepath_or_buffer ='./MovieLens10K/ratings.csv', low_memory = False)
|
||
|
||
except:
|
||
|
||
print('读取失败,请检查数据文件是否存在或正确')
|
||
print()
|
||
|
||
exit()
|
||
|
||
print('已完成')
|
||
print()
|
||
|
||
#评分数据集根据电影标识关联电影名称和体裁
|
||
dataset_ratings = dataset_ratings.merge(right = dataset_movies[['movieId', 'title', 'genres']], how = 'left', on = 'movieId')
|
||
|
||
#统计用户数
|
||
users = dataset_users.shape[0]
|
||
|
||
print('2 构建标签体系')
|
||
print()
|
||
|
||
print('2.1 提取特征...', end = '')
|
||
|
||
dataset = pandas.DataFrame(data = dataset_users['userId'].tolist(), columns = ['userId'])
|
||
|
||
#提取性别特征
|
||
dataset['gender'] = dataset_users['gender'].map(lambda element: Gender(element))
|
||
|
||
#提取年龄特征
|
||
dataset['age'] = dataset_users['age'].map(lambda element: Age(element))
|
||
|
||
#提取职业特征
|
||
dataset['occupation'] = dataset_users['occupation'].map(lambda element: Occupation(element))
|
||
|
||
#提取州级行政区特征
|
||
#dataset['state'] = dataset_users['zip'].map(lambda element: State(element))
|
||
|
||
#就评分数据集按照userId分组,统计每位用户最喜欢的体裁
|
||
pivottable_ratings = dataset_ratings.groupby(by = 'userId').agg(
|
||
|
||
#最喜欢的电影体裁
|
||
genres = pandas.NamedAgg(column = 'genres', aggfunc = Genres)
|
||
|
||
)
|
||
|
||
pivottable_ratings.reset_index(inplace = True)
|
||
|
||
#合并分箱后评分数和平均评分,另最喜欢的电影体裁
|
||
dataset = dataset.merge(right = pivottable_ratings[['userId', 'genres']], how = 'left', on = 'userId')
|
||
|
||
#删除用户ID和包含缺失值的样本
|
||
dataset.pop('userId').dropna(inplace = True)
|
||
|
||
print('已完成')
|
||
print()
|
||
|
||
#统计样本数
|
||
samples = dataset.shape[0]
|
||
|
||
#获取特征名称
|
||
independents = dataset.columns
|
||
|
||
print('特征数据集中样本数为 %d 例,特征数为 %d 个。' % (samples, len(independents)))
|
||
print()
|
||
|
||
print('2.2 检验各特征各项样本数是否符合均匀分布')
|
||
print()
|
||
|
||
for independent in independents:
|
||
|
||
#按照特征分组,统计各项样本数
|
||
pivottable = dataset.groupby(by = independent).agg(
|
||
|
||
samples = pandas.NamedAgg(column = independent, aggfunc = 'count')
|
||
|
||
)
|
||
|
||
#检验各项样本数是否均匀分布
|
||
statistic, probability = chisquare(f_obs = pivottable['samples'].to_numpy())
|
||
|
||
if probability < 0.05:
|
||
|
||
print('特征 %s 各项样本数不符合均匀分布,卡方统计量为 %.2f,概率为 %.2f 。' % (independent, statistic, probability))
|
||
print()
|
||
|
||
else:
|
||
|
||
print('特征 %s 各项样本数符合均匀分布,卡方统计量为 %.2f,概率为 %.2f 。' % (independent, statistic, probability))
|
||
print()
|
||
|
||
pivottable.reset_index(inplace = True)
|
||
|
||
#按照样本数倒序排序
|
||
pivottable.sort_values(by = 'samples', ascending = False, inplace = True)
|
||
|
||
#若项数大于指定值,则将第(指定值)项至最后一项合并为一项,指定值为6
|
||
if pivottable.shape[0] > 6:
|
||
|
||
pivottable_marging = pivottable.iloc[: 5]
|
||
|
||
#合并后的项名为others,统计第(指定值-1)项至最后一项样本数的和
|
||
pivottable_marging.loc[pivottable_marging.shape[0]] = ['others', pivottable.iloc[5: , 1].sum()]
|
||
|
||
else:
|
||
|
||
pivottable_marging = pivottable
|
||
|
||
#生成环形图
|
||
Pandas2chart(dataset = pivottable_marging, type = 'circular', path = './reports/persona_report/circular_{}.html'.format(independent))
|
||
|
||
print('2.3 统计特征之间相关系数')
|
||
print()
|
||
|
||
#用于保存特征之间克莱姆相关系数矩阵
|
||
correlation_matrix = pandas.DataFrame(data = [], index = independents, columns = independents)
|
||
|
||
#用于保存相关特征对
|
||
correlation_pairs = []
|
||
|
||
for index, independent_index in enumerate(independents):
|
||
|
||
for column, independent_column in enumerate(independents):
|
||
|
||
#统计特征之间克莱姆相关系数
|
||
statistic = round(association(observed = pandas.crosstab(index = dataset[independent_index], columns = dataset[independent_column])), 2)
|
||
|
||
correlation_matrix.loc[independent_index, independent_column] = statistic
|
||
|
||
#获取相关特征对
|
||
if column > index and statistic >= 0.25:
|
||
|
||
correlation_pairs.append({'independent': independent_index, 'independent_correlation': independent_column})
|
||
|
||
#生成相关系数矩阵热力图
|
||
Pandas2chart(dataset = correlation_matrix, type = 'heatmap', path = './reports/persona_report/heatmap_correlation_matrix.html')
|
||
|
||
print('3、构建用户细分群体')
|
||
print()
|
||
|
||
print('3.1 独热编码特征并标准化...', end = '')
|
||
|
||
#独热编码特征,用于决策树算法模型
|
||
onehot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore', feature_name_combiner = Combiner).fit(X = dataset.to_numpy())
|
||
|
||
dataset_processing = pandas.DataFrame(data = onehot_encoder.transform(X = dataset.to_numpy()), columns = onehot_encoder.get_feature_names_out(input_features = independents)).astype(dtype = 'int')
|
||
|
||
#独热编码特征
|
||
dataset_preprocessing = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore').fit_transform(X = dataset.to_numpy())
|
||
|
||
#标准化特征
|
||
dataset_preprocessing = StandardScaler().fit_transform(X = dataset_preprocessing)
|
||
|
||
print('已完成')
|
||
print()
|
||
|
||
print('3.2 基于自我组织映射算法初步聚类...', end = '')
|
||
|
||
#定义竞争层的长度和高度(经验值)
|
||
competitive_layer_length = competitive_layer_heigth = math.ceil(2.25 * math.pow(samples, 0.25))
|
||
|
||
#创建自我组织映射算法模型
|
||
som = MiniSom(x = competitive_layer_length, y = competitive_layer_heigth, input_len = dataset_preprocessing.shape[1], sigma = math.sqrt(math.pow(competitive_layer_length, 2) + math.pow(competitive_layer_heigth, 2)), activation_distance = 'cosine', random_seed = 0)
|
||
|
||
#初始化模型
|
||
som.pca_weights_init(data = dataset_preprocessing)
|
||
|
||
#训练模型
|
||
som.train_batch(data = dataset_preprocessing, num_iteration = 10)
|
||
|
||
#获取各样本的竞争层中优胜点坐标
|
||
dataset_preprocessing = [som.winner(record) for record in dataset_preprocessing]
|
||
|
||
dataset_preprocessing = pandas.DataFrame(data = dataset_preprocessing, columns = ['axis_x', 'axis_y'])
|
||
|
||
print('已完成')
|
||
print()
|
||
|
||
print('3.3 就各样本的竞争层中优胜点坐标基于K均值算法再次聚类,使用间隔统计量评估聚类效果并确定最优聚类簇数...', end = '')
|
||
|
||
#创建K均值算法模型并训练
|
||
kmeans = KMeans(n_clusters = OptimalClusters(dataset_preprocessing.to_numpy()), n_init = 'auto').fit(dataset_preprocessing.to_numpy())
|
||
|
||
dataset_processing['cluster_label'] = kmeans.labels_
|
||
|
||
print('已完成')
|
||
print()
|
||
|
||
print('3.4 基于决策树拟合聚类结果并输出聚类规则...', end = '')
|
||
|
||
|
||
|
||
|
||
|
||
|
||
print(dataset_processing['cluster_label'].max())
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
"""
|