This commit is contained in:
liubiren 2025-10-30 00:57:18 +08:00
parent 1f3fb8eefa
commit c46f53e2f9
1 changed files with 202 additions and 275 deletions

View File

@ -48,6 +48,27 @@ class Attributes(BaseModel):
occupation: str = Field(default=...)
class BehaviorRecord(BaseModel):
    """Data model for a single user behavior record.

    Only the ``"rating"`` behavior type is accepted, and a record of that type
    must carry a rating value.
    """

    # User identifier (exactly 6 characters)
    user: str = Field(default=..., min_length=6, max_length=6)
    # Timestamp of the behavior (presumably Unix seconds; confirm against callers)
    timestamp: int = Field(default=...)
    # Behavior type
    type_: Literal["rating"] = Field(default=...)
    # Item identifier (exactly 6 characters)
    item: str = Field(default=..., min_length=6, max_length=6)
    # Rating, an integer from 1 to 5; optional at field level
    rating: int | None = Field(default=None, ge=1, le=5)

    @model_validator(mode="after")
    def _validate_rating(self):
        """Require a rating whenever the behavior type is ``"rating"``.

        Raises:
            ValueError: if the behavior type is ``"rating"`` and no rating
                was supplied.
        """
        # Explicit None check: the field constraint (ge=1) already excludes 0,
        # so "missing" is the only case this validator has to catch.
        if self.type_ == "rating" and self.rating is None:
            raise ValueError("若行为类型为评分则评分必填")
        return self
class AttributesRecord(BaseModel):
"""用户属性记录数据模型"""
@ -57,34 +78,13 @@ class AttributesRecord(BaseModel):
attributes: Attributes = Field(default=...)
class BehaviorRecord(BaseModel):
    """Data model for a single user behavior record.

    Only the ``"rating"`` behavior type is accepted, and a record of that type
    must carry a rating value.
    """

    # User identifier (exactly 6 characters)
    user: str = Field(default=..., min_length=6, max_length=6)
    # Timestamp of the behavior (presumably Unix seconds; confirm against callers)
    timestamp: int = Field(default=...)
    # Behavior type (the field name shadows the builtin, kept for interface compatibility)
    type: Literal["rating"] = Field(default=...)
    # Item identifier (exactly 6 characters)
    item: str = Field(default=..., min_length=6, max_length=6)
    # Rating, an integer from 1 to 5; optional at field level
    rating: int | None = Field(default=None, ge=1, le=5)

    @model_validator(mode="after")
    def _validate_rating(self):
        """Require a rating whenever the behavior type is ``"rating"``.

        Raises:
            ValueError: if the behavior type is ``"rating"`` and no rating
                was supplied.
        """
        # Explicit None check: the field constraint (ge=1) already excludes 0,
        # so "missing" is the only case this validator has to catch.
        if self.type == "rating" and self.rating is None:
            raise ValueError("若行为类型为评分则评分必填")
        return self
class RecommenderSystem:
"""基于协同过滤的推荐系统"""
def __init__(self, **arguments):
"""初始化推荐系统"""
# 行为参数配置项(不同行为类型配置不同的的兴趣基础分数和衰减权重)
# 行为参数配置项(根据行为类型配置兴趣基础分数和衰减权重)
self.behavior_arguments = {"rating": (None, 1)}
# 校验并解析初始化参数
@ -96,7 +96,7 @@ class RecommenderSystem:
# 用户行为数据储存对象
self.behaviors = defaultdict(
lambda: {
# 物品兴趣分数
# 物品兴趣分数记录
"scores": defaultdict(float),
# 时间戳最小堆
"timestamps_heap": [],
@ -105,11 +105,11 @@ class RecommenderSystem:
}
)
# 物品相似度时组件
# 物品的相似度组成成分
self.items_similarity_numerator = defaultdict(float)
self.items_similarity_denominator_square = defaultdict(float)
# 物品倒排索引
# 物品倒排索引
self.items_inversion = defaultdict(list)
# 用户特征向量数据储存对象
@ -120,6 +120,118 @@ class RecommenderSystem:
hash_size=8, input_dim=self.attributes_dimensions, num_hashtables=2
)
def process_behavior_record(self, behavior_record: dict) -> bool:
    """Validate one user behavior record and fold it into the recommender state.

    The record is validated through ``BehaviorRecord``; any validation error
    raised there propagates to the caller. The user's expired data is evicted
    first, then the interest score of the new record is computed. The state
    is only touched when the new score is strictly greater than the score
    currently stored for that (user, item) pair.

    Args:
        behavior_record: raw mapping with the keys expected by
            ``BehaviorRecord`` (user, timestamp, type_, item, rating).

    Returns:
        Always ``True`` once the record has been processed.
    """
    # Validate and normalise the incoming record
    record = BehaviorRecord(**behavior_record).model_dump()
    user = record["user"]
    timestamp = record["timestamp"]
    type_ = record["type_"]
    item = record["item"]
    rating = record["rating"]

    # Drop data that has fallen outside the time window before comparing scores
    self._reorganize_behaviors(user=user)

    behaviors = self.behaviors[user]
    scores = behaviors["scores"]
    history_items = behaviors["history_items"]

    # Previous score; .get() avoids inserting a phantom 0.0 entry into the
    # score mapping when the record ends up being ignored
    score_old = scores.get(item, 0.0)
    # Score of the incoming record
    score_new = self._calculate_score(
        timestamp=timestamp, type_=type_, rating=rating
    )

    if score_new > score_old:
        scores[item] = score_new
        heapq.heappush(behaviors["timestamps_heap"], (timestamp, item))
        # Record the item once only: a duplicate history entry would be
        # counted twice by every later similarity-component update
        if item not in history_items:
            history_items.append(item)
        self._update_items_similarity_components(
            user=user, item=item, score_old=score_old, score_new=score_new
        )
        # Register the user in the item's inverted index if not yet present
        users_of_item = self.items_inversion[item]
        if user not in users_of_item:
            users_of_item.append(user)

    return True
def _reorganize_behaviors(self, user) -> None:
    """Evict the user's behavior data that has fallen outside the time window.

    Entries are popped from the user's timestamp min-heap while the oldest
    one is more than ``self.time_window`` days old. For each expired item the
    stored score, the history entry and the inverted-index entry are removed,
    and the item-similarity components are updated with a new score of 0.

    Args:
        user: identifier of the user whose data is cleaned up.
    """
    behaviors = self.behaviors[user]
    timestamps_heap = behaviors["timestamps_heap"]
    scores = behaviors["scores"]
    history_items = behaviors["history_items"]

    # One clock reading for the whole pass keeps the cut-off consistent
    now = time.time()

    while (
        timestamps_heap
        and (now - timestamps_heap[0][0]) / 86400 > self.time_window
    ):
        # Pop the oldest entry; only the item identifier is needed
        _, item_expired = heapq.heappop(timestamps_heap)

        # Membership is tested on the score mapping itself (testing it on the
        # stored float raises TypeError)
        if item_expired in scores:
            score_expired = scores.pop(item_expired)
            self._update_items_similarity_components(
                user=user, item=item_expired, score_old=score_expired, score_new=0
            )

        # Remove the expired item from the user's history
        if item_expired in history_items:
            history_items.remove(item_expired)

        # Remove the user from the item's inverted index, and drop the index
        # entry entirely once no user is left; .get() avoids creating an
        # empty entry just to delete it again
        users_of_item = self.items_inversion.get(item_expired)
        if users_of_item is not None:
            if user in users_of_item:
                users_of_item.remove(user)
            if not users_of_item:
                del self.items_inversion[item_expired]
def _calculate_score(self, timestamp, type_, rating) -> float:
    """Compute a time-decayed interest score for one behavior.

    Args:
        timestamp: time of the behavior, in the same unit as ``time.time()``.
        type_: behavior type; must be a key of ``self.behavior_arguments``.
        rating: rating value, used when ``type_`` is ``"rating"``.

    Returns:
        ``0.0`` if the behavior is older than ``self.time_window`` days,
        otherwise the base score multiplied by
        ``exp(-age_days * decay_lambda * weight)``.

    Raises:
        KeyError: if ``type_`` is not configured in ``self.behavior_arguments``.
    """
    # Age in days. Clamped at 0 so that a timestamp slightly in the future
    # (e.g. clock skew) cannot turn the decay factor into an amplification.
    time_interval = max(0.0, (time.time() - timestamp) / 86400)

    # Behaviors outside the time window contribute nothing
    if time_interval > self.time_window:
        return 0.0

    # Base score and decay weight configured for this behavior type
    score_base, weight = self.behavior_arguments[type_]

    # For ratings the base score is a logistic transform centred on 3,
    # which maps ratings 1..5 to roughly 0.2..0.8
    if type_ == "rating":
        score_base = 0.1 + 0.8 * (1 / (1 + numpy.exp(3 - rating)))

    return float(
        score_base * numpy.exp(0 - time_interval * (self.decay_lambda * weight))
    )
def _update_items_similarity_components(self, user, item, score_old, score_new):
    """Apply a score change of one (user, item) pair to the similarity components.

    For every other item in the user's history, the numerator stored for the
    ordered item pair is incremented by the score change multiplied by the
    user's score for that other item. The squared-denominator component of
    ``item`` is incremented by the change of the squared score.
    """
    profile = self.behaviors[user]
    delta = score_new - score_old

    for other in profile["history_items"]:
        # An item is never paired with itself
        if other == item:
            continue
        # Sorted tuple so that (a, b) and (b, a) share one key
        key = tuple(sorted((other, item)))
        self.items_similarity_numerator[key] += delta * profile["scores"][other]

    # Squared-norm contribution of this user for the item
    self.items_similarity_denominator_square[item] += score_new**2 - score_old**2
def process_attributes_record(self, attributes_record: dict) -> bool:
"""处理用户属性记录"""
@ -135,154 +247,8 @@ class RecommenderSystem:
return True
# 处理用户行为记录
def process_behavior_record(self, behavior_record: dict):
# Process one user behavior record and update the recommender state.
# NOTE(review): indentation is not preserved in this view, so the nesting of
# the statements below (in particular the inverted-index update at the end)
# cannot be confirmed from here.
# Validate the record against the BehaviorRecord model and convert it to a dict
behavior_record = BehaviorRecord(**behavior_record).model_dump()
# User identifier
user = behavior_record["user"]
# Timestamp
timestamp = behavior_record["timestamp"]
# Behavior type
type = behavior_record["type"]
# Item identifier
item = behavior_record["item"]
# Rating: required when the behavior type is "rating"; an integer in the range 1-5
rating = behavior_record["rating"]
# Clean up the user's behavior data
self._reorganize_behaviors(user=user)
# Previously stored interest score
score_past = self.behaviors[user]["scores"][item]
# Interest score of the incoming record
score = self._calculate_score(timestamp=timestamp, type=type, rating=rating)
# Only when the new score is greater than the stored one
if score > score_past:
# Update the stored interest score
self.behaviors[user]["scores"][item] = score
# Push (timestamp, item) onto the timestamp heap
heapq.heappush(self.behaviors[user]["timestamps_heap"], (timestamp, item))
# Append the item to the user's item history
self.behaviors[user]["items_history"].append(item)
# Update the numerator and squared-denominator components of the item-item cosine similarity
self._update_items_similarity_components(
user=user, item=item, score_old=score_past, score_new=score
)
# If the user is not yet listed in the inverted index for this item
if user not in self.items_inversion[item]:
# Add the user to the inverted index
self.items_inversion[item].append(user)
# 整理用户行为数据(基于时间窗口清理过期数据)
def _reorganize_behaviors(self, user):
# Clean up the user's behavior data by evicting entries older than the time window.
# NOTE(review): indentation is not preserved in this view, so which of the
# statements below are nested under the first `if` cannot be confirmed from here.
# Timestamp heap of the user
timestamps_heap = self.behaviors[user]["timestamps_heap"]
# While the heap is non-empty and its first entry is older than the time window (in days)
while (
timestamps_heap
and (time.time() - timestamps_heap[0][0]) / 86400 > self.time_window
):
# Pop the first heap entry to obtain the expired timestamp and item identifier
timestamp_expired, item_expired = heapq.heappop(timestamps_heap)
# If the expired item has a non-zero stored interest score
if self.behaviors[user]["scores"][item_expired]:
# Read the interest score of the expired item
score_expired = self.behaviors[user]["scores"][item_expired]
# Delete the expired item from the stored scores
del self.behaviors[user]["scores"][item_expired]
# If the expired item is in the user's item history
if item_expired in self.behaviors[user]["items_history"]:
# Remove the expired item from the item history
self.behaviors[user]["items_history"].remove(item_expired)
# Update the numerator and squared-denominator components of the item-item cosine similarity
self._update_items_similarity_components(
user=user, item=item_expired, score_old=score_expired, score_new=0
)
# If the user is listed in the inverted index for the expired item
if user in self.items_inversion[item_expired]:
# Remove the user from that list
self.items_inversion[item_expired].remove(user)
# If the inverted-index list for the expired item is now empty
if not self.items_inversion[item_expired]:
# Delete the inverted-index entry for that item
del self.items_inversion[item_expired]
# 计算兴趣分数
def _calculate_score(self, timestamp, type, rating):
    """Compute an interest score that decays exponentially with age.

    Returns 0 when the behavior is older than ``self.time_window`` days.
    Otherwise the base score configured for the behavior type (replaced by a
    logistic transform of the rating for ``"rating"`` behaviors) is multiplied
    by ``exp(-decay_lambda * weight * age_days)``.
    """
    # Age of the behavior in days
    age_days = (time.time() - timestamp) / 86400
    if age_days > self.time_window:
        return 0

    # Base score and decay weight configured for the behavior type
    base, weight = self.behavior_arguments.get(type)
    if type == "rating":
        # Logistic transform of the rating, centred on 3
        base = 0.2 + 0.6 * (1 / (1 + numpy.exp(3 - rating)))

    return base * numpy.exp(0 - self.decay_lambda * weight * age_days)
# 更新计算物品标识-物品标识余弦相似度的分子和分子平方部分
def _update_items_similarity_components(self, user, item, score_old, score_new):
    """Propagate a score change of one (user, item) pair into the similarity components.

    The numerator of every ordered pair formed by ``item`` and another item of
    the user's history receives the score change weighted by the user's score
    for that other item; the squared-denominator component of ``item``
    receives the change of the squared score.
    """
    user_state = self.behaviors[user]
    score_change = score_new - score_old

    # Every history item except the updated one forms a pair with it
    partners = (
        candidate for candidate in user_state["items_history"] if candidate != item
    )
    for partner in partners:
        ordered_pair = tuple(sorted((partner, item)))
        self.items_similarity_numerator[ordered_pair] += (
            score_change * user_state["scores"][partner]
        )

    self.items_similarity_denominator_square[item] += score_new**2 - score_old**2
# 生成推荐列表
def generate_recommendations(self, user, k=10):
def generate_recommendations(self, user, k: int = 10) -> dict:
"""生成TOP-K推荐列表"""
# 推荐物品标识列表
items_candidates = defaultdict(float)
@ -290,25 +256,19 @@ class RecommenderSystem:
# 整理用户行为数据
self._reorganize_behaviors(user=user)
# 最大候选数
maximum_candidates = k * 10
# 基于物品的协同过滤生成推荐物品标识列表
candidates_items = self._generate_items_candidates(user=user, k=k)
behaviors = self.behaviors[user]
print(candidates_items)
# 历史物品标识列表
items_history = behaviors["items_history"]
# 基于用户的协同过滤生成推荐物品标识列表
candidates_users = self._generate_users_candidates(user=user, k=k)
# 基于物品协同过滤算法生成的候选物品兴趣分数权重
alpha_weight = 0.2 / (1 + numpy.exp(0.05 * len(items_history) - 1.2)) + 0.65
# 基于物品协同过滤算法生成候选物品标识列表
candidates_items = self._generate_items_candidates(
user=user, maximum_candidates=maximum_candidates
)
# 基于用户协同过滤算法生成候选物品标识列表
candidates_users = self._generate_users_candidates(
user=user, maximum_candidates=maximum_candidates
# 动态调整基于物品的协同过滤权重
alpha_weight = (
0.4
/ (1 + numpy.exp(0.05 * len(self.behaviors[user]["history_items"]) - 1.2))
+ 0.5
)
# 合并基于物品协同过滤算法生成的候选物品标识列表和基于用户协同过滤算法生成候的选物品标识列表
@ -322,88 +282,67 @@ class RecommenderSystem:
islice(sorted(items_candidates.items(), key=itemgetter(1), reverse=True), k)
)
# 基于物品协同过滤算法生成候选物品标识列表
def _generate_items_candidates(self, user, maximum_candidates):
def _generate_items_candidates(self, user, k):
"""基于物品的协同过滤生成推荐物品标识列表"""
# 召回物品标识列表
items_recall = defaultdict(lambda: {"counts": 0, "scores": 0})
# 用户行为数据
behaviors = self.behaviors[user]
# 历史物品标识列表作为启发物品标识列表
items_heuristic = behaviors["items_history"]
# 启发物品标识列表
items_heuristic = behaviors["history_items"]
# 先通过启发式物品标识在物品标识倒排表查询索引为启发式物品标识的启发式用户标识,再通过启发式用户标识查询历史物品标识列表作为候选物品标识
# 先通过启发物品标识在该物品的倒排索引召回启发用户标识,再通过启发用户标识在该用户的历史物品记录召回物品标识
for item_heuristic in items_heuristic:
for user_heuristic in self.items_inversion[item_heuristic]:
# 若通过启发式物品标识在物品标识倒排表查询索引为启发式物品标识的启发式用户标识和用户标识不一致
if user_heuristic != user:
for item_recall in self.behaviors[user_heuristic]["items_history"]:
# 若召回物品标识不在启发物品标识列表
for item_recall in self.behaviors[user_heuristic]["history_items"]:
if item_recall not in items_heuristic:
items_recall[item_recall]["counts"] += 1
# 遍历启发式物品标识列表和召回物品标识列表(召回物品标识列表不可能包含启发式物品标识),计算余弦相似度
# 计算余弦相似度
for item_heuristic in items_heuristic:
# 在物品标识倒排表查询索引为启发式物品标识的用户标识列表
users_heuristic = self.items_inversion[item_heuristic]
for item_recall in items_recall:
# 在物品标识倒排表查询索引为召回物品标识的用户标识列表
users_recall = self.items_inversion[item_recall]
# 计算物品标识-物品标识余弦相似度时分母部分
# 物品的相似度组成成分的分母
items_similarity_denominator = numpy.sqrt(
self.items_similarity_denominator_square[item_heuristic]
* self.items_similarity_denominator_square[item_recall]
)
# 计算物品标识-物品标识余弦相似度时分母部分不为0
if items_similarity_denominator != 0:
# 构建有序物品标识对
pair = tuple(sorted((item_heuristic, item_recall)))
# 构建物品标识-物品标识的有序物品标识对
pair = tuple(sorted((item_heuristic, item_recall)))
# 余弦相似度
similarity = (
similarity = (
(
self.items_similarity_numerator[pair]
/ items_similarity_denominator
)
else:
similarity = 0
# 在物品标识倒排表查询索引为历史物品标识的用户标识列表和在物品标识倒排表查询索引为召回物品标识的用户标识列表共同用户标识列表
users_common = list(set(users_heuristic) & set(users_recall))
# 抑制流行物品因子
popularity_suppressed = len(users_common) / numpy.sqrt(
len(users_heuristic) * len(users_recall)
if items_similarity_denominator != 0
else 0
)
# 流行度抑制因子
popularity_suppressed = len(
list(set(users_heuristic) & set(users_recall))
) / numpy.sqrt(len(users_heuristic) * len(users_recall))
# 加权物品的相似度
items_recall[item_recall]["scores"] += (
behaviors["scores"][item_heuristic]
* similarity
* popularity_suppressed
)
# 归一化候选物品标识列表
candidates = self._normalize_candidates(
items_recall=items_recall, maximum_candidates=maximum_candidates
)
return candidates
return self._normalize_scores(items_recall=items_recall, k=k)
# 基于用户协同过滤算法生成候选物品标识列表
def _generate_users_candidates(self, user, maximum_candidates):
def _generate_users_candidates(self, user, k):
# 召回物品标识列表
items_recall = defaultdict(lambda: {"counts": 0, "scores": 0})
@ -416,7 +355,7 @@ class RecommenderSystem:
# 基于LSHash查询与用户特征向量相似的用户标识作为召回用户标识
for _, similarity, user_recall in self.attributes_indexer.query(
query_vector=attributes,
num_results=maximum_candidates,
num_results=k,
dist_func="cosine",
):
@ -435,60 +374,48 @@ class RecommenderSystem:
)
# 归一化候选物品标识列表
candidates = self._normalize_candidates(
items_recall=items_recall, maximum_candidates=maximum_candidates
)
candidates = self._normalize_scores(items_recall=items_recall, k=k)
return candidates
# 归一化候选物品标识列表
def _normalize_candidates(self, items_recall, maximum_candidates):
@staticmethod
def _normalize_scores(items_recall, k):
"""归一化候选物品的相似度"""
# 候选物品标识列表
candidates = defaultdict(float)
# 若召回物品标识列表非空字典
print(items_recall)
if items_recall:
scores = [value["scores"] for value in items_recall.values()]
# 候选物品兴趣分数
scores = [nest["scores"] for nest in items_recall.values()]
# 候选物品相似分数最小值
scores_minimum = min(scores, default=0)
# 候选物品相似分数最大值
scores_maximum = max(scores, default=0)
# 候选物品相似度的最小值和最大值
scores_minimum, scores_maximum = min(scores, default=0), max(
scores, default=0
)
# 值距
scores_range = scores_maximum - scores_minimum
# 若值距不为0
if scores_range != 0:
# 兴趣分数归一化
for item_recall in items_recall:
# 基于最小值-最大值归一化
for item_recall in items_recall:
# 若值距不为0
if scores_range != 0:
candidates[item_recall] = (
(items_recall[item_recall]["scores"] - scores_minimum)
/ scores_range
) * 0.6 + 0.2
else:
# 兴趣分数默认为0.8
for item_recall in items_recall:
else:
candidates[item_recall] = 0.8
# 根据兴趣分数倒序排序并截取
candidates = dict(
islice(
sorted(candidates.items(), key=itemgetter(1), reverse=True),
maximum_candidates,
)
return dict(
islice(
sorted(candidates.items(), key=itemgetter(1), reverse=True),
k * 10,
)
return candidates
)
if __name__ == "__main__":
@ -500,42 +427,42 @@ if __name__ == "__main__":
{
"user": "aaaaaa",
"item": "111111",
"type": "rating",
"type_": "rating",
"timestamp": int(time.time() - 3600),
"rating": 4,
},
{
"user": "aaaaaa",
"item": "333333",
"type": "rating",
"type_": "rating",
"timestamp": int(time.time() - 3200),
"rating": 4,
},
{
"user": "bbbbbb",
"item": "333333",
"type": "rating",
"type_": "rating",
"timestamp": int(time.time() - 3200),
"rating": 4,
},
{
"user": "cccccc",
"item": "111111",
"type": "rating",
"type_": "rating",
"timestamp": int(time.time() - 3200),
"rating": 5,
},
{
"user": "cccccc",
"item": "222222",
"type": "rating",
"type_": "rating",
"timestamp": int(time.time() - 3200),
"rating": 5,
},
{
"user": "cccccc",
"item": "333333",
"type": "rating",
"item": "444444",
"type_": "rating",
"timestamp": int(time.time() - 3200),
"rating": 3,
},
@ -545,7 +472,7 @@ if __name__ == "__main__":
recommender.process_behavior_record(behavior_record=feedback_record)
a = recommender.generate_recommendations(user="cccccc")
a = recommender.generate_recommendations(user="aaaaaa")
print(a)