Add directories and files

This commit is contained in:
marslbr 2025-10-29 12:54:12 +08:00
parent af2aab0262
commit 286d818f34
11 changed files with 7194 additions and 0 deletions

.idea/misc.xml Normal file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 virtualenv at ~/private" project-jdk-type="Python SDK" />
</project>

API/main.py Normal file

@@ -0,0 +1,516 @@
# -*- coding: utf-8 -*-
"""基于FastAPI和Uvicorn的RESTfulAPI服务"""
from contextlib import asynccontextmanager
from datetime import date, datetime
from typing import AsyncGenerator, Literal, Optional, Union
from urllib.parse import quote_plus
import pytz
from distance import levenshtein
from fastapi import Depends, FastAPI, HTTPException, Header, Request as HTTPRequest, status
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from lunarcalendar import Lunar
from pydantic import BaseModel, Field, field_validator
from sqlalchemy import create_engine, select, text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.ext.automap import automap_base
# -------------------------
# Application initialization
# -------------------------
# Create the FastAPI application
application = FastAPI(
title="api.liubiren.cloud",
description="刘弼仁工作室的请求服务中心如需使用请联系liubiren@qq.com",
version="0.0.1",
swagger_ui_init_oauth={
"clientId": "api-docs-client",
"scopes": {},
"usePkceWithAuthorizationCodeGrant": True,
},
redoc_url=None,
)
# Global security scheme (every request requires authentication)
application.openapi_security = [{"BearerAuth": []}]
# -------------------------
# Request and response models
# -------------------------
class Request(BaseModel):
"""统一请求模型"""
    service: Literal["divination", "query_institution", "query_drug"] = Field(
        ...,
        description="服务标识,数据类型为枚举,必填",
        json_schema_extra={
            "枚举描述": {
                "divination": "小六壬速断",
                "query_institution": "根据名称精准查询医药机构信息",
                "query_drug": "根据类型和名称模糊查询药品信息",
            }
        },
    )
data: Union["DivinationRequest", "QueryInstitutionRequest", "QueryDrugRequest"] = (
Field(
...,
description="请求数据模型,根据服务标识传入相应的请求数据模型",
)
)
    # Validate that the data model matches the service tag
    # noinspection PyNestedDecorators
@field_validator("data")
@classmethod
def validate_data(cls, value, values):
service = values.data.get("service")
if service == "divination" and not isinstance(value, DivinationRequest):
raise ValueError("小六壬速断服务需要 DivinationRequest 请求数据模型")
if service == "query_institution" and not isinstance(
value, QueryInstitutionRequest
):
raise ValueError(
"根据名称精准查询医药机构信息服务需要 QueryInstitutionRequest 请求数据模型"
)
if service == "query_drug" and not isinstance(value, QueryDrugRequest):
raise ValueError(
"根据类型和名称模糊查询药品信息服务需要 QueryDrugRequest 请求数据模型"
)
return value
class Config:
json_schema_extra = {
"example": {
"service": "query_institution",
"data": {"name": "浙江大学医学院附属第一医院"},
}
}
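As an aside, Pydantic v2 can express this service/data coupling declaratively through a tagged union instead of a manual validator; a minimal sketch with hypothetical stand-in models (the tag then lives inside each data model rather than beside it):

from typing import Literal, Union
from pydantic import BaseModel, Field

class DivinationData(BaseModel):
    service: Literal["divination"]

class InstitutionData(BaseModel):
    service: Literal["query_institution"]
    name: str

class TaggedRequest(BaseModel):
    # Pydantic dispatches on the "service" field and validates the matching model
    data: Union[DivinationData, InstitutionData] = Field(discriminator="service")

TaggedRequest.model_validate({"data": {"service": "query_institution", "name": "浙江大学医学院附属第一医院"}})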
class Response(BaseModel):
"""统一响应模型"""
code: int = Field(
default=0, description="错误码0表示成功其它表示发生错误或异常"
)
message: str = Field(
default="成功",
description="错误描述",
)
    data: Optional[
        Union["DivinationResponse", "QueryInstitutionResponse", "QueryDrugResponse"]
    ] = Field(default=None, description="响应数据模型")
# noinspection PyUnusedLocal
@application.exception_handler(RequestValidationError)
async def validation_exception_handler(request: HTTPRequest, error: RequestValidationError):
return JSONResponse(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
content=Response(
code=422,
message="校验模型失败",
).model_dump(),
)
# noinspection PyUnusedLocal
@application.exception_handler(HTTPException)
async def http_exception_handler(request: HTTPRequest, exception: HTTPException):
return JSONResponse(
status_code=exception.status_code,
content=Response(
code=exception.status_code,
message="请求发生异常",
).model_dump(),
)
# noinspection PyUnusedLocal
@application.exception_handler(Exception)
async def general_exception_handler(request: HTTPRequest, exception: Exception):
return JSONResponse(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
content=Response(
code=500,
message="服务内部发生异常",
).model_dump(),
)
class DivinationRequest(BaseModel):
"""小六壬速断的请求数据模型"""
pass
class DivinationResponse(BaseModel):
"""小六壬速断的响应数据模型"""
fallen_palace: str = Field(
..., description="落宫,数据类型为字符串,非空", examples=["小吉"]
)
divination_verse: str = Field(
...,
description="卦辞,数据类型为字符串,非空",
examples=["小吉最吉昌,路上好商量,阴人来报喜,失物在坤方"],
)
class QueryInstitutionRequest(BaseModel):
"""根据名称精准查询医药机构信息的请求数据模型"""
name: str = Field(
...,
max_length=255,
description="医药机构名称,数据类型为字符串,非空",
examples=["浙江大学医学院附属第一医院"],
)
# noinspection PyNestedDecorators
@field_validator("name")
@classmethod
def validate_name(cls, value: str) -> str:
"""删除名称前后空格"""
return value.strip()
class QueryInstitutionResponse(BaseModel):
"""根据名称精准查询医药机构信息的响应数据模型"""
name: str = Field(
...,
description="机构名称,数据类型为字符串,非空",
examples=["浙江大学医学院附属第一医院"],
)
province: str = Field(
...,
description="机构所在省,数据类型为字符串,非空",
examples=["浙江省"],
)
city: str = Field(
...,
description="机构所在地,数据类型为字符串,非空",
examples=["杭州市"],
)
type: str = Field(
...,
description="机构类型,数据类型为字符串,非空",
examples=["医疗机构"],
)
incurred: str = Field(
...,
description="是否为医保定点机构,数据类型为字符串,非空",
examples=[""],
)
level: Optional[str] = Field(
None,
description="机构等级,数据类型为字符串,可空",
examples=["三级甲等"],
)
attribute: Optional[str] = Field(
None,
description="机构属性,数据类型为字符串,可空",
examples=["公立医院、非营利性医院"],
)
class QueryDrugRequest(BaseModel):
"""根据类型和名称模糊查询药品信息的请求数据模型"""
type: Literal["西药", "中草药", "中成药"] = Field(
...,
description="药品类型,数据类型为枚举,非空",
examples=["西药"],
)
name: str = Field(
...,
max_length=255,
description="药品名称,数据类型为字符串,非空",
examples=["[达悦宁]盐酸二甲双胍缓释片 0.5*30"],
)
# noinspection PyNestedDecorators
@field_validator("name")
@classmethod
def validate_name(cls, value: str) -> str:
"""删除名称前后空格"""
return value.strip()
class QueryDrugResponse(BaseModel):
"""根据类型和名称模糊查询药品信息的响应数据模型"""
name: str = Field(
...,
description="药品名称,数据类型为字符串,非空",
examples=["盐酸二甲双胍缓释片"],
)
# -------------------------
# Dependencies and utilities
# -------------------------
async def authenticate_headers(
credentials: HTTPAuthorizationCredentials = Depends(HTTPBearer()),
content_type: str = Header(
default="application/json; charset=utf-8",
description="媒体类型",
),
) -> bool:
"""校验请求头中Content-Type和Authorization Bearer token"""
if "application/json" not in content_type:
raise HTTPException(status_code=415, detail="只接受JSON格式数据")
if not credentials or credentials.credentials != "779E0501265CDF7B8124EB87199994B8":
raise HTTPException(status_code=403, detail="认证失败")
return True
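A minimal client sketch against this dependency, assuming the service listens locally on port 8000 (httpx is an assumption, not a dependency of this file):

import httpx

response = httpx.post(
    "http://127.0.0.1:8000/",
    headers={
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": "Bearer 779E0501265CDF7B8124EB87199994B8",
    },
    json={"service": "query_institution", "data": {"name": "浙江大学医学院附属第一医院"}},
)
print(response.json())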
# Create the async MySQL engine (uses the "database" schema by default)
engine = create_async_engine(
url=f"mysql+asyncmy://root:{quote_plus('Te198752')}@cdb-7z9lzx4y.cd.tencentcdb.com:10039/database?charset=utf8",
    pool_size=10,  # resident connections in the pool
    max_overflow=10,  # maximum overflow connections
    pool_recycle=3600,  # recycle connections after this many seconds
    pool_pre_ping=True,  # validate connections before use
)
# Build the ORM classes by reflecting the existing schema (automap)
Base = automap_base()
Base.prepare(
autoload_with=create_engine(
f"mysql+pymysql://root:{quote_plus('Te198752')}@cdb-7z9lzx4y.cd.tencentcdb.com:10039/database?charset=utf8"
)
)
# Initialize the async session factory
AsyncSessionLocal = async_sessionmaker(bind=engine, expire_on_commit=False)
@asynccontextmanager
async def create_session() -> AsyncGenerator[AsyncSession, None]:
"""数据库会话上下文管理器"""
async with AsyncSessionLocal() as session:
try:
yield session
await session.commit()
except:
await session.rollback()
raise
finally:
await session.close()
# -------------------------
# Service routes
# -------------------------
@application.post(
path="/",
dependencies=[Depends(authenticate_headers)],
response_model=Response,
response_description="响应成功",
responses={
200: {
"model": Response,
"content": {
"application/json": {
"example": {
"code": 200,
"message": "医药机构信息不存在",
"data": None,
}
}
},
"description": "不存在",
},
422: {
"model": Response,
"content": {
"application/json": {
"example": {"code": 422, "message": "校验模型失败", "data": None}
}
},
"description": "校验模型失败",
},
500: {
"model": Response,
"content": {
"application/json": {
"example": {
"code": 500,
"message": "服务内部发生异常",
"data": None,
}
}
},
"description": "服务内部发生异常",
},
},
name="服务中心",
description="所有请求均由本中心提供响应服务",
)
async def service(
request: Request,
) -> Response:
    # Dispatch to the handler matching the service tag
    # noinspection PyUnreachableCode
match request.service:
case "divination":
return await divination()
case "query_institution":
return await query_institution(request)
case "query_drug":
return await query_drug(request)
case _:
return Response(code=400, message="无效的服务标识")
async def divination() -> Response:
"""小六壬速断"""
# 起算日期时间
starting = datetime.now(tz=pytz.timezone("Asia/Shanghai"))
# 起算日期转为农历
lunar = Lunar.from_date(date(starting.year, starting.month, starting.day))
# 根据农历月日和时辰匹配落宫和卦辞
divination = [
{
"fallen_palace": "空亡",
"divination_verse": "空亡事不详,阴人少乖张,求财无利益,行人有灾殃",
},
{
"fallen_palace": "大安",
"divination_verse": "大安事事昌,求财在坤方,失物去不远,宅舍保安康",
},
{
"fallen_palace": "留连",
"divination_verse": "留连事难成,求谋日未明,官事只宜缓,去者未回程",
},
{
"fallen_palace": "速喜",
"divination_verse": "速喜喜来临,求财向南行,失物申午未,寻人路上寻",
},
{
"fallen_palace": "赤口",
"divination_verse": "赤口主口舌,是非要紧防,失物速速讨,行人有惊慌",
},
{
"fallen_palace": "小吉",
"divination_verse": "小吉最吉昌,路上好商量,阴人来报喜,失物在坤方",
},
][
(lunar.month + lunar.day + ((starting.hour + 3) // 2) % 12 + 4) % 6
    ]  # Convert the 24-hour clock to a traditional double-hour (时辰) first, then count off lunar month, day, and double-hour to find the palace
return Response(
data=DivinationResponse(
fallen_palace=divination["fallen_palace"],
divination_verse=divination["divination_verse"],
)
)
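Working the palace index by hand, assuming a hypothetical draw on lunar month 6, day 12, at 14:00:

lunar_month, lunar_day, hour = 6, 12, 14  # hypothetical values
double_hour = ((hour + 3) // 2) % 12  # 14:00 falls in slot 8
palace_index = (lunar_month + lunar_day + double_hour + 4) % 6  # 30 % 6 -> 0, i.e. 空亡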
async def query_institution(request: Request) -> Response:
"""根据名称精准查询医药机构信息"""
async with create_session() as session:
# noinspection PyTypeChecker
institution = (
await session.execute(
select(Base.classes.institution)
.join(Base.classes.institution_alias)
.where(Base.classes.institution_alias.name == request.data.name)
)
).scalar_one_or_none()
if institution is None:
return Response(code=204, message="医药机构信息不存在")
return Response(
data=QueryInstitutionResponse(
name=institution.name,
province=institution.province,
city=institution.city,
type=institution.type,
incurred=institution.incurred,
level=institution.level,
attribute=institution.attribute,
)
)
async def query_drug(request: Request) -> Response:
"""根据类型和名称模型查询药品信息"""
async with create_session() as session:
        # Recall drug names highly relevant to the query via MySQL full-text search
drugs_name = (
(
await session.execute(
text(
"""
SELECT name
FROM drug
WHERE MATCH(name) AGAINST(:name IN NATURAL LANGUAGE MODE) AND type = :type
ORDER BY
(name = :name) DESC,
MATCH(name) AGAINST(:name) DESC,
LENGTH(name) ASC
LIMIT 10
"""
).bindparams(type=request.data.type, name=request.data.name)
)
)
.scalars()
.all()
)
        if not drugs_name:
            return Response(code=204, message="药品信息不存在")
        result = None
        for drug_name in drugs_name:
            # If the query string contains a recalled drug name, take it as the result
            if drug_name in request.data.name:
                result = drug_name
                break
        # Otherwise keep the candidate with the highest normalized Levenshtein similarity (0-100)
        if result is None:
            result = max(
                drugs_name,
                key=lambda candidate: round(
                    (
                        1
                        - levenshtein(candidate, request.data.name)
                        / (len(candidate) + len(request.data.name))
                    )
                    * 100
                ),
            )
        return Response(
            data=QueryDrugResponse(
                name=result,
            )
        )
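For reference, the normalized similarity used in the fallback maps an edit distance onto a 0-100 scale; a quick check with two hypothetical drug names:

from distance import levenshtein

query, candidate = "盐酸二甲双胍缓释片", "盐酸二甲双胍片"
print(round((1 - levenshtein(query, candidate) / (len(query) + len(candidate))) * 100))  # 2 edits over 16 characters -> 88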
"""
"""

regions/main.py Normal file

@@ -0,0 +1,197 @@
# -*- coding: utf-8 -*-
'''
Script description:
    Transform the target dataset using administrative-division reference data.
Notes:
    The administrative-division dataset comes from https://lbsyun.baidu.com/faq/api?title=webapi/download
'''
import numpy
import pandas
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from utils.pandas_extension import SaveAsExcel
print('1、读取Excel并创建数据集...', end = '')
try:
    # Dataset to transform
    dataset = pandas.read_excel(io = 'dataset.xlsx', sheet_name = 'Sheet1')
    # Administrative-division reference dataset
    dataset_regions = pandas.read_excel(io = 'dataset.xlsx', sheet_name = 'Sheet2')
except:
print('读取Excel或创建数据集发生异常脚本终止')
print()
exit()
print('已完成')
print()
print('2、转化数据')
print()
print('2.1 基于行政区划数据集生成省级、地级和县级行政区字典', end = '')
# De-duplicated province-, prefecture-, and county-level names and codes
regions = dataset_regions[['省级名称', '省级编码', '地级名称', '地级编码', '县级名称', '县级编码']].drop_duplicates()
print('已完成')
print()
print('2.2 遍历并转化数据...', end = '')
dataset.replace(to_replace = {numpy.nan: pandas.NA, None: pandas.NA, '': pandas.NA}, inplace = True)
for index, row in dataset.iterrows():
province_name = row['省名称']
if province_name is not pandas.NA:
try:
            # Match the province name in the reference data and fill in 省区划编码
row['省区划编码'] = str(dataset_regions.loc[dataset_regions['省级名称'] == province_name, '省级编码'].iat[0])
except:
row['省区划编码'] = '未查询到省区划编码'
row['与区划编码对比结果'] = '待确定'
        # Cast 省编码 to a string
row['省编码'] = str(int(row['省编码']))
else:
row['省区划编码'] = '省名称为空'
row['省编码'] = '省名称为空'
row['省名称'] = '省名称为空'
row['与区划编码对比结果'] = '待确定'
city_name = row['市名称']
if row['区划类型'] != '':
if city_name is not pandas.NA:
try:
                # Match the city name in the reference data and fill in 市区划编码
row['市区划编码'] = str(dataset_regions.loc[dataset_regions['地级名称'] == city_name, '地级编码'].iat[0])
except:
row['市区划编码'] = '未查询到市区划编码'
row['与区划编码对比结果'] = '待确定'
            # Cast 市编码 to a string
row['市编码'] = str(int(row['市编码']))
else:
row['市区划编码'] = '市名称为空'
row['市编码'] = '市名称为空'
row['市名称'] = '市名称为空'
row['与区划编码对比结果'] = '待确定'
else:
row['市区划编码'] = ''
row['市编码'] = ''
row['市名称'] = ''
region_name = row['区县名称']
if row['区划类型'] == '区县':
if region_name is not pandas.NA:
try:
                # Match the county name in the reference data and fill in 区县区划编码
row['区县区划编码'] = str(regions.loc[regions['县级名称'] == region_name, '县级编码'].iat[0])
if row['省名称'] == '省名称为空' or row['市名称'] == '市名称为空':
                    # If the province or city name was missing, record the county's actual province/prefecture
row['与区划编码对比说明'] = '该区县所属{}/{}'.format(str(regions.loc[regions['县级名称'] == region_name, '省级名称'].iat[0]), str(regions.loc[regions['县级名称'] == region_name, '地级名称'].iat[0]))
except:
row['区县区划编码'] = '未查询到区县区划编码'
row['与区划编码对比结果'] = '待确定'
            # Cast 区县编码 to a string
row['区县编码'] = str(int(row['区县编码']))
else:
row['区县区划编码'] = '区县名称为空'
row['区县编码'] = '区县名称为空'
row['区县名称'] = '区县名称为空'
row['与区划编码对比结果'] = '待确定'
else:
row['区县区划编码'] = ''
row['区县编码'] = ''
row['区县名称'] = ''
dataset.iloc[index] = row
dataset.fillna(value = '', inplace = True)
print('已完成')
print()
print('正在保存为EXCEL...', end = '')
SaveAsExcel(worksheets = [('Sheet1', dataset)], save_path = 'results.xlsx')
print('已完成')
print()
'''
Change log
'''
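A vectorized alternative to the row-by-row loop above, sketched on hypothetical mini-frames that reuse the script's column names:

import pandas

dataset = pandas.DataFrame({'省名称': ['浙江省', '四川省']})
regions = pandas.DataFrame({'省级名称': ['浙江省', '四川省'], '省级编码': [330000, 510000]})
#One left join replaces the per-row dictionary lookups
result = dataset.merge(regions, how = 'left', left_on = '省名称', right_on = '省级名称')
result['省区划编码'] = result['省级编码'].astype(str)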


@@ -0,0 +1,466 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Awesome-pyecharts</title>
<script type="text/javascript" src="https://assets.pyecharts.org/assets/v5/echarts.min.js"></script>
</head>
<body >
<div id="6aa752c17a434bbea4057e902205080b" class="chart-container" style="width:500px; height:350px; "></div>
<script>
var chart_6aa752c17a434bbea4057e902205080b = echarts.init(
document.getElementById('6aa752c17a434bbea4057e902205080b'), 'white', {renderer: 'canvas'});
var option_6aa752c17a434bbea4057e902205080b = {
"animation": true,
"animationThreshold": 2000,
"animationDuration": 1000,
"animationEasing": "cubicOut",
"animationDelay": 0,
"animationDurationUpdate": 300,
"animationEasingUpdate": "cubicOut",
"animationDelayUpdate": 0,
"aria": {
"enabled": false
},
"color": [
"#165DFF",
"#14C9C9",
"#5470c6",
"#91cc75",
"#fac858",
"#ee6666",
"#73c0de",
"#3ba272",
"#fc8452",
"#9a60b4",
"#ea7ccc"
],
"series": [
{
"type": "line",
"name": "\u67ef\u65af\u7edf\u8ba1\u91cf",
"connectNulls": false,
"xAxisIndex": 0,
"yAxisIndex": 0,
"symbol": "circle",
"symbolSize": 6,
"showSymbol": true,
"smooth": true,
"clip": true,
"step": false,
"data": [
[
"[0, 350)",
28.42
],
[
"[350, 400)",
39.84
],
[
"[400, 450)",
48.37
],
[
"[450, 500)",
55.09
],
[
"[500, 550)",
52.59
],
[
"[550, 600)",
39.62
],
[
"[600, 650)",
13.59
],
[
"[650, 1000)",
0.0
]
],
"hoverAnimation": true,
"label": {
"show": true,
"position": "left",
"color": "#86909C",
"margin": 8,
"fontSize": "12px",
"fontWeight": "bold",
"fontFamily": "PingFang SC"
},
"logBase": 10,
"seriesLayoutBy": "column",
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid"
},
"areaStyle": {
"opacity": 0
},
"zlevel": 0,
"z": 0
},
{
"type": "line",
"name": "\u63d0\u5347\u7edf\u8ba1\u91cf",
"connectNulls": false,
"xAxisIndex": 0,
"yAxisIndex": 1,
"symbol": "circle",
"symbolSize": 6,
"showSymbol": true,
"smooth": true,
"clip": true,
"step": false,
"data": [
[
"[0, 350)",
7.28
],
[
"[350, 400)",
6.24
],
[
"[400, 450)",
5.07
],
[
"[450, 500)",
3.3
],
[
"[500, 550)",
2.34
],
[
"[550, 600)",
1.65
],
[
"[600, 650)",
1.15
],
[
"[650, 1000)",
1.0
]
],
"hoverAnimation": true,
"label": {
"show": true,
"position": "top",
"color": "#86909C",
"margin": 8,
"fontSize": "12px",
"fontWeight": "bold",
"fontFamily": "PingFang SC"
},
"logBase": 10,
"seriesLayoutBy": "column",
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid"
},
"areaStyle": {
"opacity": 0
},
"zlevel": 1,
"z": 0
}
],
"legend": [
{
"data": [
"\u67ef\u65af\u7edf\u8ba1\u91cf",
"\u63d0\u5347\u7edf\u8ba1\u91cf"
],
"selected": {},
"show": true,
"left": "center",
"top": "top",
"orient": "horizontal",
"align": "auto",
"padding": 5,
"itemGap": 10,
"itemWidth": 10,
"itemHeight": 10,
"inactiveColor": "#86909C",
"textStyle": {
"color": "#86909C",
"fontStyle": "normal",
"fontWeight": "normal",
"fontFamily": "PingFang SC",
"fontSize": "12px"
},
"backgroundColor": "transparent",
"borderColor": "#ccc",
"borderWidth": 0,
"borderRadius": 0,
"pageButtonItemGap": 5,
"pageButtonPosition": "end",
"pageFormatter": "{current}/{total}",
"pageIconColor": "#2f4554",
"pageIconInactiveColor": "#aaa",
"pageIconSize": 15,
"animationDurationUpdate": 800,
"selector": false,
"selectorPosition": "auto",
"selectorItemGap": 7,
"selectorButtonGap": 10
}
],
"tooltip": {
"show": false,
"trigger": "item",
"triggerOn": "mousemove|click",
"axisPointer": {
"type": "line"
},
"showContent": true,
"alwaysShowContent": false,
"showDelay": 0,
"hideDelay": 100,
"enterable": false,
"confine": false,
"appendToBody": false,
"transitionDuration": 0.4,
"textStyle": {
"fontSize": 14
},
"borderWidth": 0,
"padding": 5,
"order": "seriesAsc"
},
"xAxis": [
{
"name": "\u5206\u7bb1",
"show": true,
"scale": false,
"nameLocation": "end",
"nameGap": 15,
"nameTextStyle": {
"color": "#86909C",
"fontStyle": "normal",
"fontWeight": "bold",
"fontFamily": "PingFang SC",
"fontSize": "14px"
},
"gridIndex": 0,
"axisLine": {
"show": true,
"onZero": true,
"onZeroAxisIndex": 0,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid",
"color": "#86909C"
}
},
"axisTick": {
"show": true,
"alignWithLabel": false,
"inside": true,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid",
"color": "#86909C"
}
},
"axisLabel": {
"show": true,
"position": "inside",
"color": "#86909C",
"rotate": 30,
"margin": 8,
"fontSize": "12px",
"fontWeight": "normal",
"fontFamily": "PingFang SC"
},
"inverse": false,
"offset": 0,
"splitNumber": 5,
"minInterval": 0,
"splitLine": {
"show": false,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "dashed",
"color": "#E5E6EB"
}
},
"data": [
"[0, 350)",
"[350, 400)",
"[400, 450)",
"[450, 500)",
"[500, 550)",
"[550, 600)",
"[600, 650)",
"[650, 1000)"
]
}
],
"yAxis": [
{
"show": false,
"scale": false,
"nameLocation": "end",
"nameGap": 15,
"nameTextStyle": {
"color": "#86909C",
"fontStyle": "normal",
"fontWeight": "normal",
"fontFamily": "PingFang SC",
"fontSize": "12px"
},
"gridIndex": 0,
"axisLine": {
"show": true,
"onZero": true,
"onZeroAxisIndex": 0,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid",
"color": "#86909C"
}
},
"axisTick": {
"show": false,
"alignWithLabel": false,
"inside": true,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid",
"color": "#86909C"
}
},
"axisLabel": {
"color": "#86909C",
"fontStyle": "normal",
"fontWeight": "normal",
"fontFamily": "PingFang SC",
"fontSize": "12px"
},
"inverse": false,
"offset": 0,
"splitNumber": 5,
"minInterval": 0,
"splitLine": {
"show": false,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "dashed",
"color": "#E5E6EB"
}
}
},
{
"show": false,
"scale": false,
"nameLocation": "end",
"nameGap": 15,
"nameTextStyle": {
"color": "#86909C",
"fontStyle": "normal",
"fontWeight": "normal",
"fontFamily": "PingFang SC",
"fontSize": "12px"
},
"gridIndex": 0,
"axisLine": {
"show": true,
"onZero": true,
"onZeroAxisIndex": 0,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid",
"color": "#86909C"
}
},
"axisTick": {
"show": false,
"alignWithLabel": false,
"inside": true,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "solid",
"color": "#86909C"
}
},
"axisLabel": {
"color": "#86909C",
"fontStyle": "normal",
"fontWeight": "normal",
"fontFamily": "PingFang SC",
"fontSize": "12px"
},
"inverse": false,
"offset": 0,
"splitNumber": 5,
"minInterval": 0,
"splitLine": {
"show": false,
"lineStyle": {
"show": true,
"width": 1,
"opacity": 1,
"curveness": 0,
"type": "dashed",
"color": "#E5E6EB"
}
}
}
],
"title": [
{
"show": true,
"target": "blank",
"subtarget": "blank",
"padding": 5,
"itemGap": 10,
"textAlign": "auto",
"textVerticalAlign": "auto",
"triggerEvent": false
}
]
};
chart_6aa752c17a434bbea4057e902205080b.setOption(option_6aa752c17a434bbea4057e902205080b);
</script>
</body>
</html>

rfm/main.py Normal file

@@ -0,0 +1,398 @@
# -*- coding: utf-8 -*-
if __name__ == "__main__":
"""
基于RFM模型生成数据分析报告
"""
# 导入模块
import pandas
from datetime import datetime
from decimal import Decimal, ROUND_HALF_UP
import statistics
from jinja2 import Environment, FileSystemLoader
from utils.client import MySQLClient
from utils.pandas_extension import DrawAsHTML
    # Map an RFM encoding to a customer classification
    def map_classification(r_encoded, f_encoded, m_encoded):
        # Concatenate the one-hot R/F/M encodings and match the classification
match f"{r_encoded}{f_encoded}{m_encoded}":
case "000":
classification = "流失客户"
case "010":
classification = "一般维持客户"
case "100":
classification = "新客户"
case "110":
classification = "潜力客户"
case "001":
classification = "重要挽留客户"
case "101":
classification = "重要深耕客户"
case "011":
classification = "重要唤回客户"
case "111":
classification = "重要价值客户"
# noinspection PyUnboundLocalVariable
return classification
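For instance, above-mean recency and frequency with at-or-below-mean monetary value encodes as "110":

    # "110" -> 潜力客户
    assert map_classification(r_encoded=1, f_encoded=1, m_encoded=0) == "潜力客户"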
print("1 加载数据集...", end="")
client = MySQLClient(database="data_analysis")
dataframe = client.execute_query(
sql="select 客户ID, 交易金额, 交易日期 from rfm_dataset"
    )  # customer_id (客户ID) STRING; trade_date (交易日期) DATETIME.DATE; trade_amount (交易金额) DECIMAL
print("已完成")
print("2 预处理,删除包含缺失值的样本和重复样本...", end="")
    # Drop samples with missing values
    dataframe.dropna(inplace=True)
    # Drop duplicate samples (keep the first occurrence, reset the index)
    dataframe = dataframe.drop_duplicates(ignore_index=True, inplace=False)
    # Keep only samples with trade dates in 2012 and 2013
    dataframe = dataframe[
        dataframe["交易日期"].apply(lambda x: x.year in [2012, 2013])
    ].reset_index(
        drop=True
    )  # 交易日期 is DATETIME.DATE, not DATETIME64, so SERIES.DT.YEAR cannot be used
sample_size = Decimal(dataframe.shape[0]).quantize(Decimal("0"))
print("已完成")
print("3 构建RFM...", end="")
    # Earliest trade date in the sample
    min_trade_date = dataframe["交易日期"].min()
    # R: days from the sample's earliest trade date to the customer's latest trade (DECIMAL); F: trade count (DECIMAL); M: trade amount (DECIMAL); all oriented so larger is better
rfm = (
dataframe.groupby(by="客户ID")
.agg(
R=(
"交易日期",
lambda x: Decimal((x.max() - min_trade_date).days).quantize(
Decimal("0"), rounding=ROUND_HALF_UP
),
),
F=(
"客户ID",
lambda x: Decimal(len(x)).quantize(
Decimal("0"), rounding=ROUND_HALF_UP
),
),
M=(
"交易金额",
lambda x: sum(x, Decimal("0")).quantize(
Decimal("0.00"), rounding=ROUND_HALF_UP
                ),  # start the sum at DECIMAL("0") to keep Decimal precision
),
)
.reset_index()
)
    # Customer count
    customer_counts = Decimal(rfm.shape[0]).quantize(Decimal("0"))
    # Total trade amount
    trade_amounts = sum(rfm["M"], Decimal("0.00")).quantize(Decimal("0.00"))
print("已完成")
print("4 基于平均数将R、F和M分为低、高两个等级并组合为八种客户分类...", end="")
    # Means of R, F, and M (STATISTICS.MEAN preserves Decimal precision)
# noinspection PyUnresolvedReferences
means = {
"R": statistics.mean(rfm["R"]).quantize(
Decimal("0.00"), rounding=ROUND_HALF_UP
),
"F": statistics.mean(rfm["F"]).quantize(
Decimal("0.00"), rounding=ROUND_HALF_UP
),
"M": statistics.mean(rfm["M"]).quantize(
Decimal("0.00"), rounding=ROUND_HALF_UP
),
}
rfm = rfm.assign(
客户分类=lambda dataframe: dataframe.apply(
lambda row: map_classification(
r_encoded=0 if row["R"] <= means["R"] else 1,
f_encoded=0 if row["F"] <= means["F"] else 1,
m_encoded=0 if row["M"] <= means["M"] else 1,
),
axis="columns",
)
)
dataframe = dataframe.merge(
right=rfm[["客户ID", "客户分类"]], on="客户ID", how="left"
)
print("已完成")
print("5 生成分析报告...", end="")
    draw = DrawAsHTML()
    # Generate a data preview
    draw.table(
        dataframe=dataframe.sample(5),
        file_name="数据预览.html",
    )
    # Customer-classification dimension
    customer_types = (
        rfm.groupby(by="客户分类")  # group by customer classification
        .agg(
            R=(
                "R",
                lambda x: statistics.mean(x).quantize(
                    Decimal("0.00"), rounding=ROUND_HALF_UP
                ),
            ),  # mean R
            F=(
                "F",
                lambda x: statistics.mean(x).quantize(
                    Decimal("0.00"), rounding=ROUND_HALF_UP
                ),
            ),  # mean F
            M=(
                "M",
                lambda x: statistics.mean(x).quantize(
                    Decimal("0.00"), rounding=ROUND_HALF_UP
                ),
            ),  # mean M
            客户占比=(
                "客户分类",
                lambda x: (Decimal(len(x)) / customer_counts * Decimal("100")).quantize(
                    Decimal("0.00"), rounding=ROUND_HALF_UP
                ),
            ),  # share of customers per classification
            交易金额占比=(
                "M",
                lambda x: Decimal(
                    sum(x, Decimal("0.00")) / trade_amounts * Decimal("100")
                ).quantize(Decimal("0.00"), rounding=ROUND_HALF_UP),
            ),  # share of trade amount per classification
        )
        .reset_index()
    )
    # Plot the customer-classification distribution
    draw.scatter(
        dataframe=customer_types[["客户分类", "R", "F", "M"]],
        xaxis_opts_min=475,
        xaxis_opts_max=750,
        file_name="客户分类分布.html",
    )
    # Plot the customer share
    draw.pie(
        dataframe=customer_types[["客户分类", "客户占比"]].sort_values(
            by="客户占比", ascending=False
        ),  # sort by customer share, descending
        file_name="客户占比.html",
    )
    # Plot the trade-amount share
    draw.pie(
        dataframe=customer_types[["客户分类", "交易金额占比"]].sort_values(
            by="交易金额占比", ascending=False
        ),  # sort by trade-amount share, descending
        file_name="交易金额占比.html",
    )
report_backward = pandas.DataFrame(
data=[], columns=["客户分类", "窗口期", "客户数"]
)
    for customer_type in customer_types["客户分类"]:
        for month in range(1, 13):
            # Windows run from 2013-01 through 2013-12
            period = f"2013-{month:02d}"
            # Window start (slide back twelve months, including the current month)
            period_start = (
                pandas.Period(value=f"2013-{month:02d}", freq="M") - 11
            ).start_time.date()
            # Window end
            period_end = pandas.Period(
                value=f"2013-{month:02d}", freq="M"
            ).end_time.date()
            # Customers of this classification within the window
            window_customers = dataframe.loc[
                (dataframe["客户分类"] == customer_type)
                & (dataframe["交易日期"] >= period_start)
                & (dataframe["交易日期"] <= period_end),
                "客户ID",
            ].nunique()
            report_backward.loc[report_backward.shape[0]] = [
                customer_type,
                period,
                window_customers,
            ]
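The window arithmetic can be sanity-checked in isolation; for the window ending 2013-06:

    window = pandas.Period(value="2013-06", freq="M")
    print((window - 11).start_time.date(), window.end_time.date())  # 2012-07-01 2013-06-30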
    # Plot the customer-count trend over the trailing twelve calendar months
    draw.area(
        dataframe=report_backward.groupby(by="窗口期", as_index=False).agg(
            客户数=("客户数", "sum")
        ),
        file_name="近十二个自然月客户数趋势.html",
        yaxis_opts_min=1350,
    )
report_backward = report_backward.loc[
report_backward["客户分类"].isin(
["新客户", "流失客户", "重要价值客户"]
        )  # consider only new, churned, and high-value customers
].assign(
总客户数=lambda x: x.groupby(by="窗口期")["客户数"].transform(
"sum"
        ),  # total customers per window, broadcast to each row
客户占比=lambda x: x.apply(
lambda y: (
Decimal(y["客户数"]) / Decimal(y["总客户数"]) * Decimal("100")
).quantize(
Decimal("0.00"), rounding=ROUND_HALF_UP
            ),  # use DECIMAL for each term to control precision
axis="columns",
),
)
    # Plot the customer-share trend over the trailing twelve calendar months (new, churned, and high-value customers only)
    draw.bar(
        dataframe=report_backward[
            ["客户分类", "窗口期", "客户占比"]
        ],  # keep only classification, window, and share
        file_name="近十二个自然月客户占比趋势.html",
        stack=True,
    )
report_forward = (
dataframe.assign(
最早交易日期=lambda dataframe: dataframe.groupby(by="客户ID")[
"交易日期"
].transform("min"),
        )  # earliest trade date per customer
.assign(
周期=lambda dataframe: (
dataframe["交易日期"].apply(lambda x: x.year)
- dataframe["最早交易日期"].apply(lambda x: x.year)
)
* 12
+ (
dataframe["交易日期"].apply(lambda x: x.month)
- dataframe["最早交易日期"].apply(lambda x: x.month)
)
        )  # months between each trade date and the customer's earliest trade date, used as the period
.assign(
周期=lambda dataframe: dataframe["周期"].apply(lambda x: f"M+{x:02d}")
        )  # format the period label
.assign(
群组=lambda dataframe: dataframe["最早交易日期"].apply(
lambda x: f"{x.year}-{x.month:02d}"
)
        )  # use the year-month of the earliest trade date as the cohort
.groupby(by=["客户分类", "群组", "周期"], as_index=False)
.agg(客户数=("客户ID", "nunique")) # COHORT-ANALYSIS群组-周期矩阵
.groupby(by=["客户分类", "周期"], as_index=False)
.agg(客户数=("客户数", "sum")) # 统计各客户分类各周期用户数
.assign(
基准客户数=lambda dataframe: dataframe.groupby(by=["客户分类"])[
"客户数"
].transform("first")
)
.assign(
留存率=lambda dataframe: dataframe.apply(
lambda x: (
Decimal(x["客户数"]) / Decimal(x["基准客户数"]) * Decimal("100")
).quantize(Decimal("0.00"), rounding=ROUND_HALF_UP),
axis="columns",
)
)
.loc[
lambda dataframe: dataframe["客户分类"].isin(
["新客户", "流失客户", "重要价值客户"]
)
& dataframe["周期"].isin(
[
"M+01",
"M+02",
"M+03",
"M+04",
"M+05",
"M+06",
"M+07",
"M+08",
"M+09",
"M+10",
"M+11",
"M+12",
]
)
]
.reset_index(drop=True)
)
    # Plot the retention-rate trend over the trailing twelve calendar months
draw.line(
dataframe=report_forward[["客户分类", "周期", "留存率"]],
file_name="近十二个自然月留存率趋势.html",
)
    # Load the report template
    template = Environment(loader=FileSystemLoader(".")).get_template("template.html")
    # Render the template
    rfm_report = template.render(
        {
            # Report date
            "report_date": datetime.now().strftime("%Y-%m-%d"),
"sample_size": sample_size,
}
)
with open("rfm_report.html", "w", encoding="utf8") as file:
file.write(rfm_report)
print("已完成")

utils/logger.py Normal file

@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
"""
基于LOGGING封装日志记录器
"""
# 加载模块
import logging
from logging.handlers import RotatingFileHandler
"""
类说明封装日志记录器
"""
class Logger(object):
    def __init__(self, log_name: str):
        # Log file name
        self.log_name = f"{log_name}.log"
        # Create the logger
        self.logger = logging.getLogger(self.log_name)
        self.logger.setLevel(logging.INFO)
        # Log message format
        self.formatter = logging.Formatter(
            "%(asctime)s - %(levelname)s - %(message)s", datefmt="%y-%m-%d %H:%M:%S"
        )
        # Console output
        self.stream_handle = logging.StreamHandler()
        self.stream_handle.setLevel("INFO")
        self.stream_handle.setFormatter(self.formatter)
        # File output (rotate at 5 MB; rotation only happens with a positive backupCount)
        self.file_handle = RotatingFileHandler(
            filename=self.log_name, maxBytes=5 * 1024 * 1024, backupCount=1, encoding="utf-8"
        )
        self.file_handle.setLevel("INFO")
        self.file_handle.setFormatter(self.formatter)
        # Attach the console and file handlers once
        if not self.logger.handlers:
            self.logger.addHandler(self.stream_handle)
            self.logger.addHandler(self.file_handle)
def log(self, message):
return self.logger.info(message)
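A minimal usage sketch (hypothetical log name):

logger = Logger(log_name="example")
logger.log("service started")  # writes to the console and to example.log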

推荐系统/main.py Normal file

@@ -0,0 +1,986 @@
# -*- coding: utf-8 -*-
'''
Script description:
    Recommender system: the recall stage uses user- and item-based collaborative filtering to generate a candidate item list; the ranking stage uses sequence modeling and multi-head attention to re-rank and produce the final recommendation list.
'''
# 导入模块
from pydantic import BaseModel, Field, model_validator
from typing import Literal
from itertools import islice
from operator import itemgetter
import time
import numpy
from collections import deque, defaultdict
import heapq
from pyLSHash import LSHash
'''
Code section
'''
# Data model: initialization arguments
class InitializationArguments(BaseModel):
    # Time window (in days)
    time_window: int = Field(default=30, ge=5, le=360)
    # Decay lambda factor
    decay_lambda: float = Field(default=0, ge=0.00, le=10)
    # Dimensionality of the user feature vector
    attributes_dimensions: int = Field(default=10, ge=2, le=200)
# Data model: user attributes
class Attributes(BaseModel):
    # Age, integer
    age: int = Field(default=..., ge=1, le=99)
    # Gender
    gender: Literal['male', 'female'] = Field(default=...)
    # City
    city: str = Field(default=...)
    # Occupation
    occupation: str = Field(default=...)
# Data model: attribute-setting record
class SettingRecord(BaseModel):
    # User ID, six-character string, required
    user: str = Field(default=..., min_length=6, max_length=6)
    # User attributes
    attributes: Attributes = Field(default=...)
# Data model: user behavior record
class BehaviorRecord(BaseModel):
    # User ID, six-character string, required
    user: str = Field(default=..., min_length=6, max_length=6)
    # Timestamp, required
    timestamp: int = Field(default=...)
    # Behavior type, restricted to the listed values
    type: Literal['rating'] = Field(default=...)
    # Item ID, six-character string, required
    item: str = Field(default=..., min_length=6, max_length=6)
    # Rating, integer between 1 and 5, or None (default)
    rating: int | None = Field(default=None, ge=1, le=5)
    # Validation rule: a rating value is required when the behavior type is 'rating'
    @model_validator(mode='after')
    def _validate_rating(self):
        if self.type == 'rating' and not self.rating:
            raise ValueError(
                'the rating parameter is required when type=rating')
        return self
# Prototype recommender
class PrototypeRecommender:
    def __init__(self, **arguments):
        # Behavior arguments (each behavior type carries a base interest score and a decay-factor weight; for explicit behaviors the base score is derived from the rating)
        self.behavior_arguments = {'rating': (None, 1)}
        # Validate the initialization arguments and convert them to a dict
        arguments = InitializationArguments(**arguments).model_dump()
        # Time window in days (slices the continuous stream into bounded segments, balancing freshness against compute cost)
        self.time_window = arguments.get('time_window')
        # Decay lambda (how fast interest scores decay within the window)
        self.decay_lambda = arguments.get('decay_lambda')
        # Dimensionality of the user feature vector
        self.attributes_dimensions = arguments.get('attributes_dimensions')
        # User behavior store (the core structure for item-based collaborative filtering)
        self.behaviors = defaultdict(lambda: {
            # Latest interest scores within the window, as {item ID: interest score}
            'scores': defaultdict(float),
            # Timestamp heap of (timestamp, item ID) tuples within the window
            'timestamps_heap': [],
            # Item history (at most 200 entries, balancing freshness against compute cost)
            'items_history': deque(maxlen=200)
        })
        # The item-item cosine similarity decomposes into a numerator and a squared-denominator part, both updated incrementally on each new or updated behavior for efficiency
        # Numerator part of the item-item cosine similarity
        self.items_similarity_numerator = defaultdict(float)
        # Squared-denominator part of the item-item cosine similarity
        self.items_similarity_denominator_square = defaultdict(float)
        # Inverted item index, as {item ID: [user IDs]}
        self.items_inversion = defaultdict(list)
        # User feature vectors (the core structure for user-based collaborative filtering; attributes are one-hot encoded)
        self.attributes = defaultdict(
            lambda: numpy.zeros(self.attributes_dimensions))
        # LSHash index over user feature vectors (8-bit hashes, 2 hash tables)
        self.attributes_indexer = LSHash(
            hash_size=8, input_dim=self.attributes_dimensions, num_hashtables=2)
    # Process a user attribute record
    def process_attribute_record(self, attribute_record: dict):
        # Validate the setting-record data model
        attribute_record = SettingRecord(**attribute_record).model_dump()
        user = attribute_record.get('user')
        for key, value in attribute_record.get('attributes').items():
            # If the attribute value is non-empty
            if value:
                # Update the user's raw attributes (one-hot encoding into self.attributes is left to a later stage of the prototype)
                self.behaviors[user].setdefault('attributes', {})[key] = value
        return True
    # Process a user behavior record
def process_behavior_record(self, behavior_record: dict):
# 校验行为记录数据模型,数据类型转为字典
behavior_record = BehaviorRecord(**behavior_record).model_dump()
# 用户标识
user = behavior_record['user']
# 时间戳
timestamp = behavior_record['timestamp']
# 行为类型
type = behavior_record['type']
# 物品标识
item = behavior_record['item']
# 评分若行为类型为评分则评分必填评分数据类型为整数指定范围1~5
rating = behavior_record['rating']
# 整理用户行为数据
self._reorganize_behaviors(user=user)
# 原兴趣分数
score_past = self.behaviors[user]['scores'][item]
# 现兴趣分数
score = self._calculate_score(
timestamp=timestamp, type=type, rating=rating)
# 若现兴趣分数大于原兴趣分数
if score > score_past:
# 更新兴趣分数列表
self.behaviors[user]['scores'][item] = score
# 更新时间戳堆
heapq.heappush(self.behaviors[user]
['timestamps_heap'], (timestamp, item))
# 更新历史物品标识列表
self.behaviors[user]['items_history'].append(item)
# 更新计算物品标识-物品标识余弦相似度的分子和分子平方部分
self._update_items_similarity_components(
user=user, item=item, score_old=score_past, score_new=score)
# 若用户标识在物品标识倒排表中索引为物品标识的用户标识列表
if user not in self.items_inversion[item]:
# 更新倒排表
self.items_inversion[item].append(user)
    # Tidy the behavior store (evict entries that have fallen outside the time window)
def _reorganize_behaviors(self, user):
# 时间戳堆
timestamps_heap = self.behaviors[user]['timestamps_heap']
# 若时间戳堆非空列表且现在距时间戳堆中第一元组的时间戳的时距大于时间窗口
while timestamps_heap and (time.time() - timestamps_heap[0][0]) / 86400 > self.time_window:
# 删除时间戳堆中第一元组并获取过期时间戳和过期物品标识
timestamp_expired, item_expired = heapq.heappop(timestamps_heap)
# 若过期物品标识的兴趣分数非空
if self.behaviors[user]['scores'][item_expired]:
# 获取过期物品标识的兴趣分数
score_expired = self.behaviors[user]['scores'][item_expired]
# 在兴趣分数列表删除索引为过期物品标识的项
del self.behaviors[user]['scores'][item_expired]
# 若过期物品标识在历史物品标识列表
if item_expired in self.behaviors[user]['items_history']:
# 在历史物品标识列表删除过期物品标识
self.behaviors[user]['items_history'].remove(item_expired)
# 更新更新计算物品标识-物品标识余弦相似度的分子和分子平方部分
self._update_items_similarity_components(
user=user, item=item_expired, score_old=score_expired, score_new=0)
# 若用户标识在物品标识倒排表索引为过期物品标识的用户标识列表
if user in self.items_inversion[item_expired]:
# 在用户标识列表删除用户标识
self.items_inversion[item_expired].remove(user)
# 若物品标识倒排表中用户标识列表为空列表
if not self.items_inversion[item_expired]:
# 在物品标识倒排表删除索引为物品标识的项
del self.items_inversion[item_expired]
    # Compute an interest score
    def _calculate_score(self, timestamp, type, rating):
        # Within the time window, interest decays exponentially with time
        # Elapsed time in days
        time_interval = (time.time() - timestamp) / 86400
        # Outside the time window the score is zero
        if time_interval > self.time_window:
            return 0
        # Base interest score and decay-factor weight
        score_base, weight = self.behavior_arguments.get(type)
        # For ratings, derive the base score
        if type == 'rating':
            # Map the rating nonlinearly into the range 0.2 to 0.8
            score_base = 0.2 + 0.6 * (1 / (1 + numpy.exp(3 - rating)))
        # Weighted decay lambda
        decay_lambda_weighted = self.decay_lambda * weight
        # Exponential decay of the interest score
        score = score_base * \
            numpy.exp(0 - decay_lambda_weighted * time_interval)
        return score
    # Update the numerator and squared-denominator parts of the item-item cosine similarity
def _update_items_similarity_components(self, user, item, score_old, score_new):
for item_history in self.behaviors[user]['items_history']:
if item_history != item:
# 构建物品标识-物品标识的有序物品标识对
pair = tuple(sorted((item_history, item)))
self.items_similarity_numerator[pair] += (
score_new - score_old) * self.behaviors[user]['scores'][item_history]
self.items_similarity_denominator_square[item] += score_new**2 - score_old**2
    # Generate the recommendation list
def generate_recommendations(self, user, k=10):
# 推荐物品标识列表
items_candidates = defaultdict(float)
# 整理用户行为数据
self._reorganize_behaviors(user=user)
# 最大候选数
maximum_candidates = 20 * k
behaviors = self.behaviors[user]
# 历史物品标识列表
items_history = behaviors['items_history']
# 基于物品协同过滤算法生成的候选物品兴趣分数权重
alpha_weight = 0.2 / \
(1 + numpy.exp(0.05 * len(items_history) - 1.2)) + 0.65
# 基于物品协同过滤算法生成候选物品标识列表
candidates_items = self._generate_items_candidates(
user=user, maximum_candidates=maximum_candidates)
# 基于用户协同过滤算法生成候选物品标识列表
candidates_users = self._generate_users_candidates(
user=user, maximum_candidates=maximum_candidates)
# 合并基于物品协同过滤算法生成的候选物品标识列表和基于用户协同过滤算法生成候的选物品标识列表
for item_candidate in candidates_items.keys() | candidates_users.keys():
items_candidates[item_candidate] = candidates_items[item_candidate] * \
alpha_weight + \
candidates_users[item_candidate] * (1 - alpha_weight)
return dict(islice(sorted(items_candidates.items(), key=itemgetter(1), reverse=True), k))
    # Generate candidate items via item-based collaborative filtering
def _generate_items_candidates(self, user, maximum_candidates):
# 召回物品标识列表
items_recall = defaultdict(lambda: {'counts': 0, 'scores': 0})
behaviors = self.behaviors[user]
# 历史物品标识列表作为启发物品标识列表
items_heuristic = behaviors['items_history']
# 先通过启发式物品标识在物品标识倒排表查询索引为启发式物品标识的启发式用户标识,再通过启发式用户标识查询历史物品标识列表作为候选物品标识
for item_heuristic in items_heuristic:
for user_heuristic in self.items_inversion[item_heuristic]:
# 若通过启发式物品标识在物品标识倒排表查询索引为启发式物品标识的启发式用户标识和用户标识不一致
if user_heuristic != user:
for item_recall in self.behaviors[user_heuristic]['items_history']:
# 若召回物品标识不在启发物品标识列表
if item_recall not in items_heuristic:
items_recall[item_recall]['counts'] += 1
# 遍历启发式物品标识列表和召回物品标识列表(召回物品标识列表不可能包含启发式物品标识),计算余弦相似度
for item_heuristic in items_heuristic:
# 在物品标识倒排表查询索引为启发式物品标识的用户标识列表
users_heuristic = self.items_inversion[item_heuristic]
for item_recall in items_recall:
# 在物品标识倒排表查询索引为召回物品标识的用户标识列表
users_recall = self.items_inversion[item_recall]
# 计算物品标识-物品标识余弦相似度时分母部分
items_similarity_denominator = numpy.sqrt(
self.items_similarity_denominator_square[item_heuristic] * self.items_similarity_denominator_square[item_recall])
# 计算物品标识-物品标识余弦相似度时分母部分不为0
if items_similarity_denominator != 0:
# 构建物品标识-物品标识的有序物品标识对
pair = tuple(sorted((item_heuristic, item_recall)))
# 余弦相似度
similarity = (
self.items_similarity_numerator[pair] / items_similarity_denominator)
else:
similarity = 0
# 在物品标识倒排表查询索引为历史物品标识的用户标识列表和在物品标识倒排表查询索引为召回物品标识的用户标识列表共同用户标识列表
users_common = list(set(users_heuristic) & set(users_recall))
# 抑制流行物品因子
popularity_suppressed = len(
users_common) / numpy.sqrt(len(users_heuristic) * len(users_recall))
items_recall[item_recall]['scores'] += behaviors['scores'][item_heuristic] * \
similarity * popularity_suppressed
# 归一化候选物品标识列表
candidates = self._normalize_candidates(
items_recall=items_recall, maximum_candidates=maximum_candidates)
return candidates
    # Generate candidate items via user-based collaborative filtering
def _generate_users_candidates(self, user, maximum_candidates):
# 召回物品标识列表
items_recall = defaultdict(lambda: {'counts': 0, 'scores': 0})
attributes = self.attributes[user]
# 若用户特征向量非初始化特征向量
if numpy.all(attributes != 0):
# 基于LSHash查询与用户特征向量相似的用户标识作为召回用户标识
for _, similarity, user_recall in self.attributes_indexer.query(query_vector=attributes, num_results=maximum_candidates, dist_func='cosine'):
behaviors_recall = self.behaviors[user_recall]
# 召回用户标识的历史物品标识作为召回物品标识
for item_recall in behaviors_recall['items_history']:
# 若召回物品标识不在历史物品标识列表
if item_recall not in self.behaviors[user]['items_history']:
items_recall[item_recall]['counts'] += 1
items_recall[item_recall]['scores'] += behaviors_recall['scores'][item_recall] * similarity
# 归一化候选物品标识列表
candidates = self._normalize_candidates(
items_recall=items_recall, maximum_candidates=maximum_candidates)
return candidates
    # Normalize the candidate item list
def _normalize_candidates(self, items_recall, maximum_candidates):
# 候选物品标识列表
candidates = defaultdict(float)
# 若召回物品标识列表非空字典
if items_recall:
# 候选物品兴趣分数
scores = [nest['scores'] for nest in items_recall.values()]
# 候选物品相似分数最小值
scores_minimum = min(scores, default=0)
# 候选物品相似分数最大值
scores_maximum = max(scores, default=0)
# 值距
scores_range = scores_maximum - scores_minimum
# 若值距不为0
if scores_range != 0:
# 兴趣分数归一化
for item_recall in items_recall:
candidates[item_recall] = (
(items_recall[item_recall]['scores'] - scores_minimum) / scores_range) * 0.6 + 0.2
else:
# 兴趣分数默认为0.8
for item_recall in items_recall:
candidates[item_recall] = 0.8
# 根据兴趣分数倒序排序并截取
candidates = dict(islice(sorted(candidates.items(), key=itemgetter(
1), reverse=True), maximum_candidates))
return candidates
if __name__ == "__main__":
    # Initialize the engine
recommender = PrototypeRecommender()
feedback_records = [
{'user': 'aaaaaa', 'item': '111111', 'type': 'rating',
'timestamp': int(time.time() - 3600), 'rating': 4},
{'user': 'aaaaaa', 'item': '333333', 'type': 'rating',
'timestamp': int(time.time() - 3200), 'rating': 4},
{'user': 'bbbbbb', 'item': '333333', 'type': 'rating',
'timestamp': int(time.time() - 3200), 'rating': 4},
{'user': 'cccccc', 'item': '111111', 'type': 'rating',
'timestamp': int(time.time() - 3200), 'rating': 5},
{'user': 'cccccc', 'item': '222222', 'type': 'rating',
'timestamp': int(time.time() - 3200), 'rating': 5},
{'user': 'cccccc', 'item': '333333', 'type': 'rating',
'timestamp': int(time.time() - 3200), 'rating': 3}
]
for feedback_record in feedback_records:
recommender.process_behavior_record(behavior_record=feedback_record)
a = recommender.generate_recommendations(user='cccccc')
print(a)
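A quick numeric check of the rating-to-score mapping in _calculate_score: a 5-star rating lands near the top of the 0.2-0.8 band.

    # rating=5 -> 0.2 + 0.6 / (1 + e**(3-5)) ≈ 0.7285
    print(0.2 + 0.6 * (1 / (1 + numpy.exp(3 - 5))))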
exit()
'''
import numpy
import pandas
import zipcodes
import re
from collections import Counter
from scipy.stats import chisquare
from scipy.stats.contingency import association
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import math
from minisom import MiniSom
from sklearn.cluster import KMeans
import warnings
#忽略警告
warnings.simplefilter('ignore')
import sys
sys.path.append('..')
from utils.pandas2chart import Pandas2chart
from utils.algorithms import OptimalClusters
#本脚本中所调用的函数
#提取性别特征时将特征值“M”映射为“male”“F”映射为“female”
def Gender(element):
match element:
case 'M':
return 'male'
case 'F':
return 'female'
case default:
return numpy.nan
#提取年龄特征时将小于18岁映射为“under18”大于等于18岁且小于等于24岁映射为“18~24”大于等于25岁且小于等于34岁映射为“25~34”大于等于35岁且小于等于44岁映射为“35~44”大于等于45岁且小于等于54岁映射为“45~54”大于54岁映射为“above54””
def Age(element):
match element:
case age if age > 54:
return 'above54'
case age if age >= 45:
return '45~54'
case age if age >= 35:
return '35~44'
case age if age >= 25:
return '25~34'
case age if age >= 18:
return '18~24'
case age if age < 18:
return 'under18'
case default:
return numpy.nan
#提取职业特征时根据映射表映射
def Occupation(element):
match element:
case 0:
return 'other'
case 1:
return 'academic/educator'
case 2:
return 'artist'
case 3:
return 'clerical/admin'
case 4:
return 'college/grad student'
case 5:
return 'customer service'
case 6:
return 'doctor/health care'
case 7:
return 'executive/managerial'
case 8:
return 'farmer'
case 9:
return 'homemaker'
case 10:
return 'k-12 student'
case 11:
return 'lawyer'
case 12:
return 'programmer'
case 13:
return 'retired'
case 14:
return 'sales/marketing'
case 15:
return 'scientist'
case 16:
return 'self-employed'
case 17:
return 'technician/engineer'
case 18:
return 'tradesman/craftsman'
case 19:
return 'unemployed'
case 20:
return 'writer'
case default:
return numpy.nan
#提取州级行政区特征时根据邮政编码模糊查询州级行政区若为空或多个则映射为“null”否则为查询结果
def State(element):
#校验邮政编码格式
if not re.match(r'^\d{4,5}$|^\d{5}-\d{4}$', element):
#若邮政编码由9位数字组成修改邮政编码格式否则为“null”
if re.match(r'^\d{9}$', element):
element = element[: 5] + '-' + element[-4: ]
else:
return numpy.nan
#根据邮政编码模糊查询并解析州级行政区
states = [element.get('state') for element in zipcodes.similar_to(element)]
#若州级行政区数为1则将查询结果定义为州级行政区否则为“null”
if len(set(states)) == 1:
return states[0]
else:
return numpy.nan
#提取最近最喜欢的电影体裁特征
def Genres(series):
#合并每位用户评价过的电影体裁并切割为单个
genres = series.str.cat(sep = '|').split('|')
#查询数量最多的电影体裁并返回
return str.lower(Counter(genres).most_common(1)[0][0])
#独热编码特征名组合器
def Combiner(feature, category):
return str(feature) + ':' + str(category)
#若本脚本被调用报错
if __name__ != '__main__':
print('本脚本不允许被调用')
print()
exit()
print('1 打开本地数据文件,读取数据集...', end = '')
try:
dataset_users = pandas.read_csv(filepath_or_buffer = './MovieLens10K/users.csv', low_memory = False)
dataset_movies = pandas.read_csv(filepath_or_buffer = './MovieLens10K/movies.csv', low_memory = False)
dataset_ratings = pandas.read_csv(filepath_or_buffer ='./MovieLens10K/ratings.csv', low_memory = False)
except:
print('读取失败,请检查数据文件是否存在或正确')
print()
exit()
print('已完成')
print()
#评分数据集根据电影标识关联电影名称和体裁
dataset_ratings = dataset_ratings.merge(right = dataset_movies[['movieId', 'title', 'genres']], how = 'left', on = 'movieId')
#统计用户数
users = dataset_users.shape[0]
print('2 构建标签体系')
print()
print('2.1 提取特征...', end = '')
dataset = pandas.DataFrame(data = dataset_users['userId'].tolist(), columns = ['userId'])
#提取性别特征
dataset['gender'] = dataset_users['gender'].map(lambda element: Gender(element))
#提取年龄特征
dataset['age'] = dataset_users['age'].map(lambda element: Age(element))
#提取职业特征
dataset['occupation'] = dataset_users['occupation'].map(lambda element: Occupation(element))
#提取州级行政区特征
#dataset['state'] = dataset_users['zip'].map(lambda element: State(element))
#就评分数据集按照userId分组统计每位用户最喜欢的体裁
pivottable_ratings = dataset_ratings.groupby(by = 'userId').agg(
#最喜欢的电影体裁
genres = pandas.NamedAgg(column = 'genres', aggfunc = Genres)
)
pivottable_ratings.reset_index(inplace = True)
#合并分箱后评分数和平均评分,另最喜欢的电影体裁
dataset = dataset.merge(right = pivottable_ratings[['userId', 'genres']], how = 'left', on = 'userId')
#删除用户ID和包含缺失值的样本
dataset.pop('userId').dropna(inplace = True)
print('已完成')
print()
#统计样本数
samples = dataset.shape[0]
#获取特征名称
independents = dataset.columns
print('特征数据集中样本数为 %d 例,特征数为 %d 个。' % (samples, len(independents)))
print()
print('2.2 检验各特征各项样本数是否符合均匀分布')
print()
for independent in independents:
#按照特征分组,统计各项样本数
pivottable = dataset.groupby(by = independent).agg(
samples = pandas.NamedAgg(column = independent, aggfunc = 'count')
)
#检验各项样本数是否均匀分布
statistic, probability = chisquare(f_obs = pivottable['samples'].to_numpy())
if probability < 0.05:
print('特征 %s 各项样本数不符合均匀分布,卡方统计量为 %.2f,概率为 %.2f' % (independent, statistic, probability))
print()
else:
print('特征 %s 各项样本数符合均匀分布,卡方统计量为 %.2f,概率为 %.2f' % (independent, statistic, probability))
print()
pivottable.reset_index(inplace = True)
#按照样本数倒序排序
pivottable.sort_values(by = 'samples', ascending = False, inplace = True)
#若项数大于指定值则将第指定值项至最后一项合并为一项指定值为6
if pivottable.shape[0] > 6:
pivottable_marging = pivottable.iloc[: 5]
#合并后的项名为others统计第指定值-1项至最后一项样本数的和
pivottable_marging.loc[pivottable_marging.shape[0]] = ['others', pivottable.iloc[5: , 1].sum()]
else:
pivottable_marging = pivottable
#生成环形图
Pandas2chart(dataset = pivottable_marging, type = 'circular', path = './reports/persona_report/circular_{}.html'.format(independent))
print('2.3 统计特征之间相关系数')
print()
#用于保存特征之间克莱姆相关系数矩阵
correlation_matrix = pandas.DataFrame(data = [], index = independents, columns = independents)
#用于保存相关特征对
correlation_pairs = []
for index, independent_index in enumerate(independents):
for column, independent_column in enumerate(independents):
#统计特征之间克莱姆相关系数
statistic = round(association(observed = pandas.crosstab(index = dataset[independent_index], columns = dataset[independent_column])), 2)
correlation_matrix.loc[independent_index, independent_column] = statistic
#获取相关特征对
if column > index and statistic >= 0.25:
correlation_pairs.append({'independent': independent_index, 'independent_correlation': independent_column})
#生成相关系数矩阵热力图
Pandas2chart(dataset = correlation_matrix, type = 'heatmap', path = './reports/persona_report/heatmap_correlation_matrix.html')
print('3、构建用户细分群体')
print()
print('3.1 独热编码特征并标准化...', end = '')
#独热编码特征,用于决策树算法模型
onehot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore', feature_name_combiner = Combiner).fit(X = dataset.to_numpy())
dataset_processing = pandas.DataFrame(data = onehot_encoder.transform(X = dataset.to_numpy()), columns = onehot_encoder.get_feature_names_out(input_features = independents)).astype(dtype = 'int')
#独热编码特征
dataset_preprocessing = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore').fit_transform(X = dataset.to_numpy())
#标准化特征
dataset_preprocessing = StandardScaler().fit_transform(X = dataset_preprocessing)
print('已完成')
print()
print('3.2 基于自我组织映射算法初步聚类...', end = '')
#定义竞争层的长度和高度(经验值)
competitive_layer_length = competitive_layer_heigth = math.ceil(2.25 * math.pow(samples, 0.25))
#创建自我组织映射算法模型
som = MiniSom(x = competitive_layer_length, y = competitive_layer_heigth, input_len = dataset_preprocessing.shape[1], sigma = math.sqrt(math.pow(competitive_layer_length, 2) + math.pow(competitive_layer_heigth, 2)), activation_distance = 'cosine', random_seed = 0)
#初始化模型
som.pca_weights_init(data = dataset_preprocessing)
#训练模型
som.train_batch(data = dataset_preprocessing, num_iteration = 10)
#获取各样本的竞争层中优胜点坐标
dataset_preprocessing = [som.winner(record) for record in dataset_preprocessing]
dataset_preprocessing = pandas.DataFrame(data = dataset_preprocessing, columns = ['axis_x', 'axis_y'])
print('已完成')
print()
print('3.3 就各样本的竞争层中优胜点坐标基于K均值算法再次聚类使用间隔统计量评估聚类效果并确定最优聚类簇数...', end = '')
#创建K均值算法模型并训练
kmeans = KMeans(n_clusters = OptimalClusters(dataset_preprocessing.to_numpy()), n_init = 'auto').fit(dataset_preprocessing.to_numpy())
dataset_processing['cluster_label'] = kmeans.labels_
print('已完成')
print()
print('3.4 基于决策树拟合聚类结果并输出聚类规则...', end = '')
print(dataset_processing['cluster_label'].max())
'''

File diff suppressed because it is too large


@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
'''
Script description:
    Parse QC-audited claim data from the traditional claims process and extract the rules each claim failed.
'''
# Import modules
import json
import pandas
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))))
from utils.pandas_extension import SaveAsExcel
print('正在打开文件并抽取数据...', end = '')
# Open the source file and read the data
dataset = pandas.read_csv(filepath_or_buffer = 'dataset.csv')
print('已完成')
print()
print('正在转换数据...', end = '')
# Transform function: extract the codes of failed rules from the rule-result payload
def transform(rules):
    result = []
    try:
        # The rule results are JSON, so parse them directly
        rules = json.loads(rules)
        # If the parsed result is a list, collect the codes of failed rules
        if isinstance(rules, list):
            # Iterate over the rule results
            for rule in rules:
                # A rule with success == false was failed
                if not rule.get('success'):
                    # Collect the failed rule's code
                    result.append(rule.get('ruleCode'))
    except Exception:
        result.append('failure')
    return ','.join(result)
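For example, a payload with one failed and one passed rule yields just the failed code (hypothetical rule codes):

print(transform('[{"ruleCode": "R001", "success": false}, {"ruleCode": "R002", "success": true}]'))  # R001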
#Clean the failed-rule field
dataset['清洗后命否规则'] = dataset['命否规则'].apply(func = lambda x: transform(x))
#Derive whether the claim was expected to enter auto-audit (no failed rules)
dataset['预期进入自动审核'] = dataset['清洗后命否规则'].apply(func = lambda x: '是' if x == '' else '否')
#Derive whether auto-audit actually passed review
dataset['自动审核通过复核'] = dataset.apply(func = lambda x: '是' if x['预期进入自动审核'] == '是' and x['审核员'] == '自动审核' else ('否' if x['预期进入自动审核'] == '是' and x['审核员'] != '自动审核' else ''), axis = 'columns')
dataset = dataset[['赔案号', '质检完成时间', '预期进入自动审核', '自动审核通过复核', '清洗后命否规则']]
#Pivot the dataset
pivot_table = dataset.groupby(by = '质检完成时间').agg(质检完成赔案数 = pandas.NamedAgg(column = '质检完成时间', aggfunc = lambda x: pandas.notna(x).sum()), 预期进入自动审核赔案数 = pandas.NamedAgg(column = '预期进入自动审核', aggfunc = lambda x: sum(x == '是')), 自动审核通过复核赔案数 = pandas.NamedAgg(column = '自动审核通过复核', aggfunc = lambda x: sum(x == '是')))
#Sort by QC completion time, descending
pivot_table.sort_values(by = '质检完成时间', ascending = False, inplace = True)
#Reset the index
pivot_table.reset_index(inplace = True)
#Compute the expected auto-audit rate
pivot_table['预期进入自动审核率'] = pivot_table.apply(lambda x: round(x['预期进入自动审核赔案数'] / x['质检完成赔案数'] * 100, 2) if x['质检完成赔案数'] != 0 else None, axis = 'columns')
#Compute the auto-audit review pass rate
pivot_table['自动审核复核通过率'] = pivot_table.apply(lambda x: round(x['自动审核通过复核赔案数'] / x['预期进入自动审核赔案数'] * 100, 2) if x['预期进入自动审核赔案数'] != 0 else None, axis = 'columns')
#Append a totals row
pivot_table.loc[pivot_table.shape[0]] = ['汇总', pivot_table['质检完成赔案数'].sum(), pivot_table['预期进入自动审核赔案数'].sum(), pivot_table['自动审核通过复核赔案数'].sum(), round(pivot_table['预期进入自动审核赔案数'].sum() / pivot_table['质检完成赔案数'].sum() * 100, 2), round(pivot_table['自动审核通过复核赔案数'].sum() / pivot_table['预期进入自动审核赔案数'].sum() * 100, 2)]
print('已完成')
print()
print('正在将数据保存为EXCEL...', end = '')
SaveAsExcel(worksheets = [['按照质检完成时间汇总', pivot_table], ['明细数据', dataset]], file_path = '普康健康_自动审核统计数据.xlsx')
print('已完成')
print()
'''
Change log
1 (241220): adjusted the data-type handling in the transformation; saved to EXCEL via the wrapped helper; reorganized the steps along extract-transform-load lines.
'''


@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
普康健康_影像处理
"""
# 加载模块
from pathlib import Path
import zipfile
from utils.pandas_extension import open_csv, save_as_workbook
if __name__ == "__main__":
print("正在本地加载待处理影像信息...", end="")
dataframe = open_csv(file_name="待处理影像信息.csv")
dataframe["处理结果"] = "待处理"
# 统计行数
rows = dataframe.shape[0]
print(f"已完成,行数为 {rows}")
print("正在处理影像件...", end="")
    for index, row in dataframe.iterrows():
        # Push month
        push_month = row["推送月份"]
        # Image file name
        image_name = row["影像件名称"]
        # Image file name for the push to 山东国网
        push_name = row["推送山东国网影像件名称"]
        # Archive name
        zip_name = row["压缩包名称"]
        # Default processing result
        result = "发生其它异常"
        # Path object for the image file
        image = Path(f"images/{push_month}/{image_name}")
        # If the image file does not exist, record the failure before skipping the row
        if not image.exists():
            dataframe.loc[index, "处理结果"] = "未找到影像件"
            continue
        # ZIP compression
        with zipfile.ZipFile(file=f"zips/{zip_name}", mode='w', compression=zipfile.ZIP_DEFLATED) as file:
            # Store the image inside the archive under its push name
            file.write(image, arcname=push_name)
        result = "处理成功"
        dataframe.loc[index, "处理结果"] = result
print("已完成")
print("正在保存为工作簿...", end="")
save_as_workbook(worksheets=[("Sheet1", dataframe)], workbook_name="results.xlsx")
print("已完成")

File diff suppressed because it is too large