557 lines
18 KiB
Python
557 lines
18 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
# Startup banner describing what this script does.
# The literal is restored to its three text lines; the stray "|" / "||" lines
# that sat inside the triple-quoted string were file-viewer residue and would
# otherwise be printed as part of the banner.
print('''脚本说明:

基于GiveMeSomeCredit数据集,构建贷款申请评分卡并生成建模报告
''')

#导入包
|
||
|
||
import pandas
|
||
|
||
import numpy
|
||
|
||
from sklearn.tree import DecisionTreeClassifier
|
||
|
||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||
|
||
from sklearn.model_selection import GridSearchCV
|
||
|
||
from sklearn.linear_model import LogisticRegression
|
||
|
||
from sklearn.metrics import roc_curve
|
||
|
||
from jinja2 import Environment, FileSystemLoader
|
||
|
||
import time
|
||
|
||
import warnings
|
||
|
||
#忽略警告
|
||
warnings.simplefilter('ignore')
|
||
|
||
import sys
|
||
|
||
sys.path.append('..')
|
||
|
||
from utils.mysql import MySQL
|
||
|
||
from utils.pandas2chart import Pandas2chart
|
||
|
||
#本脚本中调用函数
|
||
|
||
def OptimalEncodeByWOE(dataset):
    """Bin every feature with a decision tree and encode it by weight of evidence.

    For each feature column a ``DecisionTreeClassifier`` is fitted on the
    non-missing values against the target, and the thresholds of its internal
    nodes become the cut points.  The requested number of bins starts at
    ``min(number of distinct values, 5)`` and is lowered one at a time, down
    to 2, until the WOE of the non-missing bins is monotonic.  Missing values,
    when present, form one extra bin of their own.

    Args:
        dataset: pandas.DataFrame whose first column is the target (0/1) and
            whose remaining columns are the features.

    Returns:
        A tuple ``(dataset_woe, dictionary)``.
        ``dataset_woe`` is a copy of ``dataset`` in which every encoded
        feature holds the WOE of the bin each sample falls into.
        ``dictionary`` has one row per (feature, bin) with the columns
        ``bin``, ``positives``, ``negatives``, ``samples``,
        ``proportion_positive``, ``proportion_negative``, ``woe``, ``iv`` and
        ``independent``.
    """

    dataset_woe = dataset.copy()

    # Target column name (first column)
    dependent = dataset_woe.columns[0]

    # Feature column names (all remaining columns)
    independents = dataset_woe.columns[1: ]

    # Per-feature bin tables; concatenated once at the end
    tables = []

    for independent in independents:

        print('正在就特征变量 %s 基于决策树最优分箱并证据权重编码...' % independent, end = '')

        # Split the samples by whether this feature is missing
        missing = dataset_woe[independent].isna().to_numpy()

        y = dataset_woe.loc[~missing, dependent]

        x = dataset_woe.loc[~missing, independent]

        # Try bin counts from min(distinct values, 5) down to 2.
        # A feature with fewer than two distinct values never enters this
        # loop: it is left unencoded and gets no rows in the dictionary.
        for bins in range(min(x.nunique(), 5), 1, -1):

            # Each leaf (bin) must hold at least 5% of the samples
            decision_tree = DecisionTreeClassifier(max_leaf_nodes = bins, min_samples_leaf = 0.05, class_weight = 'balanced').fit(x.to_numpy().reshape(-1, 1), y.to_numpy())

            tree = decision_tree.tree_

            # A node whose left and right children differ is an internal
            # node; its threshold is a cut point
            tangencies = sorted(tree.threshold[node] for node in range(tree.node_count) if tree.children_left[node] != tree.children_right[node])

            # Add the outer boundaries
            tangencies = [x.min() - 0.01] + tangencies + [x.max() + 0.01]

            # Left-closed bins.  The result is kept in a local variable and
            # the data set is only touched once a binning is accepted, so no
            # Interval objects are ever written into a numeric column.
            binned = pandas.cut(x = x.to_numpy(), bins = tangencies, right = False)

            # Count positive / negative samples per bin
            woe = pandas.DataFrame({
                'bin': binned.astype(object),
                'positives': (y == 1).to_numpy().astype('int64'),
                'negatives': (y == 0).to_numpy().astype('int64'),
            }).groupby(by = 'bin')[['positives', 'negatives']].sum().reset_index()

            # Missing values get a bin of their own, appended last
            if missing.any():

                target_missing = dataset_woe.loc[missing, dependent]

                woe.loc[len(woe)] = {'bin': numpy.nan, 'positives': int((target_missing == 1).sum()), 'negatives': int((target_missing == 0).sum())}

            # Samples per bin
            woe['samples'] = woe['positives'] + woe['negatives']

            # Share of all positives / all negatives that fall into each bin
            woe['proportion_positive'] = round(woe['positives'] / woe['positives'].sum(), 2)

            woe['proportion_negative'] = round(woe['negatives'] / woe['negatives'].sum(), 2)

            # Weight of evidence; 0.01 is added to both shares so that an
            # empty class in a bin does not produce log(0) or a division by zero
            woe['woe'] = round(numpy.log((woe['proportion_positive'] + 0.01) / (woe['proportion_negative'] + 0.01)), 2)

            # Information value contribution of each bin
            woe['iv'] = round((woe['proportion_positive'] - woe['proportion_negative']) * woe['woe'], 2)

            # The missing-value bin does not take part in the monotonicity test
            woe_notna = woe.loc[woe['bin'].notna()].reset_index(drop = True)

            # Monotonicity test over the whole sequence: every step must go
            # the same way.  (Checking interior points one by one would let a
            # plateau followed by a reversal, e.g. 1, 2, 2, 1, slip through.)
            steps = woe_notna['woe'].diff().dropna()

            monotonic = bool((steps >= 0).all() or (steps <= 0).all())

            if monotonic:

                # WOE of each category of the cut, in category order
                woe_of_bin = dict(zip(woe_notna['bin'], woe_notna['woe']))

                woe_by_category = numpy.array([woe_of_bin.get(category, numpy.nan) for category in binned.categories], dtype = float)

                # Build the encoded column and assign it as a whole column.
                # Calling replace(..., inplace = True) on dataset_woe[column]
                # is chained assignment and does not reach the frame when
                # pandas copy-on-write is active.
                encoded = numpy.full(len(dataset_woe), numpy.nan)

                # Code -1 marks a value that fell outside every bin
                encoded[~missing] = numpy.where(binned.codes >= 0, woe_by_category[binned.codes], numpy.nan)

                if missing.any():

                    encoded[missing] = woe.loc[woe['bin'].isna(), 'woe'].iat[0]

                dataset_woe[independent] = encoded

                woe['independent'] = independent

                tables.append(woe)

                print('已完成')
                print()

                break

    dictionary = pandas.concat(tables) if tables else pandas.DataFrame()

    return dataset_woe, dictionary

# This file is a run-once pipeline script: refuse to continue when it is
# imported instead of being executed directly.
# sys.exit() is used rather than the exit() builtin, which is installed by the
# site module for interactive use and is not guaranteed to exist.
if __name__ != '__main__':

    print('本脚本不允许被调用')
    print()

    sys.exit()

print('1、连接数据库查表并保存至数据集...', end = '')

# MySQL is a project helper; the check below treats a str result as a failed
# connection/query and anything else as the data set.
dataset = MySQL(database = 'data_analysis').query('select * from credit_dataset')

if isinstance(dataset, str):

    print('连接失败,请检查数据库连接是否正常')
    print()

    sys.exit()

print('已完成')
print()

# Target column name: the first column is the target
dependent = dataset.columns[0]

# Every target value that is present must be 0 or 1.  Missing targets are
# excluded from this check because step 2.1 drops those samples; validating
# them here would abort the script before that cleaning could run.
targets = dataset[dependent].dropna()

if not ((targets == 0) | (targets == 1)).all():

    print('第一列应为目标变量且值应为0或1,脚本终止!')
    print()

    sys.exit()

# Sample count
samples = dataset.shape[0]

# Feature column names
independents = dataset.columns[1: ]

# Feature count
variables_independent = len(independents)

print('数据集样本数为 %d 份,特征变量数为 %d 个。' % (samples, variables_independent))
print()

# Preview table: transposed because there are many columns, then renamed.
# The slice is label based, so it shows the rows labelled 1 to 4.
# NOTE(review): with a default 0-based index the row labelled 0 is not part of
# the preview - confirm that this is intended.
Pandas2chart(dataset = dataset.loc[1:4, :].T.reset_index().rename(columns = {'index': '变量名', 1: '样本1', 2: '样本2', 3: '样本3', 4: '样本4'}), type = 'table', path = './reports/scorecard_report/dataset_preview.html')

print('2、预处理')
print()

print('2.1 清洗数据...', end = '')

# Drop the samples whose target is missing, then exact duplicate rows
# (the first occurrence of each duplicate is kept)
dataset.dropna(subset = dependent, inplace = True)
dataset.drop_duplicates(inplace = True)

print('已完成')
print()

# Sample count after cleaning
samples = len(dataset)

print('处理后,数据集样本数为 %d 份。' % samples)
print()

# Steps 2.2 and 2.3 only report that they are skipped: missing values get a
# bin of their own during WOE encoding, and binning absorbs outliers
print('2.2 处理缺失值...', end = '')
print('在特征变量证据权重编码时,将对缺失值单独作为一箱,本节点略过')
print()

print('2.3 处理异常值...', end = '')
print('在特征变量证据权重编码时,可以消除异常值的影响,本节点略过')
print()

print('2.4 特征变量最优分箱并证据权重编码')
print()

dataset_woe, dictionary = OptimalEncodeByWOE(dataset)

# Chart the bins of the feature named 'Age': sample count and WOE per bin
age_rows = dictionary['independent'] == 'Age'
age_bins = dictionary.loc[age_rows, ['bin', 'samples', 'woe']]

Pandas2chart(dataset = age_bins.rename(columns = {'bin': '分箱', 'samples': '样本数', 'woe': '证据权重'}), type = 'bar+line', path = './reports/scorecard_report/dictionary.html')

print('3、选择特征变量')
print('')

print('3.1 基于信息价值选择特征变量...', end = '')

# Information value of a feature = sum of the IV of its bins
iv_by_feature = {name: dictionary.loc[dictionary['independent'] == name, 'iv'].sum() for name in independents}

# Keep the features whose IV reaches the threshold (0.1 is regarded as weak
# predictive power and 0.3 as medium; 0.1 is the threshold used here)
independents = [name for name in independents if iv_by_feature[name] >= 0.1]

print('已完成')
print()

# Feature count
variables_independent = len(independents)

print('处理后,特征变量数为 %d 个。' % variables_independent)
print()

print('3.2 基于后向淘汰条件选择特征变量(基于归回系数和方差扩大因子)')
print()

# The grid is reduced to a single point for testing.  The full grid noted in
# this file is l1_ratio in [0, 0.25, 0.5, 0.75, 1] and
# C in [0.001, 0.01, 0.1, 1.1, 10.1, 100.1, 1000.1].
parameters = {
    'l1_ratio': [0.5],
    'C': [1.1],
}

# Cross-validated grid search over an elastic-net logistic regression
model = GridSearchCV(estimator = LogisticRegression(solver = 'saga', penalty = 'elasticnet', class_weight = 'balanced'), param_grid = parameters, scoring = 'roc_auc', refit = True)

while True:

    # Remember the column order the model is fitted with: `independents` is
    # re-sorted below, and the model must later be given the same order.
    fitted_columns = list(independents)

    features = dataset_woe[fitted_columns].astype(float)

    model.fit(features.to_numpy(), dataset_woe[dependent].to_numpy())

    # Design matrix with a constant column appended last; VIF is computed for
    # the feature columns only
    design = features.assign(constant = 1).to_numpy()

    # Rebuild the report from scratch on every pass so that its rows always
    # match the features that were just fitted, however many failed before
    statistics = pandas.DataFrame({
        'independent': fitted_columns,
        'iv': [iv_by_feature[name] for name in fitted_columns],
        'coefficient': model.best_estimator_.coef_[0, :],
        'vif': [variance_inflation_factor(design, position) for position in range(len(fitted_columns))],
    })

    # Highest VIF first
    statistics.sort_values(by = 'vif', ascending = False, inplace = True)

    independents = statistics['independent'].tolist()

    # A feature is kept when its coefficient is at least 0.1 and its VIF is
    # at most 10; every other feature is a candidate for elimination
    keep = (statistics['coefficient'] >= 0.1) & (statistics['vif'] <= 10)

    obsolescence = statistics.loc[~keep, 'independent'].tolist()

    if obsolescence:

        # Eliminate one feature per pass: the candidate with the highest VIF
        independents.remove(obsolescence[0])

        print('特征变量 %s 满足淘汰条件,继续后进' % obsolescence[0])
        print('')

    else:

        print('所有特征变量不满足淘汰条件,停止后进')
        print('')

        break

# Feature count
variables_independent = len(independents)

print('处理后,特征变量数为 %d 个。' % variables_independent)
print()

# False / true positive rates, scored with the columns in the fitted order
fpr, tpr, thresholds = roc_curve(y_true = dataset_woe[dependent].to_numpy(), y_score = model.predict_proba(dataset_woe[fitted_columns].astype(float).to_numpy())[:, 1])

# KS statistic: largest gap between the two rates
ks = max(tpr - fpr)

print('基于选择后的特征变量构建逻辑回归模型,洛伦兹统计量(KS)为 %.2f 。(~0.2不建议使用,0.2~0.4模型区分能力较好,0.4~0.5良好,0.5~0.6很好,0.6~0.75非常好,0.75~ 区别能力存疑)' % ks)
print()

Pandas2chart(dataset = statistics.loc[:, ['independent', 'iv', 'vif', 'coefficient']].rename(columns = {'independent': '特征变量名', 'iv': '信息价值', 'vif': '方差扩大因子', 'coefficient': '回归系数'}), type = 'table', path = './reports/scorecard_report/statistics.html')

print('4、编制评分卡')
print('')

print('4.1 基于构建后的逻辑回归模型编制评分卡...', end = '')

# Keep only the bins of the selected features
dictionary = dictionary.loc[dictionary['independent'].isin(independents), ['independent', 'bin', 'woe']].reset_index(drop = True)

# Scoring formula: S = A + B * ln(odds).  The score is 500 at odds 1 and
# drops by 50 each time the odds double.

# Coefficient A of the scoring formula
alpha = 500

# Coefficient B of the scoring formula
beta = -50 / numpy.log(2)

# Base score: the model intercept is converted to points with the scoring
# formula and then shared among the features in proportion to their coefficients
gamma = (alpha + beta * model.best_estimator_.intercept_[0]) / statistics['coefficient'].sum()

for independent in independents:

    coefficient = statistics.loc[statistics['independent'] == independent, 'coefficient'].iat[0]

    rows = dictionary['independent'] == independent

    # Share of the base score carried by this feature
    dictionary.loc[rows, 'gamma'] = gamma * coefficient

    # Points per unit of WOE for this feature
    dictionary.loc[rows, 'beta'] = beta * coefficient

    # Points of each bin, rounded to whole points
    dictionary.loc[rows, 'score'] = (dictionary.loc[rows, 'gamma'] + dictionary.loc[rows, 'beta'] * dictionary.loc[rows, 'woe'].astype(float)).round()

    # Turn the WOE codes of this feature into points.  The result is assigned
    # back to the column: replace(..., inplace = True) on dataset_woe[column]
    # is chained assignment and leaves the frame untouched when pandas
    # copy-on-write is active.
    dataset_woe[independent] = dataset_woe[independent].replace(dictionary.loc[rows, 'woe'].to_numpy(), dictionary.loc[rows, 'score'].to_numpy())

# Total score of each sample
dataset_woe['score'] = dataset_woe[independents].astype(float).sum(axis = 'columns')

print('已完成')
print()

Pandas2chart(dataset = dictionary.loc[dictionary['independent'] == 'Age', ['bin', 'woe', 'gamma', 'beta', 'score']].rename(columns = {'bin': '分箱', 'woe': '证据权重', 'gamma': '加权基础分数', 'beta': '加权回归系数', 'score': '分数'}), type = 'table', path = './reports/scorecard_report/dictionary_score.html')

# Bin the total score on fixed, left-closed boundaries
dataset_woe['bin'] = pandas.cut(x = dataset_woe['score'].to_numpy(), bins = [0, 350, 400, 450, 500, 550, 600, 650, 1000], right = False)

# Count positives, negatives and samples per score bin.  observed = False is
# spelled out so that score bins without samples stay in the table.
score = dataset_woe.groupby(by = 'bin', observed = False).agg(
    positives = (dependent, lambda x: (x == 1).sum()),
    negatives = (dependent, lambda x: (x == 0).sum()),
    samples = (dependent, lambda x: x.count()),
)

score.reset_index(inplace = True)

# Rejection rule of each row: reject every score below the bin's right edge
score['threshold'] = score['bin'].apply(lambda x: '<{}'.format(x.right))

# Positives as a percentage of the samples in each bin
score['proportion'] = round(score['positives'] / score['samples'] * 100, 2)

# Samples in each bin as a percentage of all samples, and its running total
score['proportion_sample'] = score['samples'] / score['samples'].sum() * 100

score['accumulation_sample'] = round(score['proportion_sample'].cumsum(), 2)

# Running totals of positives and of samples
score['accumulation_positives'] = score['positives'].cumsum()

score['accumulation_samples'] = score['samples'].cumsum()

# Positives as a percentage of the samples, over all bins up to this one
score['proportion_positives'] = round(score['accumulation_positives'] / score['accumulation_samples'] * 100, 2)

# Positives in each bin as a percentage of all positives, and its running total
score['proportion_positive'] = score['positives'] / score['positives'].sum() * 100

score['accumulation_positive'] = round(score['proportion_positive'].cumsum(), 2)

# Negatives in each bin as a percentage of all negatives, and its running total
score['proportion_negative'] = score['negatives'] / score['negatives'].sum() * 100

score['accumulation_negative'] = round(score['proportion_negative'].cumsum(), 2)

# KS per bin: gap between the two cumulative percentages.
# NOTE(review): these values are on a 0-100 scale, while the KS printed in
# step 3 comes from tpr - fpr and is on a 0-1 scale - confirm which scale the
# report template expects.
score['ks'] = round(abs(score['accumulation_positive'] - score['accumulation_negative']), 2)

# KS of the scorecard
ks = score['ks'].max()

# Lift per bin; 0.01 is added to both terms to avoid a division by zero
score['lift'] = round((score['accumulation_positive'] + 0.01) / (score['accumulation_sample'] + 0.01), 2)

# Lift of the scorecard
lift = score['lift'].max()

print('基于构建后的逻辑回归模型编制评分卡,柯斯统计量(KS)为 %.2f ,提升统计量(LIFT)为 %.2f 。' % (ks, lift))
print()

Pandas2chart(dataset = score[['bin', 'ks', 'lift']].rename(columns = {'bin': '分箱', 'ks': '柯斯统计量', 'lift': '提升统计量'}), type = 'line+line', path = './reports/scorecard_report/model_evaluation.html')

Pandas2chart(dataset = score[['bin', 'threshold', 'proportion', 'accumulation_sample', 'proportion_positives', 'accumulation_positive']].rename(columns = {'bin': '分箱', 'threshold': '拒绝规则', 'proportion': '分箱逾期率', 'accumulation_sample': '拒绝率', 'proportion_positives': '拒绝逾期率', 'accumulation_positive': '累计逾期率'}), type = 'table', path = './reports/scorecard_report/business_evaluation.html')

print('4.2 生成评分卡规则文件并保存...', end = '')

# Source text of a Calculate(sample) function with one match statement per
# feature.  The generated code uses match/case, so whoever runs it needs
# Python 3.10 or later.
pieces = ['''def Calculate(sample):\n\n\tscore = 0\n\n''']

for independent in independents:

    pieces.append('\tmatch sample["{}"]:\n\n'.format(independent))

    subset = dictionary.loc[dictionary['independent'] == independent].reset_index(drop = True)

    # When the feature has a missing-value bin it is the last row.
    # pandas.isna is used because an identity test against numpy.nan depends
    # on how the value happens to be stored in the column.
    has_missing_bin = bool(pandas.isna(subset.loc[subset.index[-1], 'bin']))

    intervals = subset.iloc[: -1] if has_missing_bin else subset

    last = len(intervals) - 1

    for position, (interval, points) in enumerate(zip(intervals['bin'], intervals['score'])):

        if position == 0:

            # First bin: open to the left
            pieces.append('\t\tcase x if x < {}: score += {}\n\n'.format(interval.right, points))

        elif position == last:

            # Last interval bin: open to the right
            pieces.append('\t\tcase x if x >= {}: score += {}\n\n'.format(interval.left, points))

        else:

            pieces.append('\t\tcase x if x < {} and x >= {}: score += {}\n\n'.format(interval.right, interval.left, points))

    if has_missing_bin:

        # Missing value.  NaN is the only value that differs from itself; a
        # `case numpy.nan:` pattern is compared with == and never matches NaN,
        # so a missing value would be given no points at all.
        pieces.append('\t\tcase x if x != x: score += {}\n\n'.format(subset['score'].iat[-1]))

pieces.append('\treturn score')

calculate = ''.join(pieces)

# Write the rules to a local file.
# NOTE(review): the file name is spelled 'scorecrad'; it is left unchanged
# because whatever reads this file is outside this script.
with open('../utils/scorecrad_calculate.txt', 'w') as file:

    file.write(calculate)

print('已完成')
print()

print('5、生成贷款申请评分卡报告...', end = '')

# Load the report template from the report directory
environment = Environment(loader = FileSystemLoader('./reports/scorecard_report/'))
template = environment.get_template('template.html')

# Values handed to the template
context = {
    # Report date
    'report_date': time.strftime('%y-%m-%d', time.localtime()),
    'samples': samples,
    'variables_independent': variables_independent,
    'ks': ks,
    'lift': lift,
}

# Render
scorecard_report = template.render(context)

with open('./reports/scorecard_report/scorecard_report.html', 'w', encoding = 'utf8') as file:

    file.write(scorecard_report)

print('已完成')
print()