557 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			557 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
# -*- coding: utf-8 -*-

#Print the script description: build a loan-application scorecard on the GiveMeSomeCredit dataset and generate a modelling report
print('''脚本说明:

基于GiveMeSomeCredit数据集,构建贷款申请评分卡并生成建模报告
''')
 | ||
| 
 | ||
| #导入包
 | ||
| 
 | ||
| import pandas
 | ||
| 
 | ||
| import numpy
 | ||
| 
 | ||
| from sklearn.tree import DecisionTreeClassifier
 | ||
| 
 | ||
| from statsmodels.stats.outliers_influence import variance_inflation_factor
 | ||
| 
 | ||
| from sklearn.model_selection import GridSearchCV
 | ||
| 
 | ||
| from sklearn.linear_model import LogisticRegression
 | ||
| 
 | ||
| from sklearn.metrics import roc_curve
 | ||
| 
 | ||
| from jinja2 import Environment, FileSystemLoader
 | ||
| 
 | ||
| import time
 | ||
| 
 | ||
| import warnings
 | ||
| 
 | ||
| #忽略警告
 | ||
| warnings.simplefilter('ignore')
 | ||
| 
 | ||
| import sys
 | ||
| 
 | ||
| sys.path.append('..')
 | ||
| 
 | ||
| from utils.mysql import MySQL
 | ||
| 
 | ||
| from utils.pandas2chart import Pandas2chart
 | ||
| 
 | ||
| #本脚本中调用函数
 | ||
| 
 | ||
def _DecisionTreeCutPoints(x, y, bins):
	"""Return the sorted split thresholds of a decision tree grown on a single feature.

	Args:
		x: pandas.Series holding the non-missing values of the feature.
		y: pandas.Series holding the target values of the same samples.
		bins: maximum number of leaves of the tree, i.e. of resulting bins.

	Returns:
		A sorted list with the threshold of every split node.
	"""

	#Every leaf has to hold at least 5% of the samples
	tree = DecisionTreeClassifier(max_leaf_nodes = bins, min_samples_leaf = 0.05, class_weight = 'balanced').fit(x.to_numpy().reshape(-1, 1), y.to_numpy()).tree_

	#A node whose left and right children differ is a split node, its threshold is a cut point
	return sorted(tree.threshold[node] for node in range(tree.node_count) if tree.children_left[node] != tree.children_right[node])

def OptimalEncodeByWOE(dataset):
	"""Bin every feature with a decision tree and encode it by weight of evidence (WOE).

	For each feature the bin count is tried from min(number of distinct values, 5)
	down to 2 and the first binning whose WOE is monotonic over the interval bins
	is used (two bins are always accepted). A feature with fewer than two distinct
	values is encoded as a single bin. Missing values always form a bin of their own.
	Progress messages are printed while the features are processed.

	Args:
		dataset: pandas.DataFrame whose first column is the target variable (1 is
			counted as positive, 0 as negative) and whose other columns are features.

	Returns:
		A tuple (dataset_woe, dictionary):
		dataset_woe is a copy of dataset in which every feature value is replaced by
		the WOE of its bin; dictionary is a pandas.DataFrame with one row per feature
		bin and the columns bin, positives, negatives, samples, proportion_positive,
		proportion_negative, woe, iv and independent (empty when there is no feature).
	"""

	dataset_woe = dataset.copy()

	#Name of the target variable
	dependent = dataset_woe.columns[0]

	#Names of the features
	independents = dataset_woe.columns[1: ]

	target = dataset_woe[dependent].to_numpy()

	#WOE tables of the features, concatenated into the dictionary at the end
	tables = []

	for independent in independents:

		print('正在就特征变量 %s 基于决策树最优分箱并证据权重编码...' % independent, end = '')

		#Split target and feature by whether the feature value is present
		observed = dataset_woe[independent].notna().to_numpy()

		y = dataset_woe.loc[observed, dependent]

		x = dataset_woe.loc[observed, independent]

		#Bin counts to try, largest first; without two distinct values no tree can be grown, so fall back to one bin
		candidates = list(range(min(x.nunique(), 5), 1, -1)) or [1]

		for bins in candidates:

			#Bin code of every sample; -1 marks samples without an interval (missing values and anything pandas.cut could not place)
			codes = numpy.full(len(dataset_woe), -1)

			intervals = []

			if len(x) > 0:

				tangencies = _DecisionTreeCutPoints(x, y, bins) if bins > 1 else []

				#Add the outer boundaries
				tangencies = [x.min() - 0.01] + tangencies + [x.max() + 0.01]

				categories = pandas.cut(x = x.to_numpy(), bins = tangencies, right = False)

				codes[observed] = categories.codes

				intervals = list(categories.categories)

			#Count positives and negatives of every non-empty interval bin, in ascending interval order
			records = []

			occupied = []

			for code, interval in enumerate(intervals):

				members = codes == code

				if members.any():

					records.append({'bin': interval, 'positives': int((target[members] == 1).sum()), 'negatives': int((target[members] == 0).sum())})

					occupied.append(code)

			#Samples without an interval form a separate bin, listed last
			missing = codes < 0

			if missing.any():

				records.append({'bin': numpy.nan, 'positives': int((target[missing] == 1).sum()), 'negatives': int((target[missing] == 0).sum())})

				occupied.append(-1)

			woe = pandas.DataFrame(records, columns = ['bin', 'positives', 'negatives'])

			woe['samples'] = woe['positives'] + woe['negatives']

			#Share of all positives / negatives that falls into the bin (rounded to two decimals before the WOE is taken)
			woe['proportion_positive'] = round(woe['positives'] / woe['positives'].sum(), 2)

			woe['proportion_negative'] = round(woe['negatives'] / woe['negatives'].sum(), 2)

			#Weight of evidence; 0.01 is added to both shares so that an empty share does not produce log(0)
			woe['woe'] = round(numpy.log((woe['proportion_positive'] + 0.01) / (woe['proportion_negative'] + 0.01)), 2)

			#Information value
			woe['iv'] = round((woe['proportion_positive'] - woe['proportion_negative']) * woe['woe'], 2)

			#Monotonicity is checked on the interval bins only, over the whole sequence
			woe_notna = woe.loc[woe['bin'].notna(), 'woe']

			if bins > 2 and not (woe_notna.is_monotonic_increasing or woe_notna.is_monotonic_decreasing):

				continue

			#Look up the WOE by bin code; the extra last slot is addressed by code -1
			lookup = numpy.full(len(intervals) + 1, numpy.nan)

			lookup[numpy.array(occupied, dtype = int)] = woe['woe'].to_numpy()

			#Assign the whole column instead of replacing in place on a column selection
			dataset_woe[independent] = lookup[codes]

			woe['independent'] = independent

			tables.append(woe)

			print('已完成')
			print()

			break

	dictionary = pandas.concat(tables) if tables else pandas.DataFrame()

	return dataset_woe, dictionary
 | ||
| 
 | ||
#Abort when this script is imported instead of being run directly
if __name__ != '__main__':

	print('本脚本不允许被调用')
	print()

	sys.exit()

print('1、连接数据库查表并保存至数据集...', end = '')

dataset = MySQL(database = 'data_analysis').query('select * from credit_dataset')

#A string result is treated as a failed query (presumably an error message returned by the MySQL helper - confirm against utils.mysql)
if isinstance(dataset, str):

	print('连接失败,请检查数据库连接是否正常')
	print()

	sys.exit()

print('已完成')
print()

#Name of the target variable, which is the first column
dependent = dataset.columns[0]

#Check that every target value that is present is 0 or 1; samples with a missing target are left for the cleaning step to drop
targets = dataset[dependent].dropna()

if not ((targets == 0) | (targets == 1)).all():

	print('第一列应为目标变量且值应为0或1,脚本终止!')
	print()

	sys.exit()

#Number of samples
samples = dataset.shape[0]

#Names of the features
independents = dataset.columns[1: ]

#Number of features
variables_independent = len(independents)

print('数据集样本数为 %d 份,特征变量数为 %d 个。' % (samples, variables_independent))
print()

#Preview the rows labelled 1 to 4; transposed because there are many variables, then renamed for display
preview = dataset.loc[1:4, :].T.reset_index().rename(columns = {'index': '变量名', 1: '样本1', 2: '样本2', 3: '样本3', 4: '样本4'})

Pandas2chart(dataset = preview, type = 'table', path = './reports/scorecard_report/dataset_preview.html')
 | ||
| 
 | ||
print('2、预处理')
print()

print('2.1 清洗数据...', end = '')

#Drop the samples whose target is missing (the label is passed as a list, which every pandas version accepts)
dataset.dropna(subset = [dependent], inplace = True)

#Drop duplicated samples, keeping the first occurrence
dataset.drop_duplicates(inplace = True)

print('已完成')
print()

#Number of samples after cleaning
samples = dataset.shape[0]

print('处理后,数据集样本数为 %d 份。' % samples)
print()

#Missing values are not imputed: the WOE encoding puts them into a bin of their own
print('2.2 处理缺失值...', end = '')

print('在特征变量证据权重编码时,将对缺失值单独作为一箱,本节点略过')
print()

#Outliers are not treated either: binning absorbs them
print('2.3 处理异常值...', end = '')

print('在特征变量证据权重编码时,可以消除异常值的影响,本节点略过')
print()

print('2.4 特征变量最优分箱并证据权重编码')
print()

dataset_woe, dictionary = OptimalEncodeByWOE(dataset)

#Chart the bins of the feature named 'Age' as an example: samples as bars, WOE as a line
age_bins = dictionary.loc[dictionary['independent'] == 'Age', ['bin', 'samples', 'woe']].rename(columns = {'bin': '分箱', 'samples': '样本数', 'woe': '证据权重'})

Pandas2chart(dataset = age_bins, type = 'bar+line', path = './reports/scorecard_report/dictionary.html')
 | ||
| 
 | ||
print('3、选择特征变量')
print('')

#Statistics report, one row per feature
statistics = pandas.DataFrame(data = independents, columns = ['independent'])

print('3.1 基于信息价值选择特征变量...', end = '')

#The information value of a feature is the sum over its bins; a feature without dictionary rows gets 0
information_values = dictionary.groupby(by = 'independent')['iv'].sum()

statistics['iv'] = statistics['independent'].map(information_values).fillna(0)

#Keep the features whose information value reaches the threshold (0.1 is low predictive power, 0.3 medium; 0.1 is used here)
statistics = statistics.loc[statistics['iv'] >= 0.1].copy()

independents = statistics['independent'].tolist()

print('已完成')
print()

#Number of features
variables_independent = len(independents)

print('处理后,特征变量数为 %d 个。' % variables_independent)
print()

print('3.2 基于后向淘汰条件选择特征变量(基于归回系数和方差扩大因子)')
print()

#Parameter grid; single test values are active, the full grid of a real run is noted next to them
parameters = {

	#Full grid: [0, 0.25, 0.5, 0.75, 1]
	'l1_ratio': [0.5],

	#Full grid: [0.001, 0.01, 0.1, 1.1, 10.1, 100.1, 1000.1]
	'C': [1.1]

}

#Grid search with cross validation over an elastic-net logistic regression
model = GridSearchCV(estimator = LogisticRegression(solver = 'saga', penalty = 'elasticnet', class_weight = 'balanced'), param_grid = parameters, scoring = 'roc_auc', refit = True)

#Backward elimination: at the top of every pass statistics holds exactly the rows of independents, in the same order
while True:

	model.fit(dataset_woe[independents].to_numpy(), dataset_woe[dependent].to_numpy())

	#Regression coefficients
	statistics['coefficient'] = model.best_estimator_.coef_[0, :]

	#Variance inflation factors; the constant column is part of the design matrix only and gets no factor of its own
	design = dataset_woe[independents].assign(constant = 1).to_numpy()

	statistics['vif'] = [variance_inflation_factor(design, i) for i in range(len(independents))]

	#Sort by variance inflation factor, largest first
	statistics = statistics.sort_values(by = 'vif', ascending = False)

	independents = statistics['independent'].tolist()

	#A feature is kept when its coefficient is at least 0.1 and its variance inflation factor at most 10
	qualified = (statistics['coefficient'] >= 0.1) & (statistics['vif'] <= 10)

	obsolescence = statistics.loc[~qualified, 'independent'].tolist()

	if not obsolescence:

		print('所有特征变量不满足淘汰条件,停止后进')
		print('')

		break

	#Eliminate only the failing feature with the largest variance inflation factor, from the list and from the report alike, so both stay aligned for the next fit
	independents.remove(obsolescence[0])

	statistics = statistics.loc[statistics['independent'] != obsolescence[0]].copy()

	print('特征变量 %s 满足淘汰条件,继续后进' % obsolescence[0])
	print('')

#Number of features
variables_independent = len(independents)

print('处理后,特征变量数为 %d 个。' % variables_independent)
print()

#False positive rate and true positive rate on the modelling data
fpr, tpr, thresholds = roc_curve(y_true = dataset_woe[dependent].to_numpy(), y_score = model.predict_proba(dataset_woe[independents].to_numpy())[:, 1])

#KS statistic: the largest gap between true positive rate and false positive rate
ks = max(tpr - fpr)

print('基于选择后的特征变量构建逻辑回归模型,洛伦兹统计量(KS)为 %.2f 。(~0.2不建议使用,0.2~0.4模型区分能力较好,0.4~0.5良好,0.5~0.6很好,0.6~0.75非常好,0.75~ 区别能力存疑)' % ks)
print()

selection = statistics.loc[:, ['independent', 'iv', 'vif', 'coefficient']].rename(columns = {'independent': '特征变量名', 'iv': '信息价值', 'vif': '方差扩大因子', 'coefficient': '回归系数'})

Pandas2chart(dataset = selection, type = 'table', path = './reports/scorecard_report/statistics.html')
 | ||
| 
 | ||
print('4、编制评分卡')
print('')

print('4.1 基于构建后的逻辑回归模型编制评分卡...', end = '')

#Keep only the bins of the selected features
dictionary = dictionary.loc[dictionary['independent'].isin(independents), ['independent', 'bin', 'woe']].reset_index(drop = True)

#Scoring formula: S = A + B * ln(odds); the score is 500 at odds 1 and 50 lower at odds 2

#Coefficient alpha of the scoring formula
alpha = 500

#Coefficient beta of the scoring formula
beta = -50 / numpy.log(2)

#Base score: the intercept of the model is converted into points and spread over the features in proportion to their coefficients
gamma = (alpha + beta * model.best_estimator_.intercept_[0]) / statistics['coefficient'].sum()

for independent in independents:

	coefficient = statistics.loc[statistics['independent'] == independent, 'coefficient'].iat[0]

	rows = dictionary['independent'] == independent

	#Weighted base score of the feature
	dictionary.loc[rows, 'gamma'] = gamma * coefficient

	#Weighted regression coefficient of the feature
	dictionary.loc[rows, 'beta'] = beta * coefficient

	#Score of every bin, rounded to whole points
	dictionary.loc[rows, 'score'] = (dictionary.loc[rows, 'gamma'] + dictionary.loc[rows, 'beta'] * dictionary.loc[rows, 'woe']).round()

	#Turn the WOE values of the feature into bin scores; the column is assigned back instead of replaced in place on a column selection
	dataset_woe[independent] = dataset_woe[independent].replace(dictionary.loc[rows, 'woe'].to_numpy(), dictionary.loc[rows, 'score'].to_numpy())

#Total score of every sample
dataset_woe['score'] = dataset_woe[independents].sum(axis = 'columns')

print('已完成')
print()

age_scores = dictionary.loc[dictionary['independent'] == 'Age', ['bin', 'woe', 'gamma', 'beta', 'score']].rename(columns = {'bin': '分箱', 'woe': '证据权重', 'gamma': '加权基础分数', 'beta': '加权回归系数', 'score': '分数'})

Pandas2chart(dataset = age_scores, type = 'table', path = './reports/scorecard_report/dictionary_score.html')

#Put the total score into left-closed score bands
dataset_woe['bin'] = pandas.cut(x = dataset_woe['score'].to_numpy(), bins = [0, 350, 400, 450, 500, 550, 600, 650, 1000], right = False)

#Count positives, negatives and samples per score band; observed = False keeps empty bands in the table
score = dataset_woe.groupby(by = 'bin', observed = False).agg(

	positives = (dependent, lambda x: (x == 1).sum()),

	negatives = (dependent, lambda x: (x == 0).sum()),

	samples = (dependent, lambda x: x.count())

)

score.reset_index(inplace = True)

#Rejection rule that corresponds to the band: reject everything below its right edge
score['threshold'] = score['bin'].apply(lambda x: '<{}'.format(x.right))

#Share of positives within the band, in percent
score['proportion'] = round(score['positives'] / score['samples'] * 100, 2)

#Share of all samples that falls into the band, in percent, and its running total
score['proportion_sample'] = score['samples'] / score['samples'].sum() * 100

score['accumulation_sample'] = round(score['proportion_sample'].cumsum(), 2)

#Running totals of positives and of samples
score['accumulation_positives'] = score['positives'].cumsum()

score['accumulation_samples'] = score['samples'].cumsum()

#Share of positives among all samples up to and including the band, in percent
score['proportion_positives'] = round(score['accumulation_positives'] / score['accumulation_samples'] * 100, 2)

#Share of all positives that falls into the band, in percent, and its running total
score['proportion_positive'] = score['positives'] / score['positives'].sum() * 100

score['accumulation_positive'] = round(score['proportion_positive'].cumsum(), 2)

#Share of all negatives that falls into the band, in percent, and its running total
score['proportion_negative'] = score['negatives'] / score['negatives'].sum() * 100

score['accumulation_negative'] = round(score['proportion_negative'].cumsum(), 2)

#KS statistic per band; both running totals are percentages, so the result is in percentage points
score['ks'] = round(abs(score['accumulation_positive'] - score['accumulation_negative']), 2)

#KS statistic of the scorecard
ks = score['ks'].max()

#Lift per band; 0.01 is added to numerator and denominator
score['lift'] = round((score['accumulation_positive'] + 0.01) / (score['accumulation_sample'] + 0.01), 2)

#Lift of the scorecard
lift = score['lift'].max()

print('基于构建后的逻辑回归模型编制评分卡,柯斯统计量(KS)为 %.2f ,提升统计量(LIFT)为 %.2f 。' % (ks, lift))
print()

Pandas2chart(dataset = score[['bin', 'ks', 'lift']].rename(columns = {'bin': '分箱', 'ks': '柯斯统计量', 'lift': '提升统计量'}), type = 'line+line', path = './reports/scorecard_report/model_evaluation.html')

Pandas2chart(dataset = score[['bin', 'threshold', 'proportion', 'accumulation_sample', 'proportion_positives', 'accumulation_positive']].rename(columns = {'bin': '分箱', 'threshold': '拒绝规则', 'proportion': '分箱逾期率', 'accumulation_sample': '拒绝率', 'proportion_positives': '拒绝逾期率', 'accumulation_positive': '累计逾期率'}), type = 'table', path = './reports/scorecard_report/business_evaluation.html')
 | ||
| 
 | ||
print('4.2 生成评分卡规则文件并保存...', end = '')

#Source text of the generated scoring function
calculate = '''def Calculate(sample):\n\n\tscore = 0\n\n'''

for independent in independents:

	calculate += '\tmatch sample["{}"]:\n\n'.format(independent)

	subset = dictionary.loc[dictionary['independent'] == independent].reset_index(drop = True)

	#The bin of missing values, if there is one, is the last row; test it by value, NaN is not reliably the numpy.nan object
	has_missing = pandas.isna(subset['bin'].iat[-1])

	intervals = list(subset.index[:-1] if has_missing else subset.index)

	for position, index in enumerate(intervals):

		interval = subset.loc[index, 'bin']

		points = subset.loc[index, 'score']

		#First interval: open to the left
		if position == 0:

			calculate += '\t\tcase x if x < {}: score += {}\n\n'.format(interval.right, points)

		#Last interval: open to the right
		elif position == len(intervals) - 1:

			calculate += '\t\tcase x if x >= {}: score += {}\n\n'.format(interval.left, points)

		else:

			calculate += '\t\tcase x if x < {} and x >= {}: score += {}\n\n'.format(interval.right, interval.left, points)

	#Missing values: NaN never compares equal, so a value pattern on numpy.nan could not match; x != x is true for NaN only
	if has_missing:

		calculate += '\t\tcase x if x != x: score += {}\n\n'.format(subset['score'].iat[-1])

calculate += '\treturn score'

#Write the scoring rules to a local file
with open('../utils/scorecrad_calculate.txt', 'w') as file:

	file.write(calculate)

print('已完成')
print()
 | ||
| 
 | ||
print('5、生成贷款申请评分卡报告...', end = '')

#Load the report template
template = Environment(loader = FileSystemLoader('./reports/scorecard_report/')).get_template('template.html')

#Values handed to the template
context = {

	#Report date in local time
	'report_date': time.strftime('%y-%m-%d', time.localtime()),

	'samples': samples,

	'variables_independent': variables_independent,

	'ks': ks,

	'lift': lift

}

#Render the template
scorecard_report = template.render(context)

with open('./reports/scorecard_report/scorecard_report.html', 'w', encoding = 'utf8') as file:

	file.write(scorecard_report)

print('已完成')
print()
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 |