\[ y = \frac{1}{1 + e^{-\theta}} \]

\[ \theta = WX + B \]

\[ sigmoid(x) = \frac{1}{1 + e^{-x}} \]

\[ \frac{d}{dx}\,sigmoid(x) = \frac{d}{dx}\left(\frac{1}{1 + e^{-x}}\right) = \frac{e^{-x}}{(1 + e^{-x})^2} = \frac{1}{1 + e^{-x}} \cdot \frac{e^{-x}}{1 + e^{-x}} = sigmoid(x) \cdot \frac{1 + e^{-x} - 1}{1 + e^{-x}} = sigmoid(x)\,(1 - sigmoid(x)) \]
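A minimal numerical check of this identity (a sketch, not from the original post):

```python
import numpy as np

def sigmoid(x):
    # sigmoid(x) = 1 / (1 + e^{-x})
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(x):
    # Uses the identity derived above: sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
    s = sigmoid(x)
    return s * (1.0 - s)

# Compare against a central finite difference at a few points
x = np.array([-2.0, 0.0, 3.0])
eps = 1e-6
numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
print(np.allclose(numeric, sigmoid_grad(x)))  # True
```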
Information entropy:

\(-P\log P\) (P is a probability, so it is below 1 and its log is negative; the minus sign makes the value positive). This quantity measures information: the larger it is, the more uncertain the current situation, i.e. the less information we have.
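Summing that term over all outcomes gives the entropy of a distribution; a quick sketch (illustrative helper, not from the original):

```python
import numpy as np

def entropy(p):
    # H(P) = -sum_i p_i * log(p_i); larger values mean more uncertainty
    p = np.asarray(p, dtype=float)
    return -np.sum(p * np.log(p))

print(entropy([0.5, 0.5]))  # ~0.693: a fair coin is maximally uncertain
print(entropy([0.9, 0.1]))  # ~0.325: a heavily biased coin is far more predictable
```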
\[ loss = -\sum{\left(y_t \log y_p + (1 - y_t)\log(1 - y_p)\right)} \]
\(y_t\): the true label, one-hot encoded where necessary

\(y_p\): the predicted probability
\[ \frac{\partial\, loss}{\partial y_p} = -\frac{\partial}{\partial y_p}\sum{\left(y_t \log y_p + (1 - y_t)\log(1 - y_p)\right)} = \sum{\left(-\frac{y_t}{y_p} + \frac{1 - y_t}{1 - y_p}\right)} \]
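A direct translation of the loss and its gradient into code (a hedged sketch; the clipping epsilon is my addition to avoid log(0)):

```python
import numpy as np

def bce_loss(y_t, y_p, eps=1e-12):
    # loss = -sum(y_t * log(y_p) + (1 - y_t) * log(1 - y_p))
    y_p = np.clip(y_p, eps, 1 - eps)  # guard against log(0)
    return -np.sum(y_t * np.log(y_p) + (1 - y_t) * np.log(1 - y_p))

def bce_grad(y_t, y_p, eps=1e-12):
    # d loss / d y_p = -y_t / y_p + (1 - y_t) / (1 - y_p), element-wise
    y_p = np.clip(y_p, eps, 1 - eps)
    return -y_t / y_p + (1 - y_t) / (1 - y_p)

y_t = np.array([1.0, 0.0, 1.0])
y_p = np.array([0.9, 0.2, 0.6])
print(bce_loss(y_t, y_p))  # small loss: predictions mostly agree with labels
print(bce_grad(y_t, y_p))  # negative where y_p should rise, positive where it should fall
```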
Confusion matrix
True \ Predicted | Positive | Negative |
---|---|---|
Positive | TP | FN |
Negative | FP | TN |
Recall:

\[ recall = \frac{TP}{TP + FN} \]

Precision:

\[ precision = \frac{TP}{TP + FP} \]
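Checking the two formulas against sklearn (a small sketch with made-up labels):

```python
from sklearn.metrics import confusion_matrix, precision_score, recall_score

y_true = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]

# sklearn's convention: rows are true labels, columns are predictions
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print('recall   :', tp / (tp + fn), '==', recall_score(y_true, y_pred))
print('precision:', tp / (tp + fp), '==', precision_score(y_true, y_pred))
```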
```python
import pandas as pd
import numpy as np
import random
import math
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
```
```python
data = pd.read_csv('Acard.txt')
data.head()
```
 | obs_mth | bad_ind | uid | td_score | jxl_score | mj_score | rh_score | zzc_score | zcx_score | person_info | finance_info | credit_info | act_info |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-10-31 | 0.0 | A10000005 | 0.675349 | 0.144072 | 0.186899 | 0.483640 | 0.928328 | 0.369644 | -0.322581 | 0.023810 | 0.00 | 0.217949 |
1 | 2018-07-31 | 0.0 | A1000002 | 0.825269 | 0.398688 | 0.139396 | 0.843725 | 0.605194 | 0.406122 | -0.128677 | 0.023810 | 0.00 | 0.423077 |
2 | 2018-09-30 | 0.0 | A1000011 | 0.315406 | 0.629745 | 0.535854 | 0.197392 | 0.614416 | 0.320731 | 0.062660 | 0.023810 | 0.10 | 0.448718 |
3 | 2018-07-31 | 0.0 | A10000481 | 0.002386 | 0.609360 | 0.366081 | 0.342243 | 0.870006 | 0.288692 | 0.078853 | 0.071429 | 0.05 | 0.179487 |
4 | 2018-07-31 | 0.0 | A1000069 | 0.406310 | 0.405352 | 0.783015 | 0.563953 | 0.715454 | 0.512554 | -0.261014 | 0.023810 | 0.00 | 0.423077 |
```python
# Check the month distribution; use the last month as the out-of-time validation set
data.obs_mth.unique()
```
```
array(['2018-10-31', '2018-07-31', '2018-09-30', '2018-06-30',
       '2018-11-30'], dtype=object)
```
```python
# Split into training and out-of-time validation sets
train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
```
```python
# The full variable list: features ending in _info are borrower profiles produced by
# our own unsupervised system; features ending in _score are paid external bureau data
feature_lst = ['person_info','finance_info','credit_info','act_info','td_score','jxl_score','mj_score','rh_score']
```
```python
x = train[feature_lst]
y = train['bad_ind']
val_x = val[feature_lst]
val_y = val['bad_ind']

lr_model = LogisticRegression(C=0.1)
lr_model.fit(x, y)
```
```
E:\Anaconda3\envs\sklearn\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, max_iter=100, multi_class='warn',
                   n_jobs=None, penalty='l2', random_state=None, solver='warn',
                   tol=0.0001, verbose=0, warm_start=False)
```
The ROC curve takes FPR as the x-axis and TPR as the y-axis and traces, as the cut-off point decreases, how TPR changes with FPR.

y-axis: TPR = probability that a positive is classified correctly = TP/(TP+FN); this is exactly recall

x-axis: FPR = probability that a negative is misclassified = FP/(FP+TN)
Plotting steps (a from-scratch sketch follows these steps):

Sort the samples by the classifier's predicted score (note: the positive-class probability, not the 0/1 label) in descending order; this gives the order in which cut-off points are taken.

Take the cut-offs in order and compute TPR and FPR at each (alternatively, take only n cut-offs, at the 1/n, 2/n, 3/n, ... positions).

Connect all the (FPR, TPR) points to obtain the ROC curve.
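A from-scratch sketch of this threshold sweep (illustrative only; the sections below use sklearn's `roc_curve`):

```python
import numpy as np

def roc_points(y_true, scores):
    # Sort samples by the predicted positive-class probability, descending;
    # lowering the cut-off past each sample adds one point to the curve.
    order = np.argsort(-np.asarray(scores))
    y = np.asarray(y_true)[order]
    pos, neg = y.sum(), len(y) - y.sum()
    tpr = np.cumsum(y) / pos        # true positive rate at each cut-off
    fpr = np.cumsum(1 - y) / neg    # false positive rate at each cut-off
    return np.r_[0.0, fpr], np.r_[0.0, tpr]

fpr, tpr = roc_points([1, 0, 1, 0, 1], [0.9, 0.8, 0.7, 0.4, 0.3])
print(list(zip(fpr, tpr)))  # connect these (FPR, TPR) points to draw the ROC
```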
KS curve plotting steps:

Sort the samples by the classifier's predicted score (the positive-class probability, not the 0/1 label) in descending order; this gives the order in which cut-off points are taken.

Take the cut-offs in order and compute TPR and FPR at each (or only n cut-offs, at the 1/n, 2/n, 3/n, ... positions).

Put the cumulative sample percentage on the x-axis (up to 100%) and plot TPR and FPR as two curves against it; this is the KS curve.

The cut-off where the TPR and FPR curves are farthest apart is the best cut-off, and that maximum gap is the KS value. A KS above 0.2 is usually taken to mean the model discriminates reasonably well.
```python
# KS on the training set
y_pred = lr_model.predict_proba(x)[:, 1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(y, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)

# KS on the validation set
y_pred = lr_model.predict_proba(val_x)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)

# AUC on the training set
train_auc = auc(fpr_lr_train, tpr_lr_train)
print('train_auc:', train_auc)

# AUC on the validation set
val_auc = auc(fpr_lr, tpr_lr)
print('val_auc:', val_auc)

# ROC curves
from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
```
```
train_ks :  0.4144413866157737
val_ks :  0.3709405449809594
train_auc: 0.777913749438214
val_auc: 0.749188849417094
```
```python
# Feature screening with variance inflation factors
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = np.array(x)
for i in range(X.shape[1]):
    print(variance_inflation_factor(X, i))
```
```
1.3021397545577766
1.9579535743187253
1.289944208916368
2.9681708673324034
3.2871099722760153
3.286493284008913
3.3175087980337827
3.2910065791107597
```
The larger the variance inflation factor (VIF), the more likely the independent variables are collinear. As a rule of thumb, a VIF above 10 indicates severe multicollinearity in the regression model. By Hair's (1995) collinearity diagnostic, tolerances above 0.1 and VIFs below 10 are acceptable, indicating no collinearity problem among the independent variables.
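A small helper sketch applying that rule of thumb to the features above (the helper name and threshold are my own; `x` and `feature_lst` come from the earlier cells):

```python
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_report(X, names, threshold=10.0):
    # Flag features whose VIF exceeds the rule-of-thumb threshold;
    # tolerance is simply 1 / VIF.
    X = np.asarray(X)
    for i, name in enumerate(names):
        vif = variance_inflation_factor(X, i)
        flag = 'HIGH' if vif > threshold else 'ok'
        print('%s: VIF=%.2f, tolerance=%.2f [%s]' % (name, vif, 1.0 / vif, flag))

vif_report(np.array(x), feature_lst)
```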
```python
import lightgbm as lgb
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x, y, random_state=0, test_size=0.2)

def lgb_test(train_x, train_y, test_x, test_y):
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             objective='binary',
                             metric='auc',
                             learning_rate=0.1,
                             n_estimators=24,
                             max_depth=5,
                             num_leaves=20,
                             max_bin=45,
                             min_data_in_leaf=6,
                             bagging_fraction=0.6,
                             bagging_freq=0,
                             feature_fraction=0.8)
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc')
    return clf, clf.best_score_['valid_1']['auc']

lgb_model, lgb_auc = lgb_test(train_x, train_y, test_x, test_y)
feature_importance = pd.DataFrame({'name': lgb_model.booster_.feature_name(),
                                   'importance': lgb_model.feature_importances_}
                                  ).sort_values(by=['importance'], ascending=False)
feature_importance
```
```
[1]   training's auc: 0.759467   valid_1's auc: 0.753322
[2]   training's auc: 0.809023   valid_1's auc: 0.805658
[3]   training's auc: 0.809328   valid_1's auc: 0.803858
[4]   training's auc: 0.810298   valid_1's auc: 0.801355
[5]   training's auc: 0.814873   valid_1's auc: 0.807356
[6]   training's auc: 0.816492   valid_1's auc: 0.809279
[7]   training's auc: 0.820213   valid_1's auc: 0.809208
[8]   training's auc: 0.823931   valid_1's auc: 0.812081
[9]   training's auc: 0.82696    valid_1's auc: 0.81453
[10]  training's auc: 0.827882   valid_1's auc: 0.813428
[11]  training's auc: 0.828881   valid_1's auc: 0.814226
[12]  training's auc: 0.829577   valid_1's auc: 0.813749
[13]  training's auc: 0.830406   valid_1's auc: 0.813156
[14]  training's auc: 0.830843   valid_1's auc: 0.812973
[15]  training's auc: 0.831587   valid_1's auc: 0.813501
[16]  training's auc: 0.831898   valid_1's auc: 0.813611
[17]  training's auc: 0.833751   valid_1's auc: 0.81393
[18]  training's auc: 0.834139   valid_1's auc: 0.814532
[19]  training's auc: 0.835177   valid_1's auc: 0.815209
[20]  training's auc: 0.837368   valid_1's auc: 0.815205
[21]  training's auc: 0.837946   valid_1's auc: 0.815099
[22]  training's auc: 0.839585   valid_1's auc: 0.815602
[23]  training's auc: 0.840781   valid_1's auc: 0.816105
[24]  training's auc: 0.841174   valid_1's auc: 0.816869
```
 | name | importance |
---|---|---|
2 | credit_info | 98 |
3 | act_info | 62 |
4 | td_score | 54 |
5 | jxl_score | 50 |
7 | rh_score | 50 |
0 | person_info | 49 |
1 | finance_info | 47 |
6 | mj_score | 46 |
```python
# Keep only the four strongest features and refit
feature_lst = ['person_info','finance_info','credit_info','act_info']

# Training set
x = train[feature_lst]
y = train['bad_ind']

# Validation set
val_x = val[feature_lst]
val_y = val['bad_ind']

lr_model = LogisticRegression(C=0.1, class_weight='balanced')
lr_model.fit(x, y)

y_pred = lr_model.predict_proba(x)[:, 1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(y, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)

y_pred = lr_model.predict_proba(val_x)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)

from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
```
```
E:\Anaconda3\envs\sklearn\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

train_ks :  0.4482453222991063
val_ks :  0.4198642457760936
```
```python
# Model coefficients
print('Features:', feature_lst)
print('Coefficients:', lr_model.coef_)
print('Intercept:', lr_model.intercept_)
```
```
Features: ['person_info', 'finance_info', 'credit_info', 'act_info', 'td_score', 'jxl_score', 'mj_score', 'rh_score']
Coefficients: [[ 2.49234624  4.35064917  1.83110927 -1.63300006 -0.18766591 -0.11007296
  -0.20273074 -0.08850626]]
Intercept: [-3.53854743]
```
```python
# Generate a binned KS report on the validation set
model = lr_model
# Number of bins
bins = 20
# Predicted probabilities
Y_predict = [s[1] for s in model.predict_proba(val_x)]
# Labels
Y = val_y
# Number of samples
nrows = Y.shape[0]
# (probability, label) pairs
lis = [(Y_predict[i], Y[i]) for i in range(nrows)]
# Sort by predicted probability, descending
ks_lis = sorted(lis, key=lambda x: x[0], reverse=True)
# Samples per bin
bin_num = int(nrows / bins + 1)
# Total bad samples
bad = sum([1 for (p, y) in ks_lis if y > 0.5])
# Total good samples
good = sum([1 for (p, y) in ks_lis if y <= 0.5])
bad_cnt, good_cnt = 0, 0
KS, BAD, GOOD = [], [], []
BAD_CNT, GOOD_CNT = [], []
BAD_PCTG, BADRATE = [], []
dct_report = {}
for j in range(bins):
    # Samples falling into this bin
    ds = ks_lis[j * bin_num: min((j + 1) * bin_num, nrows)]
    # Bad samples in this bin
    bad1 = sum([1 for (p, y) in ds if y > 0.5])
    # Good samples in this bin
    good1 = sum([1 for (p, y) in ds if y <= 0.5])
    bad_cnt += bad1
    good_cnt += good1
    # Cumulative share of all bad samples captured so far
    bad_pctg = round(bad_cnt / sum(val_y), 3)
    # Bad rate within this bin
    badrate = round(bad1 / (bad1 + good1), 3)
    # KS at this cut-off
    ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)), 3)
    KS.append(ks)
    BAD.append(bad1)
    GOOD.append(good1)
    BAD_CNT.append(bad_cnt)
    GOOD_CNT.append(good_cnt)
    BAD_PCTG.append(bad_pctg)
    BADRATE.append(badrate)
dct_report['KS'] = KS
dct_report['BAD'] = BAD
dct_report['GOOD'] = GOOD
dct_report['BAD_CNT'] = BAD_CNT
dct_report['GOOD_CNT'] = GOOD_CNT
dct_report['BAD_PCTG'] = BAD_PCTG
dct_report['BADRATE'] = BADRATE
val_repot = pd.DataFrame(dct_report)
val_repot
```
 | KS | BAD | GOOD | BAD_CNT | GOOD_CNT | BAD_PCTG | BADRATE |
---|---|---|---|---|---|---|---|
0 | 0.164 | 69 | 730 | 69 | 730 | 0.210 | 0.086 |
1 | 0.271 | 51 | 748 | 120 | 1478 | 0.366 | 0.064 |
2 | 0.292 | 23 | 776 | 143 | 2254 | 0.436 | 0.029 |
3 | 0.356 | 37 | 762 | 180 | 3016 | 0.549 | 0.046 |
4 | 0.364 | 19 | 780 | 199 | 3796 | 0.607 | 0.024 |
5 | 0.363 | 16 | 783 | 215 | 4579 | 0.655 | 0.020 |
6 | 0.355 | 14 | 785 | 229 | 5364 | 0.698 | 0.018 |
7 | 0.363 | 19 | 780 | 248 | 6144 | 0.756 | 0.024 |
8 | 0.353 | 13 | 786 | 261 | 6930 | 0.796 | 0.016 |
9 | 0.336 | 11 | 788 | 272 | 7718 | 0.829 | 0.014 |
10 | 0.313 | 9 | 790 | 281 | 8508 | 0.857 | 0.011 |
11 | 0.277 | 5 | 794 | 286 | 9302 | 0.872 | 0.006 |
12 | 0.251 | 8 | 791 | 294 | 10093 | 0.896 | 0.010 |
13 | 0.234 | 11 | 788 | 305 | 10881 | 0.930 | 0.014 |
14 | 0.202 | 6 | 793 | 311 | 11674 | 0.948 | 0.008 |
15 | 0.173 | 7 | 792 | 318 | 12466 | 0.970 | 0.009 |
16 | 0.134 | 4 | 795 | 322 | 13261 | 0.982 | 0.005 |
17 | 0.092 | 3 | 796 | 325 | 14057 | 0.991 | 0.004 |
18 | 0.045 | 1 | 798 | 326 | 14855 | 0.994 | 0.001 |
19 | 0.000 | 2 | 792 | 328 | 15647 | 1.000 | 0.003 |
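For reference, a more pandas-idiomatic sketch of the same binning report (my own rewrite, not from the original; bin edges may differ slightly from the loop above, which places `int(nrows/bins + 1)` samples per bin):

```python
# Bin validation samples into 20 equal-size groups by predicted probability
rpt = pd.DataFrame({'p': Y_predict, 'y': list(Y)})
rpt['bin'] = pd.qcut(rpt['p'].rank(method='first'), 20, labels=False)
grp = rpt.groupby('bin').agg(BAD=('y', 'sum'), TOTAL=('y', 'count'))
grp = grp.sort_index(ascending=False).reset_index(drop=True)  # riskiest bin first
grp['GOOD'] = grp['TOTAL'] - grp['BAD']
# KS at each cut-off: |cumulative bad share - cumulative good share|
grp['KS'] = (grp['BAD'].cumsum() / grp['BAD'].sum()
             - grp['GOOD'].cumsum() / grp['GOOD'].sum()).abs().round(3)
print(grp)
```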
```python
from pyecharts.charts import *
from pyecharts import options as opts
from pylab import *

mpl.rcParams['font.sans-serif'] = ['SimHei']
np.set_printoptions(suppress=True)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)

line = (
    Line()
    .add_xaxis(list(val_repot.index))
    .add_yaxis(
        "bad rate per bin",
        list(val_repot.BADRATE),
        yaxis_index=0,
        color="red",
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Behavior scorecard model performance"),
    )
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="cumulative bad share",
            type_="value",
            min_=0,
            max_=0.5,
            position="right",
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(color="red")
            ),
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )
    )
    .add_xaxis(list(val_repot.index))
    .add_yaxis(
        "KS",
        list(val_repot['KS']),
        yaxis_index=1,
        color="blue",
        label_opts=opts.LabelOpts(is_show=False),
    )
)
line.render_notebook()
```
```python
# ['person_info','finance_info','credit_info','act_info']
# Turn the model output into a score
def score(person_info, finance_info, credit_info, act_info):
    xbeta = person_info * 3.49460978 \
          + finance_info * 11.40051582 \
          + credit_info * 2.45541981 \
          + act_info * (-1.68676079) \
          + 0.34484897  # intercept term
    score = 650 - 34 * xbeta / math.log(2)
    return score

val['score'] = val.apply(lambda x: score(x.person_info, x.finance_info,
                                         x.credit_info, x.act_info), axis=1)

fpr_lr, tpr_lr, _ = roc_curve(val_y, val['score'])
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)
print(val['score'].head())

# Map scores to rating bands
def level(score):
    level = 0
    if score <= 600:
        level = "D"
    elif 600 < score <= 640:
        level = "C"
    elif 640 < score <= 680:
        level = "B"
    elif score > 680:
        level = "A"
    return level

val['level'] = val.score.map(lambda x: level(x))
val.level.groupby(val.level).count() / len(val)
```
```
val_ks :  0.4198642457760936
0    514.314551
1    636.487029
2    643.092121
3    668.413494
4    636.487029
Name: score, dtype: float64

level
A    0.144351
B    0.240188
C    0.391299
D    0.224163
Name: level, dtype: float64
```
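The constants in `score()` are standard scorecard scaling: score = offset − factor × log-odds, with factor = PDO / ln 2, where PDO is the number of points by which the score moves per doubling of the odds; here the offset is 650 and PDO is 34. A generic hedged sketch (parameter names are illustrative, not from the original):

```python
import math

def scale_score(xbeta, base_score=650.0, pdo=34.0):
    # xbeta is the model's log-odds output; each increase of xbeta by ln(2)
    # doubles the odds of being bad and lowers the score by pdo points.
    factor = pdo / math.log(2)
    return base_score - factor * xbeta

print(scale_score(0.0))  # 650.0 at even odds
```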
```python
import seaborn as sns
sns.distplot(val.score, kde=True)

val = val.sort_values('score', ascending=True).reset_index(drop=True)
df2 = val.bad_ind.groupby(val['level']).sum()
df3 = val.bad_ind.groupby(val['level']).count()
print(df2 / df3)  # bad rate per rating band
```
```
E:\Anaconda3\envs\sklearn\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

level
A    0.002168
B    0.008079
C    0.014878
D    0.055571
Name: bad_ind, dtype: float64
```