python: 模型的統計信息

In [57]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.graphics.mosaicplot import mosaic

%pylab inline
mpl.rcParams['font.sans-serif'] = ['SimHei']  # 指定默認字體
mpl.rcParams['axes.unicode_minus'] = False  # 解決保存圖像是負號 '-' 顯示爲方塊的問題
Populating the interactive namespace from numpy and matplotlib

載入和分析數據

In [4]:
def readData(path):
    """
    使用pandas讀取數據
    """
    data = pd.read_csv(path)
    cols = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label"]
    return data[cols]

data = readData('dataset/adult.data')
In [5]:
data.hist(rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6)   # 各個變量的直方圖
plt.show()
In [6]:
data.head()
Out[6]:
age education_num capital_gain capital_loss hours_per_week label
0 39 13 2174 0 40 <=50K
1 50 13 0 0 13 <=50K
2 38 9 0 0 40 <=50K
3 53 7 0 0 40 <=50K
4 28 13 0 0 40 <=50K
In [8]:
data['label_code'] = pd.Categorical(data.label).codes   # 將文字變量轉化爲數字變量
In [10]:
data[['label', 'label_code']].head(8)
Out[10]:
label label_code
0 <=50K 0
1 <=50K 0
2 <=50K 0
3 <=50K 0
4 <=50K 0
5 <=50K 0
6 <=50K 0
7 >50K 1
In [18]:
def visualData(df):
    """
    畫直方圖,直觀瞭解數據
    """
    df.hist(
        rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6, color="grey")
    plt.show(block=False)
In [19]:
df = data[["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label_code"]]
visualData(df)
In [20]:
data.describe()  # 僅僅顯示數值型變量的基本統計信息,若要顯示所有變量的統計信息只需傳入參數 `include='all'`
Out[20]:
age education_num capital_gain capital_loss hours_per_week label_code
count 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000 32561.000000
mean 38.581647 10.080679 1077.648844 87.303830 40.437456 0.240810
std 13.640433 2.572720 7385.292085 402.960219 12.347429 0.427581
min 17.000000 1.000000 0.000000 0.000000 1.000000 0.000000
25% 28.000000 9.000000 0.000000 0.000000 40.000000 0.000000
50% 37.000000 10.000000 0.000000 0.000000 40.000000 0.000000
75% 48.000000 12.000000 0.000000 0.000000 45.000000 0.000000
max 90.000000 16.000000 99999.000000 4356.000000 99.000000 1.000000

交叉報表

交叉報表用來描述兩個變量之間的關係。javascript

  • 按四分位數劃分爲 $4$ 個區間
In [23]:
qeducation_num = pd.qcut(data['education_num'], [.25, .5, .75, 1])  
qeducation_num
Out[23]:
0         (12.0, 16.0]
1         (12.0, 16.0]
2        (8.999, 10.0]
3                  NaN
4         (12.0, 16.0]
5         (12.0, 16.0]
6                  NaN
7        (8.999, 10.0]
8         (12.0, 16.0]
9         (12.0, 16.0]
10       (8.999, 10.0]
11        (12.0, 16.0]
12        (12.0, 16.0]
13        (10.0, 12.0]
14        (10.0, 12.0]
15                 NaN
16       (8.999, 10.0]
17       (8.999, 10.0]
18                 NaN
19        (12.0, 16.0]
20        (12.0, 16.0]
21       (8.999, 10.0]
22                 NaN
23                 NaN
24       (8.999, 10.0]
25        (12.0, 16.0]
26       (8.999, 10.0]
27       (8.999, 10.0]
28       (8.999, 10.0]
29       (8.999, 10.0]
             ...      
32531     (12.0, 16.0]
32532     (12.0, 16.0]
32533     (12.0, 16.0]
32534    (8.999, 10.0]
32535              NaN
32536     (12.0, 16.0]
32537    (8.999, 10.0]
32538     (12.0, 16.0]
32539     (12.0, 16.0]
32540    (8.999, 10.0]
32541    (8.999, 10.0]
32542    (8.999, 10.0]
32543     (10.0, 12.0]
32544     (12.0, 16.0]
32545     (10.0, 12.0]
32546     (10.0, 12.0]
32547    (8.999, 10.0]
32548     (12.0, 16.0]
32549    (8.999, 10.0]
32550    (8.999, 10.0]
32551              NaN
32552     (10.0, 12.0]
32553     (12.0, 16.0]
32554     (12.0, 16.0]
32555    (8.999, 10.0]
32556     (10.0, 12.0]
32557    (8.999, 10.0]
32558    (8.999, 10.0]
32559    (8.999, 10.0]
32560    (8.999, 10.0]
Name: education_num, Length: 32561, dtype: category
Categories (3, interval[float64]): [(8.999, 10.0] < (10.0, 12.0] < (12.0, 16.0]]
  • 計算 education_numlabel 的交叉報表
In [48]:
cross1 = pd.crosstab(data['label'], qeducation_num)
cross1
Out[48]:
education_num (8.999, 10.0] (10.0, 12.0] (12.0, 16.0]
label
<=50K 14730 1823 4158
>50K 3062 626 3909

cross1 圖表化css

In [65]:
props = lambda key: {"color": "0.45"} if ' >50K' in key else {"color": "#C6E2FF"}
mosaic(cross1.stack(), gap=0.05,properties=props, title='label 和 education_num 的交叉報表')
plt.show()

搭建模型,並訓練模型

statsmodels 建模時能夠使用相似文字的表達式:html

formula 定義了模型的形式, ~ 至關於等號.html5

In [67]:
def trainModel(data):
    """
    搭建邏輯迴歸模型,並訓練模型
    """
    formula = "label_code ~ age + education_num + capital_gain + capital_loss + hours_per_week"
    model = sm.Logit.from_formula(formula, data=data)
    re = model.fit()
    return re
In [70]:
def modelSummary(re):
    """
    分析邏輯迴歸模型的統計性質
    """
    # 總體統計分析結果
    print(re.summary())
    # 用f test檢驗education_num的係數是否顯著
    print("檢驗假設education_num的係數等於0:")
    print(re.f_test("education_num=0"))
    # 用f test檢驗兩個假設是否同時成立
    print("檢驗假設education_num的係數等於0.32和hours_per_week的係數等於0.04同時成立:")
    print(re.f_test("education_num=0.32, hours_per_week=0.04"))

def interpretModel(re):
    """
    理解模型結果

    參數
    ----
    re :BinaryResults,訓練好的邏輯迴歸模型
    """
    conf = re.conf_int()
    conf['OR'] = re.params
    # 計算各個變量對事件發生比的影響
    # conf裏面的三列,分別對應着估計值的下界、上界和估計值自己
    conf.columns = ['2.5%', '97.5%', 'OR']
    print("各個變量對事件發生比的影響:")
    print(np.exp(conf))
    # 計算各個變量的邊際效應
    print("各個變量的邊際效應:")
    print(re.get_margeff(at="overall").summary())


def makePrediction(re, testSet, alpha=0.5):
    """
    使用訓練好的模型對測試數據作預測
    """
    # 關閉pandas有關chain_assignment的警告
    pd.options.mode.chained_assignment = None
    # 計算事件發生的機率
    testSet["prob"] = re.predict(testSet)
    print("事件發生機率(預測機率)大於0.6的數據個數:")
    print(testSet[testSet["prob"] > 0.6].shape[0])  # 輸出值爲576
    print("事件發生機率(預測機率)大於0.5的數據個數:")
    print(testSet[testSet["prob"] > 0.5].shape[0])  # 輸出值爲834
    # 根據預測的機率,得出最終的預測
    testSet["pred"] = testSet.apply(lambda x: 1 if x["prob"] > alpha else 0, axis=1)
    return testSet


def evaluation(re):
    """
    計算預測結果的查準查全率以及f1

    參數
    ----
    re :DataFrame,預測結果,裏面包含兩列:真實值‘lable_code’、預測值‘pred’
    """
    bins = np.array([0, 0.5, 1])
    label = re["label_code"]
    pred = re["pred"]
    tp, fp, fn, tn = np.histogram2d(label, pred, bins=bins)[0].flatten()
    precision = tp / (tp + fp)  # 0.951
    recall = tp / (tp + fn)  # 0.826
    f1 = 2 * precision * recall / (precision + recall)  # 0.884
    print("查準率: %.3f, 查全率: %.3f, f1: %.3f" % (precision, recall, f1))
In [72]:
# 將數據分爲訓練集和測試集
trainSet, testSet = train_test_split(data, test_size=0.2, random_state=2310)

# 訓練模型並分析模型效果
re = trainModel(trainSet)
modelSummary(re)
interpretModel(re)
Optimization terminated successfully.
         Current function value: 0.406094
         Iterations 8
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             label_code   No. Observations:                26048
Model:                          Logit   Df Residuals:                    26042
Method:                           MLE   Df Model:                            5
Date:                Wed, 22 Aug 2018   Pseudo R-squ.:                  0.2639
Time:                        19:29:31   Log-Likelihood:                -10578.
converged:                       True   LL-Null:                       -14370.
                                        LLR p-value:                     0.000
==================================================================================
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -8.2970      0.128    -64.623      0.000      -8.549      -8.045
age                0.0435      0.001     31.726      0.000       0.041       0.046
education_num      0.3215      0.008     42.231      0.000       0.307       0.336
capital_gain       0.0003   1.07e-05     29.650      0.000       0.000       0.000
capital_loss       0.0007   3.64e-05     20.055      0.000       0.001       0.001
hours_per_week     0.0399      0.001     26.995      0.000       0.037       0.043
==================================================================================
檢驗假設education_num的係數等於0:
<F test: F=array([[1783.4276255]]), p=0.0, df_denom=26042, df_num=1>
檢驗假設education_num的係數等於0.32和hours_per_week的係數等於0.04同時成立:
<F test: F=array([[0.01940236]]), p=0.9807846677772952, df_denom=26042, df_num=2>
各個變量對事件發生比的影響:
                    2.5%     97.5%        OR
Intercept       0.000194  0.000321  0.000249
age             1.041611  1.047218  1.044411
education_num   1.358725  1.399879  1.379149
capital_gain    1.000298  1.000340  1.000319
capital_loss    1.000659  1.000802  1.000731
hours_per_week  1.037733  1.043769  1.040746
各個變量的邊際效應:
        Logit Marginal Effects       
=====================================
Dep. Variable:             label_code
Method:                          dydx
At:                           overall
==================================================================================
                    dy/dx    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
age                0.0056      0.000     33.563      0.000       0.005       0.006
education_num      0.0413      0.001     47.313      0.000       0.040       0.043
capital_gain     4.09e-05    1.3e-06     31.500      0.000    3.84e-05    4.34e-05
capital_loss    9.372e-05   4.54e-06     20.648      0.000    8.48e-05       0.000
hours_per_week     0.0051      0.000     28.167      0.000       0.005       0.005
==================================================================================
In [73]:
re = makePrediction(re, testSet)
evaluation(re)
事件發生機率(預測機率)大於0.6的數據個數:
576
事件發生機率(預測機率)大於0.5的數據個數:
834
查準率: 0.951, 查全率: 0.826, f1: 0.884
相關文章
相關標籤/搜索