In [57]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.graphics.mosaicplot import mosaic

%pylab inline
mpl.rcParams['font.sans-serif'] = ['SimHei']  # 指定默認字體
mpl.rcParams['axes.unicode_minus'] = False  # 解決保存圖像是負號 '-' 顯示爲方塊的問題

Populating the interactive namespace from numpy and matplotlib

載入和分析數據¶

In [4]:

def readData(path):
    """
    使用pandas讀取數據
    """
    data = pd.read_csv(path)
    cols = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label"]
    return data[cols]

data = readData('dataset/adult.data')

In [5]:

data.hist(rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6)   # 各個變量的直方圖
plt.show()

In [6]:

data.head()

Out[6]:

	age	education_num	capital_gain	hours_per_week	label
0	39	13	2174	40	<=50K
1	50	13	0	13	<=50K
2	38	9	0	40	<=50K
3	53	7	0	40	<=50K
4	28	13	0	40	<=50K

In [8]:

data['label_code'] = pd.Categorical(data.label).codes   # 將文字變量轉化爲數字變量

In [10]:

data[['label', 'label_code']].head(8)

Out[10]:

	label	label_code
0	<=50K	0
1	<=50K	0
2	<=50K	0
3	<=50K	0
4	<=50K	0
5	<=50K	0
6	<=50K	0
7	>50K	1

In [18]:

def visualData(df):
    """
    畫直方圖，直觀瞭解數據
    """
    df.hist(
        rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6, color="grey")
    plt.show(block=False)

In [19]:

df = data[["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label_code"]]
visualData(df)

In [20]:

data.describe()  # 僅僅顯示數值型變量的基本統計信息，若要顯示所有變量的統計信息只需傳入參數 `include='all'`

Out[20]:

	age	education_num	capital_gain	capital_loss	hours_per_week	label_code
count	32561.000000	32561.000000	32561.000000	32561.000000	32561.000000	32561.000000
mean	38.581647	10.080679	1077.648844	87.303830	40.437456	0.240810
std	13.640433	2.572720	7385.292085	402.960219	12.347429	0.427581
min	17.000000	1.000000	0.000000	0.000000	1.000000	0.000000
25%	28.000000	9.000000	0.000000	0.000000	40.000000	0.000000
50%	37.000000	10.000000	0.000000	0.000000	40.000000	0.000000
75%	48.000000	12.000000	0.000000	0.000000	45.000000	0.000000
max	90.000000	16.000000	99999.000000	4356.000000	99.000000	1.000000

交叉報表¶

交叉報表用來描述兩個變量之間的關係。javascript

按四分位數劃分爲 $4$ 個區間

In [23]:

qeducation_num = pd.qcut(data['education_num'], [.25, .5, .75, 1])  
qeducation_num

Out[23]:

0         (12.0, 16.0]
1         (12.0, 16.0]
2        (8.999, 10.0]
3                  NaN
4         (12.0, 16.0]
5         (12.0, 16.0]
6                  NaN
7        (8.999, 10.0]
8         (12.0, 16.0]
9         (12.0, 16.0]
10       (8.999, 10.0]
11        (12.0, 16.0]
12        (12.0, 16.0]
13        (10.0, 12.0]
14        (10.0, 12.0]
15                 NaN
16       (8.999, 10.0]
17       (8.999, 10.0]
18                 NaN
19        (12.0, 16.0]
20        (12.0, 16.0]
21       (8.999, 10.0]
22                 NaN
23                 NaN
24       (8.999, 10.0]
25        (12.0, 16.0]
26       (8.999, 10.0]
27       (8.999, 10.0]
28       (8.999, 10.0]
29       (8.999, 10.0]
             ...      
32531     (12.0, 16.0]
32532     (12.0, 16.0]
32533     (12.0, 16.0]
32534    (8.999, 10.0]
32535              NaN
32536     (12.0, 16.0]
32537    (8.999, 10.0]
32538     (12.0, 16.0]
32539     (12.0, 16.0]
32540    (8.999, 10.0]
32541    (8.999, 10.0]
32542    (8.999, 10.0]
32543     (10.0, 12.0]
32544     (12.0, 16.0]
32545     (10.0, 12.0]
32546     (10.0, 12.0]
32547    (8.999, 10.0]
32548     (12.0, 16.0]
32549    (8.999, 10.0]
32550    (8.999, 10.0]
32551              NaN
32552     (10.0, 12.0]
32553     (12.0, 16.0]
32554     (12.0, 16.0]
32555    (8.999, 10.0]
32556     (10.0, 12.0]
32557    (8.999, 10.0]
32558    (8.999, 10.0]
32559    (8.999, 10.0]
32560    (8.999, 10.0]
Name: education_num, Length: 32561, dtype: category
Categories (3, interval[float64]): [(8.999, 10.0] < (10.0, 12.0] < (12.0, 16.0]]

計算 education_num 和 label 的交叉報表

In [48]:

cross1 = pd.crosstab(data['label'], qeducation_num)
cross1

Out[48]:

education_num	(8.999, 10.0]	(10.0, 12.0]	(12.0, 16.0]
label
<=50K	14730	1823	4158
>50K	3062	626	3909

將 cross1 圖表化css

In [65]:

props = lambda key: {"color": "0.45"} if ' >50K' in key else {"color": "#C6E2FF"}
mosaic(cross1.stack(), gap=0.05,properties=props, title='label 和 education_num 的交叉報表')
plt.show()

搭建模型，並訓練模型¶

statsmodels 建模時能夠使用相似文字的表達式:html

formula 定義了模型的形式, ~ 至關於等號.html5

In [67]:

def trainModel(data):
    """
    搭建邏輯迴歸模型，並訓練模型
    """
    formula = "label_code ~ age + education_num + capital_gain + capital_loss + hours_per_week"
    model = sm.Logit.from_formula(formula, data=data)
    re = model.fit()
    return re

In [70]:

def modelSummary(re):
    """
    分析邏輯迴歸模型的統計性質
    """
    # 總體統計分析結果
    print(re.summary())
    # 用f test檢驗education_num的係數是否顯著
    print("檢驗假設education_num的係數等於0：")
    print(re.f_test("education_num=0"))
    # 用f test檢驗兩個假設是否同時成立
    print("檢驗假設education_num的係數等於0.32和hours_per_week的係數等於0.04同時成立：")
    print(re.f_test("education_num=0.32, hours_per_week=0.04"))

def interpretModel(re):
    """
    理解模型結果

    參數
    ----
    re ：BinaryResults，訓練好的邏輯迴歸模型
    """
    conf = re.conf_int()
    conf['OR'] = re.params
    # 計算各個變量對事件發生比的影響
    # conf裏面的三列，分別對應着估計值的下界、上界和估計值自己
    conf.columns = ['2.5%', '97.5%', 'OR']
    print("各個變量對事件發生比的影響：")
    print(np.exp(conf))
    # 計算各個變量的邊際效應
    print("各個變量的邊際效應：")
    print(re.get_margeff(at="overall").summary())


def makePrediction(re, testSet, alpha=0.5):
    """
    使用訓練好的模型對測試數據作預測
    """
    # 關閉pandas有關chain_assignment的警告
    pd.options.mode.chained_assignment = None
    # 計算事件發生的機率
    testSet["prob"] = re.predict(testSet)
    print("事件發生機率（預測機率）大於0.6的數據個數：")
    print(testSet[testSet["prob"] > 0.6].shape[0])  # 輸出值爲576
    print("事件發生機率（預測機率）大於0.5的數據個數：")
    print(testSet[testSet["prob"] > 0.5].shape[0])  # 輸出值爲834
    # 根據預測的機率，得出最終的預測
    testSet["pred"] = testSet.apply(lambda x: 1 if x["prob"] > alpha else 0, axis=1)
    return testSet


def evaluation(re):
    """
    計算預測結果的查準查全率以及f1

    參數
    ----
    re ：DataFrame，預測結果，裏面包含兩列：真實值‘lable_code’、預測值‘pred’
    """
    bins = np.array([0, 0.5, 1])
    label = re["label_code"]
    pred = re["pred"]
    tp, fp, fn, tn = np.histogram2d(label, pred, bins=bins)[0].flatten()
    precision = tp / (tp + fp)  # 0.951
    recall = tp / (tp + fn)  # 0.826
    f1 = 2 * precision * recall / (precision + recall)  # 0.884
    print("查準率: %.3f, 查全率: %.3f, f1: %.3f" % (precision, recall, f1))

In [72]:

# 將數據分爲訓練集和測試集
trainSet, testSet = train_test_split(data, test_size=0.2, random_state=2310)

# 訓練模型並分析模型效果
re = trainModel(trainSet)
modelSummary(re)
interpretModel(re)

Optimization terminated successfully.
         Current function value: 0.406094
         Iterations 8
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             label_code   No. Observations:                26048
Model:                          Logit   Df Residuals:                    26042
Method:                           MLE   Df Model:                            5
Date:                Wed, 22 Aug 2018   Pseudo R-squ.:                  0.2639
Time:                        19:29:31   Log-Likelihood:                -10578.
converged:                       True   LL-Null:                       -14370.
                                        LLR p-value:                     0.000
==================================================================================
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -8.2970      0.128    -64.623      0.000      -8.549      -8.045
age                0.0435      0.001     31.726      0.000       0.041       0.046
education_num      0.3215      0.008     42.231      0.000       0.307       0.336
capital_gain       0.0003   1.07e-05     29.650      0.000       0.000       0.000
capital_loss       0.0007   3.64e-05     20.055      0.000       0.001       0.001
hours_per_week     0.0399      0.001     26.995      0.000       0.037       0.043
==================================================================================
檢驗假設education_num的係數等於0：
<F test: F=array([[1783.4276255]]), p=0.0, df_denom=26042, df_num=1>
檢驗假設education_num的係數等於0.32和hours_per_week的係數等於0.04同時成立：
<F test: F=array([[0.01940236]]), p=0.9807846677772952, df_denom=26042, df_num=2>
各個變量對事件發生比的影響：
                    2.5%     97.5%        OR
Intercept       0.000194  0.000321  0.000249
age             1.041611  1.047218  1.044411
education_num   1.358725  1.399879  1.379149
capital_gain    1.000298  1.000340  1.000319
capital_loss    1.000659  1.000802  1.000731
hours_per_week  1.037733  1.043769  1.040746
各個變量的邊際效應：
        Logit Marginal Effects       
=====================================
Dep. Variable:             label_code
Method:                          dydx
At:                           overall
==================================================================================
                    dy/dx    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
age                0.0056      0.000     33.563      0.000       0.005       0.006
education_num      0.0413      0.001     47.313      0.000       0.040       0.043
capital_gain     4.09e-05    1.3e-06     31.500      0.000    3.84e-05    4.34e-05
capital_loss    9.372e-05   4.54e-06     20.648      0.000    8.48e-05       0.000
hours_per_week     0.0051      0.000     28.167      0.000       0.005       0.005
==================================================================================

In [73]:

re = makePrediction(re, testSet)
evaluation(re)

事件發生機率（預測機率）大於0.6的數據個數：
576
事件發生機率（預測機率）大於0.5的數據個數：
834
查準率: 0.951, 查全率: 0.826, f1: 0.884

python: 模型的統計信息

載入和分析數據¶

交叉報表¶

搭建模型，並訓練模型¶