import pandas as pd
import matplotlib.pyplot as plt #導入圖像庫
from sklearn.ensemble import RandomForestRegressor
# 用隨機森林對缺失值預測填充函數
def set_missing(df):
    """Fill missing ``MonthlyIncome`` values with random-forest predictions.

    A ``RandomForestRegressor`` is trained on the rows whose
    ``MonthlyIncome`` is present and then used to predict the rows where
    it is missing. Predictions are rounded to whole numbers and written
    back into ``df``.

    Args:
        df: DataFrame with a ``MonthlyIncome`` column and at least ten
            columns. Columns are picked by position: column 5 becomes the
            regression target and columns 0-4 and 6-9 the features.
            NOTE(review): this assumes column 5 is ``MonthlyIncome`` and
            that the feature columns contain no NaN -- confirm against
            the caller's data layout.

    Returns:
        The same ``df`` object, modified in place. If no value is
        missing, ``df`` is returned untouched and no model is trained.
    """
    # Put the target column first so it can be split off as column 0 of
    # the resulting arrays. `.ix` was removed in pandas 1.0; `.iloc` is
    # the positional equivalent.
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]

    # Compute the mask once and reuse it for the split and the write-back.
    missing = process_df["MonthlyIncome"].isnull()
    if not missing.any():
        # Nothing to impute; predicting on an empty array would fail.
        return df

    # `.as_matrix()` was removed in pandas 1.0; `.to_numpy()` replaces it.
    known = process_df[~missing].to_numpy()
    unknown = process_df[missing].to_numpy()

    # X holds the feature values, y the target values.
    X = known[:, 1:]
    y = known[:, 0]

    # Fixed random_state keeps the imputation reproducible.
    rfr = RandomForestRegressor(
        random_state=0, n_estimators=200, max_depth=3, n_jobs=-1
    )
    rfr.fit(X, y)

    # Predict the missing target values from the same feature columns.
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    print(predicted)

    # Write the predictions back into the original frame.
    df.loc[missing, "MonthlyIncome"] = predicted
    return df
# Load the training data. The path is specific to the author's machine.
data = pd.read_csv(r'E:\Python\Source\CreditScore\cs-training.csv')

# --- Impute missing MonthlyIncome with a random forest -------------------
# Column 5 is moved to the front so it becomes the regression target and
# the remaining selected columns become the features.
# NOTE(review): this assumes column 5 of the CSV is MonthlyIncome -- confirm
# against the file layout (e.g. whether a leading index column is present).
process_df = data.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
income_missing = process_df["MonthlyIncome"].isnull()
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
known = process_df[~income_missing].to_numpy()
unknown = process_df[income_missing].to_numpy()
X = known[:, 1:]
y = known[:, 0]
# Fit the regressor on the rows whose income is known.
rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
rfr.fit(X, y)
# Predict the missing incomes and round them to whole numbers.
predicted = rfr.predict(unknown[:, 1:]).round(0)
print(predicted)
data.loc[income_missing, 'MonthlyIncome'] = predicted

# Drop the rows that still contain missing values, then drop duplicates.
data = data.dropna()
data = data.drop_duplicates()

# --- Outlier inspection ----------------------------------------------------
# Box plots of the two ratio features.
x2 = data["RevolvingUtilizationOfUnsecuredLines"]
x3 = data["DebtRatio"]
fig = plt.figure(1)
ax = fig.add_subplot(111)
ax.boxplot([x2, x3])
ax.set_xticklabels(["RevolvingUtilizationOfUnsecuredLines", "DebtRatio"])

# --- Outlier removal -------------------------------------------------------
data = data[data["age"] > 0]
# .copy() gives an independent frame, so the column assignment below does
# not write into a slice of the previous frame (SettingWithCopyWarning).
data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90].copy()

# --- Good / bad customer overview -----------------------------------------
# Flip the label: rows whose original flag was 1 become 0, and vice versa.
data['SeriousDlqin2yrs'] = 1 - data['SeriousDlqin2yrs']
grouped = data["SeriousDlqin2yrs"].groupby(data["SeriousDlqin2yrs"]).count()
# The printed label is the *share* of bad customers (label 0 after the
# flip), so divide by the total count rather than by the good-customer count.
print("壞客戶佔比:{:.2%}".format(grouped[0] / grouped.sum()))
print(grouped)
grouped.plot(kind="bar")
Y = data['SeriousDlqin2yrs']
# 本文通過對 Kaggle 上的 Give Me Some Credit 數據進行挖掘分析,結合信用評分卡的創建原理,從數據預處理、變量選擇、建模分析到創建信用評分,建立了一個簡單的信用評分系統。
# 本項目還有許多不足之處,比如分箱應當使用最優分箱或卡方分箱,以減少人爲分箱的隨機性;此外,模型採用的是邏輯迴歸算法,還可以多嘗試其他模型。
html