Before getting started, one important tip: your machine needs to be at least 64-bit. That was a lesson I learned the hard way.
Off and on, it took me about a month to work through this book. It is a very good introduction to machine learning with Python: the whole book is easy to follow and all the code is provided. The book's source code targets an IPython environment, while I used PyCharm with Python 2.7; the setup steps are described in detail in the book. After typing out all the code, I hit a few small discrepancies (perhaps due to platform differences), most of which a quick search can resolve (feel free to leave a comment if you run into problems). Below is some of the code, as a record of my study:
1_4_7.py:
#coding=utf-8
# Filename: benign/malignant breast cancer tumor prediction -- complete example (linear classifier)

# Import the pandas toolkit
import pandas as pd

# Use pandas' read_csv to load the training and test files into DataFrames
df_train = pd.read_csv('breast-cancer-train.csv')
df_test = pd.read_csv('breast-cancer-test.csv')

# Use Clump Thickness and Cell Size as features and build the negative/positive test samples
df_test_negative = df_test.loc[df_test['Type'] == 0][['Clump Thickness', 'Cell Size']]
df_test_positive = df_test.loc[df_test['Type'] == 1][['Clump Thickness', 'Cell Size']]

import matplotlib.pyplot as plt

# Plot benign samples as red 'o' markers and malignant samples as black 'x' markers
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-2')
# Show the figure
#plt.show()

# Import numpy
import numpy as np

# Randomly sample the coefficients and intercept of a separating line
intercept = np.random.random([1])
coef = np.random.random([2])
print coef, intercept
lx = np.arange(0, 12)                        # evenly spaced x values
ly = (-intercept - lx * coef[0]) / coef[1]   # solve the line equation for y
plt.plot(lx, ly, c='yellow')                 # draw the random line
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-3')
# Show the figure
plt.show()

# Import the logistic regression classifier from sklearn
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(df_train[['Clump Thickness', 'Cell Size']][:10], df_train['Type'][:10])
print 'Testing accuracy (10 training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])
print "你好,中國"

# Second figure: decision line learned from the first 10 training samples
intercept = lr.intercept_
coef = lr.coef_[0, :]
print coef, intercept
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='green')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-4')
# Show the figure
plt.show()

# Train again, this time on the full training set
lr = LogisticRegression()
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
print 'Testing accuracy (all training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])

# Third figure: decision line learned from all training samples
intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='blue')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-5')
# Show the figure
plt.show()
print 'end'
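For readers on a newer stack, here is a minimal sketch of the same "more training data helps" comparison, assuming Python 3 and a current scikit-learn. Since the book's breast-cancer-train.csv / breast-cancer-test.csv files may not be at hand, scikit-learn's built-in load_breast_cancer data stands in for them, and its first two feature columns stand in for Clump Thickness and Cell Size.

# Minimal Python 3 sketch (assumptions: built-in dataset instead of the book's CSVs,
# 'mean radius' / 'mean texture' as stand-in features).
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = data.data[:, :2]   # first two features: mean radius, mean texture
y = data.target        # in this dataset 0 = malignant, 1 = benign

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)

# Fit once on only the first 10 training samples, then on the whole training set.
# (If those 10 rows happen to contain a single class, enlarge the slice.)
clf_small = LogisticRegression(max_iter=1000).fit(X_train[:10], y_train[:10])
clf_full = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print('Accuracy with 10 training samples:', clf_small.score(X_test, y_test))
print('Accuracy with all training samples:', clf_full.score(X_test, y_test))

The exact numbers will differ from the book's, since the data and features are not the same; the point is only that the classifier trained on the full training set generally scores noticeably higher on the test set.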
2_1_2_1.py:
# coding=utf-8
# __author__ = 'lenovo'
# Predict Boston housing prices with linear regressors

# Load the housing-price data shipped with sklearn
from sklearn.datasets import load_boston
boston = load_boston()
print boston.DESCR

# Import the data splitter
from sklearn.cross_validation import train_test_split
import numpy as np

# Extract the features and the target
x = boston.data
y = boston.target
#print x, y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

# Inspect the spread of the regression target
print "The max target value is", np.max(boston.target)
print "The min target value is", np.min(boston.target)
print "The average target value is", np.mean(boston.target)

# The target values vary widely, so standardize both features and target
from sklearn.preprocessing import StandardScaler

# Initialize the standardizers for features and target
ss_x = StandardScaler()
ss_y = StandardScaler()

# Standardize the training and test data
x_train = ss_x.fit_transform(x_train)   # fit the scaler on the training data, then transform it
x_test = ss_x.transform(x_test)         # transform the test data with the fitted scaler
y_train = ss_y.fit_transform(y_train)
y_test = ss_y.transform(y_test)

# Predict housing prices with LinearRegression and SGDRegressor
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# Estimate the parameters from the training data
lr.fit(x_train, y_train)
# Predict on the test data
lr_y_predict = lr.predict(x_test)

from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor()
sgdr.fit(x_train, y_train)
sgdr_y_predict = sgdr.predict(x_test)

# Evaluate both models with three regression metrics, including two ways of getting R-squared
print 'The value of default measurement of LinearRegression is', lr.score(x_test, y_test)

# Import r2_score, mean_squared_error and mean_absolute_error from sklearn.metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# R-squared
print 'The value of R-squared of LinearRegression is', r2_score(y_test, lr_y_predict)
# Mean squared error, reported on the original (inverse-transformed) price scale
print 'The mean squared error of LinearRegression is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))
# Mean absolute error, reported on the original (inverse-transformed) price scale
print 'The mean absolute error of LinearRegression is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))

# The same evaluation for SGDRegressor, starting with its built-in score method
print 'The value of default measurement of SGDRegressor is', sgdr.score(x_test, y_test)
print 'The value of R-squared of SGDRegressor is', r2_score(y_test, sgdr_y_predict)
print 'The mean squared error of SGDRegressor is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))
print 'The mean absolute error of SGDRegressor is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))
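The code above matches the old environment from the book (Python 2.7, an older scikit-learn). If you run it under a newer scikit-learn, a few things have changed: sklearn.cross_validation has become sklearn.model_selection, StandardScaler now expects 2-D input (so the target needs a reshape), and load_boston has been removed entirely. Below is a minimal sketch of the same pipeline under those assumptions, with load_diabetes standing in for the Boston data, so the numbers will not match the book's.

# Minimal Python 3 sketch of the same regression pipeline on a current scikit-learn
# (assumptions: model_selection instead of cross_validation, load_diabetes instead of
# the removed load_boston, target reshaped for StandardScaler).
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)

# Standardize features and target; y must be a column vector for StandardScaler,
# then flattened back for the regressors.
ss_x, ss_y = StandardScaler(), StandardScaler()
X_train = ss_x.fit_transform(X_train)
X_test = ss_x.transform(X_test)
y_train_s = ss_y.fit_transform(y_train.reshape(-1, 1)).ravel()

for model in (LinearRegression(), SGDRegressor(random_state=33)):
    model.fit(X_train, y_train_s)
    # Map the predictions back to the original target scale before scoring
    pred = ss_y.inverse_transform(model.predict(X_test).reshape(-1, 1)).ravel()
    name = type(model).__name__
    print(name, 'R-squared:', r2_score(y_test, pred))
    print(name, 'MSE:', mean_squared_error(y_test, pred))
    print(name, 'MAE:', mean_absolute_error(y_test, pred))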