Before getting started, one important tip: your machine needs to be at least 64-bit. That was a lesson I learned the hard way.
Off and on, it took me about a month to work through this book. It is a very good introduction to machine learning with Python: the whole book is easy to follow and all the code is provided. The book's source code targets an IPython environment, while I used PyCharm with Python 2.7; the setup steps are described in detail in the book. After typing out all the code, I hit a few small discrepancies (perhaps due to platform differences), most of which a quick search can resolve (feel free to leave a comment if you run into problems). Below is some of the code, as a record of my study:
1_4_7.py:
#coding=utf-8
# Filename: benign/malignant breast cancer tumor prediction -- complete example (linear classifier)

# Import the pandas toolkit
import pandas as pd

# Use pandas' read_csv to load the training and test files into DataFrames
df_train = pd.read_csv('breast-cancer-train.csv')
df_test = pd.read_csv('breast-cancer-test.csv')

# Use Clump Thickness and Cell Size as features and build the negative/positive test samples
df_test_negative = df_test.loc[df_test['Type'] == 0][['Clump Thickness', 'Cell Size']]
df_test_positive = df_test.loc[df_test['Type'] == 1][['Clump Thickness', 'Cell Size']]

import matplotlib.pyplot as plt

# Plot benign samples as red 'o' markers and malignant samples as black 'x' markers
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-2')
# Show the figure
#plt.show()

# Import numpy
import numpy as np

# Randomly sample the coefficients and intercept of a separating line
intercept = np.random.random([1])
coef = np.random.random([2])
print coef, intercept
lx = np.arange(0, 12)                        # evenly spaced x values
ly = (-intercept - lx * coef[0]) / coef[1]   # solve the line equation for y
plt.plot(lx, ly, c='yellow')                 # draw the random line
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-3')
# Show the figure
plt.show()

# Import the logistic regression classifier from sklearn
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(df_train[['Clump Thickness', 'Cell Size']][:10], df_train['Type'][:10])
print 'Testing accuracy (10 training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])
print "你好,中國"

# Second figure: decision line learned from the first 10 training samples
intercept = lr.intercept_
coef = lr.coef_[0, :]
print coef, intercept
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='green')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-4')
# Show the figure
plt.show()

# Train again, this time on the full training set
lr = LogisticRegression()
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
print 'Testing accuracy (all training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])

# Third figure: decision line learned from all training samples
intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='blue')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
# Label the x and y axes
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.title('1-5')
# Show the figure
plt.show()
print 'end'
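For readers on a newer stack, here is a minimal sketch of the same "more training data helps" comparison, assuming Python 3 and a current scikit-learn. Since the book's breast-cancer-train.csv / breast-cancer-test.csv files may not be at hand, scikit-learn's built-in load_breast_cancer data stands in for them, and its first two feature columns stand in for Clump Thickness and Cell Size.

# Minimal Python 3 sketch (assumptions: built-in dataset instead of the book's CSVs,
# 'mean radius' / 'mean texture' as stand-in features).
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = data.data[:, :2]   # first two features: mean radius, mean texture
y = data.target        # in this dataset 0 = malignant, 1 = benign

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)

# Fit once on only the first 10 training samples, then on the whole training set.
# (If those 10 rows happen to contain a single class, enlarge the slice.)
clf_small = LogisticRegression(max_iter=1000).fit(X_train[:10], y_train[:10])
clf_full = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print('Accuracy with 10 training samples:', clf_small.score(X_test, y_test))
print('Accuracy with all training samples:', clf_full.score(X_test, y_test))

The exact numbers will differ from the book's, since the data and features are not the same; the point is only that the classifier trained on the full training set generally scores noticeably higher on the test set.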
2_1_2_1.py:
# coding=utf-8
# __author__ = 'lenovo'
# Predict Boston housing prices with linear regressors

# Load the housing-price data shipped with sklearn
from sklearn.datasets import load_boston
boston = load_boston()
print boston.DESCR

# Import the data splitter
from sklearn.cross_validation import train_test_split
import numpy as np

# Extract the features and the target
x = boston.data
y = boston.target
#print x, y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

# Inspect the spread of the regression target
print "The max target value is", np.max(boston.target)
print "The min target value is", np.min(boston.target)
print "The average target value is", np.mean(boston.target)

# The target values vary widely, so standardize both features and target
from sklearn.preprocessing import StandardScaler

# Initialize the standardizers for features and target
ss_x = StandardScaler()
ss_y = StandardScaler()

# Standardize the training and test data
x_train = ss_x.fit_transform(x_train)   # fit the scaler on the training data, then transform it
x_test = ss_x.transform(x_test)         # transform the test data with the fitted scaler
y_train = ss_y.fit_transform(y_train)
y_test = ss_y.transform(y_test)

# Predict housing prices with LinearRegression and SGDRegressor
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# Estimate the parameters from the training data
lr.fit(x_train, y_train)
# Predict on the test data
lr_y_predict = lr.predict(x_test)

from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor()
sgdr.fit(x_train, y_train)
sgdr_y_predict = sgdr.predict(x_test)

# Evaluate both models with three regression metrics, including two ways of getting R-squared
print 'The value of default measurement of LinearRegression is', lr.score(x_test, y_test)

# Import r2_score, mean_squared_error and mean_absolute_error from sklearn.metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# R-squared
print 'The value of R-squared of LinearRegression is', r2_score(y_test, lr_y_predict)
# Mean squared error, reported on the original (inverse-transformed) price scale
print 'The mean squared error of LinearRegression is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))
# Mean absolute error, reported on the original (inverse-transformed) price scale
print 'The mean absolute error of LinearRegression is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))

# The same evaluation for SGDRegressor, starting with its built-in score method
print 'The value of default measurement of SGDRegressor is', sgdr.score(x_test, y_test)
print 'The value of R-squared of SGDRegressor is', r2_score(y_test, sgdr_y_predict)
print 'The mean squared error of SGDRegressor is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))
print 'The mean absolute error of SGDRegressor is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))
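The code above matches the old environment from the book (Python 2.7, an older scikit-learn). If you run it under a newer scikit-learn, a few things have changed: sklearn.cross_validation has become sklearn.model_selection, StandardScaler now expects 2-D input (so the target needs a reshape), and load_boston has been removed entirely. Below is a minimal sketch of the same pipeline under those assumptions, with load_diabetes standing in for the Boston data, so the numbers will not match the book's.

# Minimal Python 3 sketch of the same regression pipeline on a current scikit-learn
# (assumptions: model_selection instead of cross_validation, load_diabetes instead of
# the removed load_boston, target reshaped for StandardScaler).
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)

# Standardize features and target; y must be a column vector for StandardScaler,
# then flattened back for the regressors.
ss_x, ss_y = StandardScaler(), StandardScaler()
X_train = ss_x.fit_transform(X_train)
X_test = ss_x.transform(X_test)
y_train_s = ss_y.fit_transform(y_train.reshape(-1, 1)).ravel()

for model in (LinearRegression(), SGDRegressor(random_state=33)):
    model.fit(X_train, y_train_s)
    # Map the predictions back to the original target scale before scoring
    pred = ss_y.inverse_transform(model.predict(X_test).reshape(-1, 1)).ravel()
    name = type(model).__name__
    print(name, 'R-squared:', r2_score(y_test, pred))
    print(name, 'MSE:', mean_squared_error(y_test, pred))
    print(name, 'MAE:', mean_absolute_error(y_test, pred))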