Talk is cheap, show me the executable code!
上文書說到樣本劃分,那具體如何操作呢?
立刻來舉個栗子:
1.若是原數據是numpy數組,可使用sklearn的shuffle函數進行打亂
import numpy as np
import sklearn.utils  # import the submodule explicitly; also binds `sklearn`

# Generate toy data: ten evenly spaced points and their targets y = 2x.
X = np.linspace(1, 10, 10)
Y = 2 * X

# Shuffle X and Y with the same permutation so every (x, y) pair stays
# aligned; the fixed random_state makes the order reproducible.
X, Y = sklearn.utils.shuffle(X, Y, random_state=42)

# Split by proportion, following the shuffled order.
FRAC = 0.8
sp = int(len(X) * FRAC)

# The first `sp` samples form the training set, the remainder the test set.
X_train, Y_train = X[:sp], Y[:sp]
X_test, Y_test = X[sp:], Y[sp:]

# =====output=====
print(X_train, Y_train, X_test, Y_test)
2.若是原數據是dataframe,可使用sample函數進行打亂
import numpy as np
import pandas as pd

# Generate toy data: 30 evenly spaced points with pre_data = 2 * data.
X = np.linspace(1, 10, 30)
data_df = pd.DataFrame({'data': X, 'pre_data': 2 * X})

# DataFrame.sample returns a NEW shuffled frame and leaves the original
# untouched, so the result has to be assigned back -- otherwise the split
# below would operate on the unshuffled data. reset_index renumbers the rows
# 0..n-1 so that labels and positions agree after the shuffle.
data_df = data_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Split by proportion, following the shuffled order.
FRAC = 0.8
sp = int(data_df.shape[0] * FRAC)

# Slice by position (iloc), not by label (loc): the first `sp` rows are the
# training set and the remaining rows are the test set.
train_df, test_df = data_df.iloc[:sp], data_df.iloc[sp:]
X_train, Y_train = train_df['data'], train_df['pre_data']
X_test, Y_test = test_df['data'], test_df['pre_data']

# Sanity check: no row was lost or duplicated by the split.
assert X_train.shape[0] + X_test.shape[0] == data_df.shape[0]
import numpy as np
import torch
import torch.utils.data

# Generate toy data: ten (x, 2x) pairs.
xs = np.linspace(1, 10, 10)
X = list(zip(xs, 2 * xs))

# random_split needs an indexable dataset with a length. A plain list
# qualifies; a DataLoader does not (it cannot be indexed), so subsets built
# on top of a DataLoader fail as soon as an item is accessed.
dataset = X

# Split by proportion; random_split shuffles the indices itself.
FRAC = 0.8
sp = int(len(dataset) * FRAC)
t_sp = len(dataset) - sp

# A seeded generator keeps the split reproducible across runs.
dataset_train, dataset_test = torch.utils.data.random_split(
    dataset, [sp, t_sp], generator=torch.Generator().manual_seed(42)
)

# =====output=====
print(dataset_train.indices, dataset_test.indices)