# 機器學習算法講堂(一) 十分鐘入門機器學習算法競賽
# 比賽地址: https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
import pandas as pd
import numpy as np

# Load a 1M-row sample of the Kaggle NYC taxi-fare training data.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
file = pd.read_csv('./data/train.csv', nrows=1000000)
print(file.head())
print(file.shape)
file = file.dropna(how='any', axis='rows')


def clean_df(df):
    """Drop rows with non-positive fares, coordinates outside a loose
    NYC bounding box, or passenger counts outside 1-9."""
    return df[(df.fare_amount > 0) &
              (df.pickup_longitude > -80) & (df.pickup_longitude < -70) &
              (df.pickup_latitude > 35) & (df.pickup_latitude < 45) &
              (df.dropoff_longitude > -80) & (df.dropoff_longitude < -70) &
              (df.dropoff_latitude > 35) & (df.dropoff_latitude < 45) &
              (df.passenger_count > 0) & (df.passenger_count < 10)]


file = clean_df(file)
print(len(file))
print(file.shape)


def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return distance along great radius (haversine, in km) between
    pickup and dropoff coordinates given in degrees.
    """
    # Define earth radius (km)
    R_earth = 6371
    # Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(
        np.radians, [pickup_lat, pickup_lon, dropoff_lat, dropoff_lon])
    # Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon
    # Compute haversine distance
    a = np.sin(dlat / 2.0) ** 2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon / 2.0) ** 2
    return 2 * R_earth * np.arcsin(np.sqrt(a))


def add_airport_dist(dataset):
    """
    Return minimum distance from pickup or dropoff coordinates to each
    airport, plus straight-line distance features and airport bounding-box
    indicator columns.
    JFK: John F. Kennedy International Airport
    EWR: Newark Liberty International Airport
    LGA: LaGuardia Airport
    """
    jfk_coord = (40.639722, -73.778889)
    ewr_coord = (40.6925, -74.168611)
    lga_coord = (40.77725, -73.872611)
    # NOTE(review): labelled "Washington Square" but these coordinates fall
    # offshore, well south-east of Manhattan — confirm the intended landmark.
    Washington_Square = (40.4351, -73.5951)
    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']
    pickup_jfk = sphere_dist(pickup_lat, pickup_lon, jfk_coord[0], jfk_coord[1])
    dropoff_jfk = sphere_dist(jfk_coord[0], jfk_coord[1], dropoff_lat, dropoff_lon)
    pickup_ewr = sphere_dist(pickup_lat, pickup_lon, ewr_coord[0], ewr_coord[1])
    dropoff_ewr = sphere_dist(ewr_coord[0], ewr_coord[1], dropoff_lat, dropoff_lon)
    pickup_lga = sphere_dist(pickup_lat, pickup_lon, lga_coord[0], lga_coord[1])
    dropoff_lga = sphere_dist(lga_coord[0], lga_coord[1], dropoff_lat, dropoff_lon)
    pickup_square = sphere_dist(pickup_lat, pickup_lon, Washington_Square[0], Washington_Square[1])
    dropoff_square = sphere_dist(Washington_Square[0], Washington_Square[1], dropoff_lat, dropoff_lon)
    # Minimum of pickup-side and dropoff-side distance for each landmark.
    dataset['jfk_dist'] = pd.concat([pickup_jfk, dropoff_jfk], axis=1).min(axis=1)
    dataset['ewr_dist'] = pd.concat([pickup_ewr, dropoff_ewr], axis=1).min(axis=1)
    dataset['lga_dist'] = pd.concat([pickup_lga, dropoff_lga], axis=1).min(axis=1)
    dataset['washington_dist'] = pd.concat([pickup_square, dropoff_square], axis=1).min(axis=1)
    dataset['longitude_distance'] = abs(dataset['pickup_longitude'] - dataset['dropoff_longitude'])
    dataset['latitude_distance'] = abs(dataset['pickup_latitude'] - dataset['dropoff_latitude'])
    # Straight distance (Euclidean, in degrees)
    dataset['distance_travelled'] = (dataset['longitude_distance'] ** 2 + dataset['latitude_distance'] ** 2) ** .5
    # NOTE(review): the sin/cos features use a PRODUCT under the square root
    # (lon**2 * lat**2) while distance_travelled uses a SUM — confirm this is
    # intentional and not a typo.
    dataset['distance_travelled_sin'] = np.sin((dataset['longitude_distance'] ** 2 * dataset['latitude_distance'] ** 2) ** .5)
    dataset['distance_travelled_cos'] = np.cos((dataset['longitude_distance'] ** 2 * dataset['latitude_distance'] ** 2) ** .5)
    dataset['distance_travelled_sin_sqrd'] = np.sin((dataset['longitude_distance'] ** 2 * dataset['latitude_distance'] ** 2) ** .5) ** 2
    dataset['distance_travelled_cos_sqrd'] = np.cos((dataset['longitude_distance'] ** 2 * dataset['latitude_distance'] ** 2) ** .5) ** 2
    # dataset["fare_to_dist_ratio"] = dataset["fare_amount"] / ( dataset["distance_travelled"]+0.0001)
    # dataset["fare_npassenger_to_dist_ratio"] = (dataset["fare_amount"] / dataset["passenger_count"]) /( dataset["distance_travelled"]+0.0001)
    # Indicator columns: 1 when pickup OR dropoff falls inside the airport's bounding box.
    dataset['jfk'] = 0
    dataset.loc[(dataset['pickup_longitude'] >= -73.7841) & (dataset['pickup_longitude'] <= -73.7721) &
                (dataset['pickup_latitude'] <= 40.6613) & (dataset['pickup_latitude'] >= 40.6213), 'jfk'] = 1
    dataset.loc[(dataset['dropoff_longitude'] >= -73.7841) & (dataset['dropoff_longitude'] <= -73.7721) &
                (dataset['dropoff_latitude'] <= 40.6613) & (dataset['dropoff_latitude'] >= 40.6213), 'jfk'] = 1
    dataset['lga'] = 0
    dataset.loc[(dataset['pickup_longitude'] >= -73.8870) & (dataset['pickup_longitude'] <= -73.8580) &
                (dataset['pickup_latitude'] <= 40.7800) & (dataset['pickup_latitude'] >= 40.7680), 'lga'] = 1
    dataset.loc[(dataset['dropoff_longitude'] >= -73.8870) & (dataset['dropoff_longitude'] <= -73.8580) &
                (dataset['dropoff_latitude'] <= 40.7800) & (dataset['dropoff_latitude'] >= 40.7680), 'lga'] = 1
    dataset['ewr'] = 0
    dataset.loc[(dataset['pickup_longitude'] >= -74.192) & (dataset['pickup_longitude'] <= -74.172) &
                (dataset['pickup_latitude'] <= 40.708) & (dataset['pickup_latitude'] >= 40.676), 'ewr'] = 1
    dataset.loc[(dataset['dropoff_longitude'] >= -74.192) & (dataset['dropoff_longitude'] <= -74.172) &
                (dataset['dropoff_latitude'] <= 40.708) & (dataset['dropoff_latitude'] >= 40.676), 'ewr'] = 1
    return dataset


def add_datetime_info(dataset):
    """Parse pickup_datetime and add hour/day/month/weekday/year columns."""
    # Convert to datetime format
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")
    # dataset['second'] = dataset.pickup_datetime.dt.second
    dataset['hour'] = dataset.pickup_datetime.dt.hour
    dataset['day'] = dataset.pickup_datetime.dt.day
    dataset['month'] = dataset.pickup_datetime.dt.month
    dataset['weekday'] = dataset.pickup_datetime.dt.weekday
    dataset['year'] = dataset.pickup_datetime.dt.year
    # dataset['all_time'] = dataset['second'] + 60*dataset['hour'] + 24*60*dataset['day']+30*24*60*dataset['month']
    return dataset


# Feature engineering on the training set.
file = add_datetime_info(file)
file = add_airport_dist(file)
file = file.drop(columns=['pickup_datetime'])  # also considered dropping: 'distance_travelled_sin_sqrd', 'passenger_count', 'distance_travelled_cos_sqrd'
file['distance'] = sphere_dist(file['pickup_latitude'], file['pickup_longitude'],
                               file['dropoff_latitude'], file['dropoff_longitude'])
# BUG FIX: a bare `file.head()` expression is a no-op in a script; print it.
print(file.head())

# Same feature engineering on the test set.
test_file = pd.read_csv('./data/test.csv')
test_file = add_datetime_info(test_file)
test_file = add_airport_dist(test_file)
test_file = test_file.drop(columns=['pickup_datetime'])
test_file['distance'] = sphere_dist(test_file['pickup_latitude'], test_file['pickup_longitude'],
                                    test_file['dropoff_latitude'], test_file['dropoff_longitude'])
print(test_file.head())

import datetime as dt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os

train_x = file.drop(columns=['fare_amount'])
y = file['fare_amount']
new_test = test_file

# Label-encode any remaining datetime/object columns consistently across
# train and test (the encoder is fitted on the union of both value sets).
from sklearn.preprocessing import LabelEncoder
for c in train_x.columns:
    if train_x[c].dtype == 'datetime64[ns]' or train_x[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train_x[c].values) + list(test_file[c].values))
        train_x[c] = lbl.transform(list(train_x[c].values))
        test_file[c] = lbl.transform(list(test_file[c].values))
print(test_file.head())

# Small hold-out split (1%) kept for quick sanity checks.
x_train, x_test, y_train, y_test = train_test_split(train_x, y, random_state=0, test_size=0.01)

print(x_train.dtypes)
# BUG FIX: `print(x_train.head)` printed the bound method object; call it.
print(x_train.head())

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import matplotlib.pylab as plt

# Keep Relevant Variables..
# BUG FIX: `import time` originally appeared AFTER the first time.time() call
# below, which raised NameError at runtime; import it before use.
import time

# Shapes used to size the out-of-fold / test prediction buffers.
trainshape = train_x.shape
testshape = test_file.shape

# print("\nTrain DF..")
# train = reduce_mem_usage(train)
# print("\nTest DF..")
# test_df = reduce_mem_usage(test_df)

# LGBM Dataset formatting; free_raw_data=False keeps dtrain.data accessible
# for the out-of-fold predictions inside the loop below.
dtrain = lgb.Dataset(train_x, label=y, free_raw_data=False)

print("Light Gradient Boosting Regressor: ")
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 7,
    'learning_rate': .1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

folds = KFold(n_splits=5, shuffle=True, random_state=1)
fold_preds = np.zeros(testshape[0])   # test predictions, averaged over folds
oof_preds = np.zeros(trainshape[0])   # out-of-fold predictions on the train set
dtrain.construct()

# Fit 5 folds.
modelstart = time.time()
# Split on train_x for consistency (same row count/order as `file`, so the
# generated fold indices are identical — KFold only uses the sample count).
for trn_idx, val_idx in folds.split(train_x):
    clf = lgb.train(
        params=lgbm_params,
        train_set=dtrain.subset(trn_idx),
        valid_sets=dtrain.subset(val_idx),
        num_boost_round=17000,
        early_stopping_rounds=250,
        verbose_eval=500
    )
    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    fold_preds += clf.predict(test_file) / folds.n_splits
    # Per-fold RMSE on the held-out fold.
    print(mean_squared_error(y.iloc[val_idx], oof_preds[val_idx]) ** .5)
# lgb.plot_importance(clf, max_num_features=30)
print("Model Runtime: %0.2f Minutes" % ((time.time() - modelstart) / 60))

# (Removed unused `Ttest = xgb.DMatrix(test_file)` — it was only needed for
# the commented-out XGBoost prediction path below.)
# ypred = bst.predict(Ttest)
ypred = fold_preds

# Reload the raw test file to recover the submission `key` column
# (the engineered `test_file` no longer matches the raw layout).
new_test = pd.read_csv('./data/test.csv')
output = pd.DataFrame({
    'key': new_test['key'],
    'fare_amount': ypred,
})
print(output.head())

# Timestamped submission file.  Local renamed from `dt` to avoid shadowing the
# `import datetime as dt` made earlier in the script.
timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
output.to_csv('.//data//ans' + str(timestamp) + '.csv', index=False)