CCF2020 大數據時代的Serverless工作負載預測賽道baseline:(0.211+)
附一下官網鏈接:https://www.datafountain.cn/competitions/468
之前剛開賽的時候,寫了個baseline,分享一下。目前排名有點落後,大家隨便看看。
import pandas as pd from datetime import datetime import time from sklearn.preprocessing import LabelEncoder
def make_label(data, n_steps=5):
    """Append look-ahead label columns for both target series.

    For each of ``CPU_USAGE`` and ``LAUNCHING_JOB_NUMS`` (in that order) and
    each step ``k`` in ``1..n_steps``, a column ``<NAME>_k`` is added holding
    the value found ``k`` rows further down in ``data``. The resulting labels
    are intended to be fitted one model per column rather than with a
    multi-output library.

    Args:
        data: DataFrame containing ``CPU_USAGE`` and ``LAUNCHING_JOB_NUMS``
            columns, with rows already in the order the shift should follow.
        n_steps: Number of look-ahead steps to generate. Defaults to 5,
            which yields ten label columns.

    Returns:
        A new DataFrame with the label columns appended and every row that
        contains a missing value removed. ``data`` itself is not modified.
    """
    # Work on a copy: this function is used with ``groupby(...).apply``, and
    # pandas does not support callbacks that mutate the object they receive.
    labelled = data.copy()
    for source in ('CPU_USAGE', 'LAUNCHING_JOB_NUMS'):
        for step in range(1, n_steps + 1):
            labelled['{}_{}'.format(source, step)] = labelled[source].shift(-step)
    # shift() leaves the trailing rows without labels; they are simply
    # dropped (few rows, so the loss should be negligible).
    return labelled.dropna()
def process(df):
    """Convert ``DOTTING_TIME`` to datetimes and order rows per queue.

    ``DOTTING_TIME`` is read as epoch milliseconds and converted in one
    vectorised call, so the result does not depend on the machine's local
    timezone and keeps millisecond precision. Rows are then sorted by
    ``QUEUE_ID`` and, within a queue, by time.

    NOTE(review): the original author observed that the training set looks
    fine while the test set contains timestamps in 2023, presumably because
    the organisers desensitised the data -- only the relative order is
    relied on here.

    Args:
        df: DataFrame with ``QUEUE_ID`` and an epoch-millisecond
            ``DOTTING_TIME`` column.

    Returns:
        A new, sorted DataFrame whose ``DOTTING_TIME`` column is datetime64.
        ``df`` itself is not modified, so calling this twice on the same
        frame is safe.
    """
    converted = df.copy()
    converted['DOTTING_TIME'] = pd.to_datetime(converted['DOTTING_TIME'], unit='ms')
    return converted.sort_values(['QUEUE_ID', 'DOTTING_TIME'])
# Modelling imports (CatBoostRegressor is imported but not used below).
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

# Load the raw data and normalise/sort the timestamps.
train = pd.read_csv('train.csv')
test = pd.read_csv('evaluation_public.csv')
train = process(train)
test = process(test)

# Use the values at the next five time points of each queue as labels.
train = train.groupby('QUEUE_ID').apply(make_label)
train = train.reset_index(drop=True)

# Encode the categorical features as integers. Each encoder is fitted on the
# training column and then applied to the matching test column.
encode_STATUS = LabelEncoder()
encode_QUEUE_TYPE = LabelEncoder()
encode_PLATFORM = LabelEncoder()
encode_RESOURCE_TYPE = LabelEncoder()
for column, encoder in (
    ('STATUS', encode_STATUS),
    ('QUEUE_TYPE', encode_QUEUE_TYPE),
    ('PLATFORM', encode_PLATFORM),
    ('RESOURCE_TYPE', encode_RESOURCE_TYPE),
):
    train[column] = encoder.fit_transform(train[column])
    test[column] = encoder.transform(test[column])

# The timestamp was only needed for ordering; it is not used as a feature.
train.drop(['DOTTING_TIME'], axis=1, inplace=True)
test.drop(['DOTTING_TIME'], axis=1, inplace=True)

targets_names = [
    'CPU_USAGE_1', 'LAUNCHING_JOB_NUMS_1',
    'CPU_USAGE_2', 'LAUNCHING_JOB_NUMS_2',
    'CPU_USAGE_3', 'LAUNCHING_JOB_NUMS_3',
    'CPU_USAGE_4', 'LAUNCHING_JOB_NUMS_4',
    'CPU_USAGE_5', 'LAUNCHING_JOB_NUMS_5',
]
# Pull every label column out of the feature frame, keeping each as a Series.
targets = [train.pop(name) for name in targets_names]

# Only the last sample of each test ID is kept for prediction, so the test
# set is still rather under-utilised.
test = test.drop_duplicates(subset=['ID'], keep='last')
test_id = test.pop('ID')

df = pd.DataFrame()
df['ID'] = test_id

model = LGBMRegressor(n_estimators=100000)
for target in targets:
    train_x, test_x, train_y, test_y = train_test_split(
        train, target, test_size=0.2, random_state=42
    )
    # eval_metric belongs to fit(); given to the constructor it is not a
    # recognised parameter and is ignored.
    # NOTE(review): early_stopping_rounds / verbose are fit() keywords only on
    # LightGBM < 4.0; newer releases expect the early_stopping and
    # log_evaluation callbacks instead -- confirm the installed version.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        eval_metric='mse',
        early_stopping_rounds=50,
        verbose=100,
    )
    df[target.name] = model.predict(test, num_iteration=model.best_iteration_)
爲了防止號沒了,改動了一下劃分訓練集測試集的隨機種子,可能還有一些庫的版本差距,差距應該不大。比較慫,希望沒有直接提交的,萬一改了個隨機種子直接上分就有點尷尬(手動狗頭)