最近參加了天池的一個機場航空人流量預測大賽,需要用時間序列來預測,所以開始使用 Python 的 pandas 庫。
發現 pandas 庫的功能的確很強大,所以在這裡記錄我的 pandas 學習之路。
# -*- coding: utf-8 -*-
"""Count the passengers who will take off within the next 3 hours.

Reads the airport departure records, cleans them, and for every 10-minute
step of a fixed date range counts how many passengers have a flight time in
the following 3 hours. The counts are written to a CSV file and plotted.
"""
import os

import numpy as np
import pandas as pd

# Raw string: the original literal contained the invalid escape ``\A``; the
# runtime value of the path is unchanged.
WORK_DIR = r'C:/Users\Administrator/Desktop/competition/1017'
DEPARTURE_CSV = 'airport_gz_departure_chusai_2ndround.csv'
OUTPUT_CSV = 'C:/Users/Administrator/Desktop/competition/DataProcessing/Person_to_fly.csv'

# Records whose check-in is this long (or longer) before the flight are
# dropped. The 12-hour value is a tunable parameter.
MAX_CHECKIN_LEAD = pd.Timedelta(hours=12)
# Size of the look-ahead window used when counting departing passengers.
LOOKAHEAD = pd.Timedelta(hours=3)


def load_flight_times(csv_path, max_lead=MAX_CHECKIN_LEAD):
    """Load, clean and sort the flight times of all usable records.

    Columns 2 and 3 of the CSV are read and accessed as ``flight_time`` and
    ``checkin_time``.

    Args:
        csv_path: Path (or file-like object) of the departure CSV.
        max_lead: Rows are kept only when
            ``0 < flight_time - checkin_time < max_lead``.

    Returns:
        A sorted ``pd.DatetimeIndex`` with one entry per remaining record.
    """
    df = pd.read_csv(csv_path, usecols=[2, 3])
    df = df.dropna(axis=0)  # drop rows that contain missing values

    # Convert to timestamps *before* comparing or sorting, so the ordering is
    # chronological rather than a lexicographic comparison of strings.
    flight_time = pd.to_datetime(df.flight_time)
    checkin_time = pd.to_datetime(df.checkin_time)

    lead = flight_time - checkin_time
    # Drop rows whose flight is not after check-in, and rows where check-in
    # happened max_lead or more before the flight.
    keep = (lead > pd.Timedelta(0)) & (lead < max_lead)

    return pd.DatetimeIndex(flight_time[keep].sort_values().values)


def count_departures(flight_times, window_starts, lookahead=LOOKAHEAD):
    """Count the flight times falling in ``(t, t + lookahead]`` for each ``t``.

    Args:
        flight_times: Datetime-like values, one per passenger record. They do
            not need to be sorted.
        window_starts: Datetime-like values ``t`` marking the window starts.
        lookahead: Length of each window as a ``pd.Timedelta``.

    Returns:
        A list of ints, one count per entry of ``window_starts``. Windows
        that contain no flight (including windows entirely before or after
        all flights) yield 0.
    """
    flights = np.sort(
        pd.DatetimeIndex(flight_times).values.astype('datetime64[ns]'))
    starts = pd.DatetimeIndex(window_starts)
    lower = starts.values.astype('datetime64[ns]')
    upper = (starts + lookahead).values.astype('datetime64[ns]')

    # side='right' returns the number of flights <= the probe value, so the
    # difference is the number of flights with lower < flight <= upper.
    first = np.searchsorted(flights, lower, side='right')
    last = np.searchsorted(flights, upper, side='right')
    return (last - first).tolist()


def main():
    """Run the whole pipeline: load, count, save to CSV and plot."""
    # Imported here so the helpers above can be used without the plotting
    # and scaling packages installed.
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import MinMaxScaler

    os.chdir(WORK_DIR)
    flight_times = load_flight_times(DEPARTURE_CSV)

    times = pd.date_range(start='2016-09-10 19:00:00',
                          end='2016-9-25 15:00:00',
                          freq='10min')
    # Passengers due to take off within 3 hours after each time step.
    contact_nums = count_departures(flight_times, times)

    df = pd.DataFrame(contact_nums, index=times, columns=['num'])
    df.to_csv(OUTPUT_CSV, index_label='time_back')

    # Scale the counts to [0, 1]; the plot shows them mapped back to the
    # original scale.
    scaler = MinMaxScaler(feature_range=(0, 1))
    column = np.reshape(np.array(contact_nums), (len(contact_nums), 1))
    scaled = scaler.fit_transform(column.astype('float32'))
    plt.plot(scaler.inverse_transform(scaled))
    plt.show()


if __name__ == '__main__':
    main()