1. 安裝 kaggle,獲取數據源
pip install kaggle
將數據下載到目錄 /data/data-test/digit_recognize/ 下
cd /data/data-test/digit_recognize/ && kaggle competitions download -c digit-recognizer
2. 安裝 anaconda3 作為 python3 環境,自帶 sklearn、pandas、numpy 等常用工具包
3. 代碼實現
import pickle

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# File locations.
project_path = '/data/data-test/digit_recognize/'
clf_file = project_path + 'knn.pickle'


def get_data_chunk(file_name, chunk_size=100000):
    """Read a CSV file chunk by chunk and return it as a single DataFrame.

    The original author read in chunks because the input was considered too
    large to parse in one go (their note mentions 90 million records). All
    chunks are still concatenated at the end, so the complete data set must
    fit in memory.

    Args:
        file_name: Path of the CSV file to read.
        chunk_size: Number of rows to parse per chunk.

    Returns:
        A DataFrame holding every row of the file, with a fresh 0..n-1 index.
    """
    chunks = []
    for chunk in pd.read_csv(file_name, chunksize=chunk_size):
        chunks.append(chunk)
        # Progress indicator: number of chunks read so far.
        print(len(chunks))
    print("Iteration is stopped.")
    return pd.concat(chunks, ignore_index=True)


def save_clf(clf_s):
    """Pickle the classifier ``clf_s`` to ``clf_file``."""
    # The context manager closes the file even if pickling fails.
    with open(clf_file, 'wb') as clf_f:
        pickle.dump(clf_s, clf_f)


def get_clf():
    """Load and return the classifier previously stored by ``save_clf``."""
    # NOTE: pickle can execute arbitrary code while loading; only load a
    # model file that this script wrote itself.
    with open(clf_file, 'rb') as clf_f:
        return pickle.load(clf_f)


def select_best_k(scores, default=5):
    """Return the ``k`` with the highest mean score.

    Args:
        scores: Iterable of ``(k, mean_score)`` pairs.
        default: Value returned when no pair has a score above 0.

    Returns:
        The ``k`` of the best-scoring pair; on ties the earliest pair wins.
    """
    best_k = default
    best_score = 0
    for k, score in scores:
        if score > best_score:
            # Track the best score as well as its k; otherwise every later
            # k with a positive score would overwrite the choice.
            best_k = k
            best_score = score
    return best_k


def predict():
    """Predict labels for the test set and write them to ``res.csv``.

    The output has two columns: ``ImageId`` (the 1-based row number of the
    test sample) and ``Label`` (the predicted value).
    """
    knn_clf = get_clf()
    test_data = get_data_chunk(project_path + "test.csv")
    res_data = knn_clf.predict(test_data)
    # Every column of test_data is fed to the model as a feature, so the ids
    # are generated from the row position instead of being read from the
    # frame. Per the Kaggle data description test.csv holds pixel columns
    # only and the submission header is "ImageId,Label".
    df = pd.DataFrame({
        "ImageId": range(1, len(test_data) + 1),
        "Label": res_data,
    })
    df.to_csv(project_path + 'res.csv', index=False)


def train():
    """Choose ``k`` by cross-validation, fit a KNN model and pickle it.

    Reads ``train.csv`` from ``project_path``, uses the ``label`` column as
    the target and all remaining columns as features.
    """
    train_data = get_data_chunk(project_path + "train.csv")
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    train_data.info()
    print(train_data)
    train_label = train_data['label']
    x = train_data.drop(columns=['label'])

    # Score k = 5..14 by accuracy using 2-fold cross-validation.
    scores = []
    for k in range(5, 15):
        clf = KNeighborsClassifier(n_neighbors=k)
        fold_scores = cross_val_score(
            clf, x, train_label, cv=2, scoring='accuracy')
        mean = fold_scores.mean()
        print(k, mean)
        scores.append((k, mean))

    max_k = select_best_k(scores)
    print("maxK=", max_k)

    # Train the final model on the full training set with the chosen k.
    clf = KNeighborsClassifier(n_neighbors=max_k)
    clf.fit(x, train_label)
    # Store the model in the pickle file.
    save_clf(clf)


if __name__ == '__main__':
    train()
    predict()