kaggle-Digit Recognizer

  • 安裝kaggle工具獲取數據源(linux 環境)
  • 採用sklearn的KNeighborsClassifier訓練數據
  • 經過K折交叉驗證來選取K值是正確率更高

1.安裝kaggle,獲取數據源

pip install kaggle

將數據下載到目錄/data/data-test/digit_recognize/下

cd /data/data-test/digit_recognize/
kaggle competitions download -c digit-recognizer

2.安裝anaconda3做爲python3環境,自帶sklearn,pandas,numpy等經常使用工具包

3.代碼實現

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pickle


# 文件路徑
project_path = '/data/data-test/digit_recognize/'
clf_file = project_path + 'knn.pickle'


def get_data_chunk(file_name):
    # 文件太大分塊讀取文件 9000萬條
    reader = pd.read_csv(file_name, iterator=True)
    loop = True
    chunk_size = 100000
    chunks = []
    while loop:
        try:
            chunk = reader.get_chunk(chunk_size)
            chunks.append(chunk)
            print(len(chunks))
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
    res = pd.concat(chunks, ignore_index=True)
    return res


def save_clf(clf_s):
    clf_f = open(clf_file, 'wb')
    pickle.dump(clf_s, clf_f)
    clf_f.close()


def get_clf():
    clf_f = open(clf_file, 'rb')
    res = pickle.load(clf_f)
    return res

# 對測試數據集預測結果
def predict():
    knn_clf = get_clf()
    test_data = get_data_chunk(project_path + "test.csv")
    res_data = knn_clf.predict(test_data)
    df = pd.DataFrame()
    df["imageId"] = test_data["imageId"]
    df["Label"] = res_data
    df.to_csv(project_path + 'res.csv', index=False)


def train():
    train_data = get_data_chunk(project_path + "train.csv")
    print(train_data.info())
    print(train_data)
    train_lable = train_data['label']
    x = train_data.drop(columns=['label'])

    max = 0
    max_k = 5

    # k取值從5,15用K折交叉驗證算出正確率分數
    for k in range(5, 15):
        clf = KNeighborsClassifier(n_neighbors=k)
        # cv爲2折
        scores = cross_val_score(clf, x, train_lable, cv=2, scoring='accuracy')
        mean = scores.mean()
        print(k, mean)
        if mean > max:
            max_k = k
    print("maxK=", max_k)
    # 用max_k做爲knn參數訓練模型
    clf = KNeighborsClassifier(n_neighbors=max_k)
    clf.fit(x, train_lable)
    # 存儲模型到pickle文件
    save_clf(clf)
    
if __name__ == '__main__':
    train()
    predict()
相關文章
相關標籤/搜索