原生Python實現knn算法

時間 2019-11-10

標籤原生 python 實現 knn 算法欄目 Python 简体版

原文原文鏈接

　　1、題目名稱

　　實現knn分類算法

　　2、題目內容

　　原生Python實現knn分類算法，並使用鳶尾花數據集進行測試

　　3、算法分析

　　knn算法是最簡單的機器學習算法之一，通過測量不同特徵值之間的距離進行分類。其基本思路是：如果一個樣本在特徵空間中的k個最相似(即特徵空間中最近鄰)的樣本中的大多數屬於某一個類別，則該樣本也屬於這個類別。

　　本次作業主要模擬實現了knn測試數據與訓練數據之間的距離求解、排序、最鄰近k個元素的篩選。其中，空間距離採用「歐式距離」進行計算，表達式如下：

$dist\left[ i \right] = \sqrt{\sum_{j=1}^{n}\left( x_{t,j}-x_{i,j} \right)^{2}}$

　　上式中dist[i] 爲測試數據與下標爲i的訓練數據的距離，$x_{t}$、$x_{i}$ 分別爲測試數據和下標爲i的訓練數據，$x_{t,j}$、$x_{i,j}$ 爲二者的第j個特徵值，n爲特徵個數。算法整體流程圖如下：

　　圖 1 knn算法流程圖

　　4、調試截圖

　　調試過程主要的任務是觀察數據結構：Python中的嵌套結構較爲複雜，需要清楚每一步輸出結果的維度和具體數據結構。

　　5、運行結果

　　本次作業中的輸入爲鳶尾花數據集，輸出爲預測後的鳶尾花類型。最初設想採用散點圖輸出，但原生Python散點圖效果較差，故改爲直接字符串輸出，輸出類別即可，得出運行結果。

　　圖 4 原生Python散點圖效果較差

　　圖 5 改爲直接字符串輸出類別

　　6、問題及解決

　　實現過程中遇到的主要問題是數據結構的混淆。在knn實現類中，經過多次列表生成、嵌套，容易造成對數據結構的混淆，從而出現下標維數錯誤等問題。解決辦法也很簡單：debug查看數據結構，或者直接print輸出每步內容觀察。

　　圖 6 下標錯誤

　　7、源代碼

　　1.knn.py

　　# !/usr/bin/env python

　　# -*- encoding: utf-8 -*-

　　# @Project : machinelearning

　　# @File : knn.py

　　# @Author : yanchengxu

　　# @Contact : yanchengxu1214@outlook.com

　　# @Time : 2019/10/7 16:14

　　# @IDE : PyCharm

　　import numpy as np

　　import math

class KNNClassifier:
    """Minimal k-nearest-neighbours classifier based on Euclidean distance.

    Instance attributes:
      k        -- number of nearest training samples that vote per query
      X_train  -- training feature matrix stored by ``fit``
      y_train  -- training labels stored by ``fit``
      result   -- labels produced by the most recent ``vote``/``predict`` call
    """

    def __init__(self, k=3):
        """Create an unfitted classifier.

        :param k: number of neighbours consulted per query point (default 3)
        """
        self.k = k
        self.X_train = []
        self.y_train = []
        self.result = []

    def fit(self, X_train, y_train):
        """Store the training data (KNN has no real training phase).

        :param X_train: training features; must expose ``.shape`` and hold
            one sample per row
        :param y_train: training labels; must expose ``.shape`` and hold one
            hashable label per sample
        :return: self, so calls can be chained
        :raises AssertionError: if the sample counts differ or ``k`` is not
            within ``1..number of training samples``
        """
        assert X_train.shape[0] == y_train.shape[0], '訓練集特徵與目標值個數不匹配'
        # k must be at least 1: zero neighbours cannot vote.
        assert 1 <= self.k <= X_train.shape[0], 'K值超出訓練數據範圍'
        self.X_train = X_train
        self.y_train = y_train
        return self

    def get_distance(self, x_test):
        """Compute Euclidean distances from every query to every training row.

        Every feature column contributes to the distance; the squared
        per-feature differences are summed before taking the square root.

        :param x_test: iterable of query samples, one feature vector each
        :return: list with one entry per query; each entry is a list of
            floats where position ``j`` is the distance to training row ``j``
        """
        list_dist = []
        for sample in x_test:
            # Convert the query once instead of once per training row.
            query = np.asarray(sample, dtype=float)
            list_dist.append([
                math.sqrt(float(np.sum((np.asarray(x_train, dtype=float) - query) ** 2)))
                for x_train in self.X_train
            ])
        return list_dist

    def get_k_nearest_dist(self, list_dist):
        """Select the ``k`` smallest distances for each query point.

        :param list_dist: output of ``get_distance`` (one distance list per
            query)
        :return: list with one dict per query mapping training-row index to
            distance, holding at most ``self.k`` entries inserted in
            ascending distance order
        """
        list_k_nearest_dist = []
        for dists in list_dist:
            # sorted() is stable, so equal distances keep training-row order.
            ranked = sorted(enumerate(dists), key=lambda pair: pair[1])
            list_k_nearest_dist.append(dict(ranked[:self.k]))
        return list_k_nearest_dist

    def vote(self, k_nearest_dist):
        """Pick the majority label among each query's nearest neighbours.

        Ties are resolved in favour of the label whose first occurrence is
        nearest to the query, because labels are counted in ascending
        distance order and ``max`` returns the first maximal key.

        :param k_nearest_dist: output of ``get_k_nearest_dist``
        :return: list of predicted labels, one per query; also stored in
            ``self.result`` (replacing any earlier predictions)
        :raises ValueError: if a query has no neighbours to vote
        """
        results = []
        for neighbours in k_nearest_dist:
            counts = {}
            for index in neighbours:
                label = self.y_train[index]
                counts[label] = counts.get(label, 0) + 1
            results.append(max(counts, key=counts.get))
        # Start from a fresh list so repeated predict() calls do not pile up.
        self.result = results
        return self.result

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        :param X_predict: query features; must expose ``.shape`` and have the
            same number of columns as the training data
        :return: list of predicted labels, one per query row
        :raises AssertionError: if the feature counts do not match
        """
        assert X_predict.shape[1] == self.X_train.shape[1], '特徵數不匹配'
        distances = self.get_distance(X_predict)
        k_nearest_dist = self.get_k_nearest_dist(distances)
        return self.vote(k_nearest_dist)

　　2.test.py

　　# !/usr/bin/env python

　　# -*- encoding: utf-8 -*-

　　# @Project : machinelearning

　　# @File : test.py

　　# @Author : yanchengxu

　　# @Contact : yanchengxu1214@outlook.com

　　# @Time : 2019/10/7 16:57

　　# @IDE : PyCharm

　　from sklearn.datasets import load_iris

　　from sklearn.model_selection import train_test_split

　　from myknn.knn import KNNClassifier

　　import numpy as np

　　import matplotlib.pyplot as plt

　　# import itertools

　　# import random

# Classifier under test: the three nearest neighbours vote on each prediction.
kn = KNNClassifier(3)

# Load the iris data set and keep only the training part of the split; the
# held-out part is not used because the samples predicted below are
# hand-written.
iris_dataset = load_iris()
X_train, _, y_train, _ = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0)
X = np.asarray(X_train)
Y = np.asarray(y_train)

# Model training
kn.fit(X, Y)

# Hand-written samples to classify, one list of four feature values each
# (same column count as the training data).
x_test = [[5, 2.9, 1, 0.2], [6.7, 3.2, 5.2, 2.3], [5.6, 3.1, 4.5, 1.5]]
X_test = np.asarray(x_test)
prediction = kn.predict(X_test)

# Print each sample next to the name of its predicted class.
for sample, label in zip(x_test, prediction):
    print(sample, '->', iris_dataset['target_names'][label])

　　# # 散點圖觀察

　　# x1 = []

　　# y1 = []

　　# # 訓練集

　　# for i in np.asarray(X):

　　# x1.append(i[0])

　　# y1.append(i[1])

　　# x2 = []

　　# y2 = []

　　# # 測試集

　　# for i in np.asarray(x_test):

　　# x2.append(i[0])

　　# y2.append(i[1])

　　# plt.plot(x1, y1, 'r*')

　　# plt.plot(x2, y2, 'g+')

　　# plt.show()

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。