1、題目名稱
實現knn分類算法
2、題目內容
原生Python實現knn分類算法,並使用鳶尾花數據集進行測試
3、算法分析
knn算法是最簡單的機器學習算法之一,通過測量不同特徵值之間的距離進行分類。其基本思路是:如果一個樣本在特徵空間中的k個最相似(即特徵空間中最近鄰)的樣本中的大多數屬於某一個類別,則該樣本也屬於這個類別。
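本次作業主要模擬實現了knn測試數據與訓練數據之間的距離求解、排序、最鄰近k個元素的篩選。其中,空間距離採用「歐式距離」進行計算,表達式如下:
$$dist[i] = \sqrt{\sum_{j=1}^{n}\left(x_t^{(j)} - x_i^{(j)}\right)^2}$$
上式中 $dist[i]$ 爲測試數據與下標爲 $i$ 的訓練數據的距離,$x_t$、$x_i$ 分別爲測試數據和下標爲 $i$ 的訓練數據,$j$ 爲特徵下標,$n$ 爲特徵個數。算法整體流程圖如下: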
圖 1 knn算法流程圖
4、調試截圖
調試過程主要的任務是觀察數據結構:Python中的嵌套結構較爲複雜,需要清楚每一步輸出結果的維度和具體數據結構。
5、運行結果
本次作業中的輸入爲鳶尾花數據集,輸出爲預測後的鳶尾花類型。最初設想採用散點圖輸出,但原生Python散點圖效果較差,故改成直接字符串輸出,輸出類別即可,得出運行結果。
圖 4 原生Python散點圖效果較差
=
圖 5 改成直接字符串輸出類別
6、問題及解決
實現過程中遇到的主要問題是數據結構的混淆。在knn實現類中,經過多次列表生成、嵌套,容易造成對數據結構的混淆,從而出現下標維數錯誤等錯誤。解決辦法也很簡單:用debug查看數據結構,或者直接print輸出每步內容觀察。
圖 6 下標錯誤
7、源代碼
1.knn.py
# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Project : machinelearning
# @File : knn.py
# @Author : yanchengxu
# @Contact : yanchengxu1214@outlook.com
# @Time : 2019/10/7 16:14
# @IDE : PyCharm
import numpy as np
import math
class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Typical use is ``fit(X_train, y_train)`` followed by ``predict(X)``.
    ``predict`` chains the three helper steps ``get_distance`` ->
    ``get_k_nearest_dist`` -> ``vote``, which can also be called on their own.
    """

    def __init__(self, k=3):
        """Create an unfitted classifier.

        :param k: number of nearest neighbours that take part in the vote
            (default 3)
        """
        self.k = k
        # Training features and labels; replaced by fit().
        self.X_train = []
        self.y_train = []
        # Labels produced by the most recent vote()/predict() call.
        self.result = []

    def fit(self, X_train, y_train):
        """Store the training data (KNN has no real training phase).

        :param X_train: training features; must expose ``.shape`` with one
            row per sample
        :param y_train: training labels; must expose ``.shape`` with one
            entry per sample
        :return: self, so calls can be chained
        :raises AssertionError: if the sample counts differ or ``k`` is not
            in ``1..n_samples``
        """
        assert X_train.shape[0] == y_train.shape[0], '訓練集特徵與目標值個數不匹配'
        # k below 1 would leave nothing to vote on, so it is rejected here
        # together with k larger than the training set.
        assert 1 <= self.k <= X_train.shape[0], 'K值超出訓練數據範圍'
        self.X_train = X_train
        self.y_train = y_train
        return self

    def get_distance(self, x_test):
        """Compute Euclidean distances from every query to every training row.

        All feature columns contribute to the distance (the previous
        implementation only looked at the first two columns).

        :param x_test: sequence of query points, one row per point
        :return: list with one inner list per query; inner element ``j`` is
            the distance (a Python float) to training row ``j``
        """
        train_rows = np.asarray(self.X_train, dtype=float)
        queries = np.asarray(x_test, dtype=float)
        return [
            [float(np.sqrt(np.sum((row - query) ** 2))) for row in train_rows]
            for query in queries
        ]

    def get_k_nearest_dist(self, list_dist):
        """Pick the ``k`` smallest distances for every query.

        :param list_dist: per-query distance lists as returned by
            ``get_distance``
        :return: list with one dict per query mapping training-row index to
            distance, inserted from nearest to farthest; equal distances keep
            their original index order because ``sorted`` is stable
        """
        return [
            dict(sorted(enumerate(dists), key=lambda pair: pair[1])[:self.k])
            for dists in list_dist
        ]

    def vote(self, k_nearest_dist):
        """Choose the majority label among each query's nearest neighbours.

        Ties are resolved in favour of the label whose first supporter is the
        nearest neighbour: the tally dict is filled in nearest-first order and
        ``max`` returns the first maximal key it meets.

        :param k_nearest_dist: per-query dicts keyed by training-row index, as
            returned by ``get_k_nearest_dist``
        :return: list of predicted labels, one per query (also stored in
            ``self.result``)
        """
        # Start from a fresh list so results of earlier calls do not leak
        # into this one.
        self.result = []
        for neighbours in k_nearest_dist:
            tally = {}
            for train_index in neighbours:
                label = self.y_train[train_index]
                tally[label] = tally.get(label, 0) + 1
            self.result.append(max(tally, key=tally.get))
        return self.result

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        :param X_predict: query features; must expose ``.shape`` and have the
            same number of columns as the training data
        :return: list of predicted labels, one per query row
        :raises AssertionError: if the feature counts differ
        """
        assert X_predict.shape[1] == self.X_train.shape[1], '特徵數不匹配'
        distances = self.get_distance(X_predict)
        k_nearest_dist = self.get_k_nearest_dist(distances)
        return self.vote(k_nearest_dist)
2.test.py
# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Project : machinelearning
# @File : test.py
# @Author : yanchengxu
# @Contact : yanchengxu1214@outlook.com
# @Time : 2019/10/7 16:57
# @IDE : PyCharm
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from myknn.knn import KNNClassifier
import numpy as np
import matplotlib.pyplot as plt
# import itertools
# import random
# Classifier under test: the 3 nearest neighbours take part in the vote.
kn = KNNClassifier(3)

# Load the iris data set and split it; only the training part is used below
# (the X_test returned by the split is overwritten by the hand-picked samples).
iris_dataset = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0
)
X = np.asarray(X_train)
Y = np.asarray(y_train)

# Fit the model on the training split.
kn.fit(X, Y)

# Hand-picked flower measurements to classify.
x_test = [[5, 2.9, 1, 0.2], [6.7, 3.2, 5.2, 2.3], [5.6, 3.1, 4.5, 1.5]]
X_test = np.asarray(x_test)
prediction = kn.predict(X_test)

# Print each sample next to the name of its predicted species.
target_names = iris_dataset['target_names']
for sample, label in zip(x_test, prediction):
    print(sample, '->', target_names[label])
# # 散點圖觀察
# x1 = []
# y1 = []
#
# # 訓練集
# for i in np.asarray(X):
# x1.append(i[0])
# y1.append(i[1])
#
# x2 = []
# y2 = []
# # 測試集
# for i in np.asarray(x_test):
# x2.append(i[0])
# y2.append(i[1])
#
# plt.plot(x1, y1, 'r*')
# plt.plot(x2, y2, 'g+')
# plt.show()