# NOTE: the star import is kept on purpose -- the rest of this file uses bare
# numpy names (array, zeros, tile, ...). Be aware that it may shadow builtins
# such as sum/min/max, so those builtins are deliberately not used below.
from numpy import *
import operator


def create_data_set():
    """Return a tiny hard-coded training set and its labels.

    Returns:
        A tuple ``(group, labels)`` where ``group`` is a 4x2 numpy array of
        sample points and ``labels`` is the list of class labels, one per row.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


group, labels = create_data_set()


def classify0(inX, data_set, labels, k):
    """Classify ``inX`` with a k-nearest-neighbours majority vote.

    Args:
        inX: The vector to classify (one value per feature; a 1xN row also
            works because the subtraction below broadcasts).
        data_set: Training samples, one row per sample.
        labels: Class labels; ``labels[i]`` belongs to row ``i`` of
            ``data_set``.
        k: Number of nearest neighbours that take part in the vote. If it is
            larger than the number of training rows, every row votes.

    Returns:
        The label with the most votes among the ``k`` nearest rows. On a tie
        the label that was encountered first (i.e. the nearer one) wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    samples = asarray(data_set)
    if samples.shape[0] == 0:
        raise ValueError('data_set must contain at least one sample')

    # Euclidean distance: d = ((xA - xB)**2 + (yA - yB)**2 + ...)**0.5.
    # Broadcasting subtracts inX from every training row at once.
    diff_mat = asarray(inX) - samples
    distances = (diff_mat ** 2).sum(axis=1) ** 0.5

    # Indices of the training rows, ordered from nearest to farthest.
    sorted_dist_indicies = distances.argsort()

    # Count one vote per neighbour for that neighbour's label.
    class_count = {}
    for neighbour in sorted_dist_indicies[:k]:
        vote_ilabel = labels[neighbour]
        class_count[vote_ilabel] = class_count.get(vote_ilabel, 0) + 1

    # Sort labels by vote count, highest first; sorted() is stable, so ties
    # keep their insertion (nearest-first) order.
    sorted_class_count = sorted(class_count.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
    # Return only the winning label -- callers compare the result against a
    # label / use it as an index, so returning the whole list was a bug.
    return sorted_class_count[0][0]
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric fields followed by
    an integer class label in the last field, all separated by tabs.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(return_mat, class_label_vector)``: an Nx3 float matrix with
        the first three fields of each record, and a list with the integer
        label of each record.
    """
    # Use a context manager so the file is closed even if parsing fails.
    with open(filename) as fr:
        records = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline at end of file) carry no record.
    records = [record for record in records if record]

    return_mat = zeros((len(records), 3))  # zero-filled N x 3 matrix
    class_label_vector = []
    for index, record in enumerate(records):
        list_fromline = record.split('\t')
        return_mat[index, :] = [float(value) for value in list_fromline[0:3]]
        class_label_vector.append(int(list_fromline[-1]))
    return return_mat, class_label_vector


def autoNorm(data_set):
    """Scale every column of ``data_set`` into the range [0, 1].

    Without normalisation, features with large numeric values would dominate
    the distance computation.

    Args:
        data_set: 2-D numpy array, one row per sample.

    Returns:
        A tuple ``(norm_data_set, ranges, min_vals)`` where ``ranges`` is the
        per-column ``max - min`` and ``min_vals`` the per-column minimum.
        Columns whose values are all equal are normalised to 0 instead of
        producing a division by zero; their entry in ``ranges`` stays 0.
    """
    min_vals = data_set.min(0)  # per-column minimum
    max_vals = data_set.max(0)  # per-column maximum
    ranges = max_vals - min_vals
    # Avoid 0/0 -> NaN for constant columns by dividing those by 1 instead.
    safe_ranges = where(ranges == 0, 1, ranges)
    norm_data_set = (data_set - min_vals) / safe_ranges
    return norm_data_set, ranges, min_vals


def datingClassTest():
    """Hold-out test of the classifier on 'datingTestSet2.txt'.

    The first 10% of the normalised records are classified against the
    remaining 90%; each prediction and the final error rate are printed.
    """
    ho_ratio = 0.10
    # Load the file into the feature matrix and the label list.
    dating_data_mat, dating_labels = file2matrix('datingTestSet2.txt')
    # Normalise; yields the new matrix, the (max - min) ranges and the minima.
    norm_mat, ranges, min_vals = autoNorm(dating_data_mat)
    m = norm_mat.shape[0]  # number of records
    num_test_vecs = int(m * ho_ratio)
    error_count = 0
    for i in range(num_test_vecs):
        classifier_result = classify0(norm_mat[i, :],
                                      norm_mat[num_test_vecs:m, :],
                                      dating_labels[num_test_vecs:m], 4)
        print('the classifier came back with:%s,the real answer is : %s'
              % (classifier_result, dating_labels[i]))
        if classifier_result != dating_labels[i]:
            error_count += 1.0
    # With fewer than 10 records there is nothing to test; report 0 instead
    # of dividing by zero.
    if num_test_vecs:
        error_rate = error_count / float(num_test_vecs)
    else:
        error_rate = 0.0
    print('the total error rate is :%f' % error_rate)


def classifyPerson():
    """Interactively predict how much the user will like a person.

    Prompts for three feature values, normalises them with the statistics of
    'datingTestSet2.txt' and prints the predicted category.
    """
    result_list = ['not at all', 'in small doses', 'in large doses']
    percent_tats = float(input('玩電子遊戲的時間百分比?'))
    ff_miles = float(input('每一年的飛行里程?'))
    ice_cream = float(input('每一年消費的冰淇淋量?'))
    # Load the file into the feature matrix and the label list.
    dating_data_mat, dating_labels = file2matrix('datingTestSet2.txt')
    # Normalise; yields the new matrix, the (max - min) ranges and the minima.
    norm_mat, ranges, minvals = autoNorm(dating_data_mat)
    # Build the query vector from the user's answers.
    in_arr = array([ff_miles, percent_tats, ice_cream])
    classifier_result = classify0((in_arr - minvals) / ranges,
                                  norm_mat, dating_labels, 3)
    # The result is used as a 1-based index into result_list.
    print('You will probably like this person:',
          result_list[classifier_result - 1])
# Handwritten digit recognition
def img2vector(filename):
    """Convert a 32x32 text image of digit characters into a 1x1024 vector.

    Args:
        filename: Path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A 1x1024 numpy array; element ``32 * row + col`` holds the integer
        value of the character at that row and column.
    """
    side = 32
    return_vect = zeros((1, side * side))  # zero-filled 1 x 1024 matrix
    # Use a context manager so the file handle is always released.
    with open(filename) as fr:
        for i in range(side):
            line_str = fr.readline()
            for j in range(side):
                return_vect[0, side * i + j] = int(line_str[j])
    return return_vect


def _digit_label(file_name_str):
    """Return the integer label encoded before the first '_' of a file name."""
    file_str = file_name_str.split('.')[0]
    return int(file_str.split('_')[0])


def handwritingClassTest():
    """Classify every file in 'testDigits' using 'trainingDigits' as training.

    The label of each file is taken from its name (the part before the first
    underscore). Each prediction, the number of errors and the error rate
    are printed.
    """
    # Local import: the original code called listdir() without importing it.
    import os

    hw_labels = []
    training_file_list = os.listdir('trainingDigits')  # training directory
    m = len(training_file_list)  # number of training files
    training_mat = zeros((m, 1024))  # zero-filled m x 1024 matrix
    for i, file_name_str in enumerate(training_file_list):
        hw_labels.append(_digit_label(file_name_str))
        # os.path.join replaces the hard-coded backslash, which was both an
        # invalid escape sequence ('\%') and Windows-only.
        training_mat[i, :] = img2vector(
            os.path.join('trainingDigits', file_name_str))

    test_file_list = os.listdir('testDigits')  # test directory
    error_count = 0.0
    m_test = len(test_file_list)  # number of test files
    for file_name_str in test_file_list:
        class_num_str = _digit_label(file_name_str)
        vector_under_test = img2vector(
            os.path.join('testDigits', file_name_str))
        classifier_result = classify0(vector_under_test, training_mat,
                                      hw_labels, 3)
        print('the classifier came back with : %s,the real answer is : %s'
              % (classifier_result, class_num_str))
        if classifier_result != class_num_str:
            error_count += 1.0
    print('\n the total number of errors is : %s ' % error_count)
    # An empty test directory would otherwise divide by zero.
    if m_test:
        error_rate = error_count / float(m_test)
    else:
        error_rate = 0.0
    print('\n the total error rate is : %s' % error_rate)