'''
【課程2.4】 數據連續屬性離散化

將連續屬性變換成分類屬性,即連續屬性離散化。
在數值的取值範圍內設定若干個離散劃分點,將取值範圍劃分爲一些離散化的區間,
最後用不同的符號或整數值代表落在每個子區間中的數據值。

等寬法 / 等頻法
'''
import numpy as np import pandas as pd import matplotlib.pyplot as plt % matplotlib inline
# Binning with pd.cut.
# Passing an integer as `bins` splits the value range into that many
# equal-width intervals; passing a list of edges (as done below) uses those
# edges as the interval boundaries.

# A set of ages to be grouped into "18 to 25", "26 to 35", "36 to 60" and
# "over 60".
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# pd.cut on a plain list returns a Categorical object.
cats = pd.cut(ages, bins)
print(cats)
print(type(cats))
print('-------')

# codes: integer 0-3 per value, identifying which of the four intervals the
# value fell into (a numpy ndarray).
print(cats.codes, type(cats.codes))
# categories: the four intervals themselves (a pandas Index).
print(cats.categories, type(cats.categories))
# Number of values per interval, sorted by count.
# The top-level pd.value_counts() is deprecated, so wrap the Categorical in a
# Series and use the method instead.
print(pd.Series(cats).value_counts())
print('-------')

# The `right` keyword argument selects which side of each interval is closed;
# it defaults to True (right-closed). right=False makes them left-closed.
print(pd.cut(ages, [18, 26, 36, 61, 100], right=False))
print('-------')

# Custom interval names can be supplied through the `labels` argument.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
print(pd.cut(ages, bins, labels=group_names))
print('-------')

# Discretise a DataFrame column and count the rows in each interval.
df = pd.DataFrame({'ages': ages})
# pd.cut(df['ages'], 5) would instead split the data into 5 equal-width bins.
s = pd.cut(df['ages'], bins)
df['label'] = s
# sort=False keeps the counts in interval order rather than sorting by count.
cut_counts = s.value_counts(sort=False)
print(df)
print(cut_counts)

# Scatter plot of the ages, coloured by bin code.
# Note that the codes come from the Categorical object created above.
plt.scatter(df.index, df['ages'], cmap='Reds', c=cats.codes)
plt.grid()
輸出:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]] Length: 12 Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]] <class 'pandas.core.categorical.Categorical'> ------- [0 0 0 1 0 0 2 1 3 2 2 1] <class 'numpy.ndarray'> Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object') <class 'pandas.indexes.base.Index'> (18, 25] 5 (35, 60] 3 (25, 35] 3 (60, 100] 1 dtype: int64 ------- [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)] Length: 12 Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)] ------- [Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult] Length: 12 Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior] ------- ages label 0 20 (18, 25] 1 22 (18, 25] 2 25 (18, 25] 3 27 (25, 35] 4 21 (18, 25] 5 23 (18, 25] 6 37 (35, 60] 7 31 (25, 35] 8 61 (60, 100] 9 45 (35, 60] 10 41 (35, 60] 11 32 (25, 35] (18, 25] 5 (25, 35] 3 (35, 60] 3 (60, 100] 1 Name: ages, dtype: int64
# Equal-frequency binning with pd.qcut: bin edges are chosen from sample
# quantiles so that each interval receives roughly the same number of records.

# 1000 samples drawn from a standard normal distribution (unseeded, so the
# values differ from run to run).
data = np.random.randn(1000)
s = pd.Series(data)

# Cut at the quartiles; pd.qcut(data, 10) would cut at the deciles instead.
cats = pd.qcut(s, 4)
print(cats.head())
# `cats` is a Series here, so the value_counts method replaces the deprecated
# top-level pd.value_counts().
print(cats.value_counts())
print('------')
# qcut bins by sample quantiles, giving bins of roughly equal size; it does
# not guarantee that every bin holds exactly the same number of values.
# Custom quantiles (numbers between 0 and 1, endpoints included) can also be
# given, e.g. pd.qcut(data, [0, 0.1, 0.5, 0.9, 1]).

# Scatter plot of the samples, coloured by quartile code.
# The codes are taken from the categorical result computed above rather than
# running qcut a second time on the same values.
plt.scatter(s.index, s, cmap='Greens', c=cats.cat.codes.values)
plt.xlim([0, 1000])
plt.grid()
輸出:
0 (-0.0689, 0.665] 1 [-3.0201, -0.746] 2 (-0.746, -0.0689] 3 (-0.746, -0.0689] 4 (0.665, 2.9] dtype: category Categories (4, object): [[-3.0201, -0.746] < (-0.746, -0.0689] < (-0.0689, 0.665] < (0.665, 2.9]] (0.665, 2.9] 250 (-0.0689, 0.665] 250 (-0.746, -0.0689] 250 [-3.0201, -0.746] 250 dtype: int64 ------