scikit-learn 庫:安裝需要先安裝 numpy、pandas 等依賴庫。
# Author: song
from sklearn.feature_extraction import DictVectorizer


def dictvec():
    """Demonstrate feature extraction from a list of dicts.

    Fits a ``DictVectorizer`` on four ``{'city', 'num'}`` records, then
    prints the encoded matrix, the generated feature names, and the
    records recovered by ``inverse_transform``.

    Returns:
        None. All results are printed.
    """
    records = [
        {'city': 'A市', 'num': 100},
        {'city': 'D市', 'num': 100},
        {'city': 'B市', 'num': 80},
        {'city': 'C市', 'num': 56},
    ]
    dict_vec = DictVectorizer(sparse=False)
    data = dict_vec.fit_transform(records)
    # sparse=False yields a dense ndarray; with the default sparse=True
    # this would be a scipy sparse matrix instead.
    print(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(dict_vec, "get_feature_names_out"):
        print(dict_vec.get_feature_names_out().tolist())
    else:
        print(dict_vec.get_feature_names())
    print(dict_vec.inverse_transform(data))
    return None


if __name__ == "__main__":
    dictvec()

# Output of the original run (key order inside each recovered dict may vary):
# [[   1.    0.    0.    0.  100.]
#  [   0.    0.    0.    1.  100.]
#  [   0.    1.    0.    0.   80.]
#  [   0.    0.    1.    0.   56.]]
# ['city=A市', 'city=B市', 'city=C市', 'city=D市', 'num']
# [{'num': 100.0, 'city=A市': 1.0}, {'city=D市': 1.0, 'num': 100.0}, {'city=B市': 1.0, 'num': 80.0}, {'city=C市': 1.0, 'num': 56.0}]
對於中文的特徵值化,需要先進行分詞處理(安裝 jieba,然後使用 jieba.cut('文本內容') 分詞)。
from sklearn.feature_extraction.text import CountVectorizer


def contentvec():
    """Demonstrate bag-of-words text feature extraction.

    Fits a ``CountVectorizer`` on three short English sentences and prints
    the document-term count matrix followed by the vocabulary.

    Returns:
        None. All results are printed.
    """
    # A list (not a set) keeps the documents in a fixed order, so row i of
    # the printed matrix always corresponds to sentence i.
    corpus = [
        'If the day is done ,',
        'If birds sing no more .',
        'If the wind has fiagged tired ',
    ]
    con_vec = CountVectorizer()
    data = con_vec.fit_transform(corpus)
    print(data.toarray())
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(con_vec, "get_feature_names_out"):
        print(con_vec.get_feature_names_out().tolist())
    else:
        print(con_vec.get_feature_names())
    return None


if __name__ == "__main__":
    contentvec()

# Output:
# [[0 1 1 0 0 1 1 0 0 0 1 0 0]
#  [1 0 0 0 0 1 0 1 1 1 0 0 0]
#  [0 0 0 1 1 1 0 0 0 0 1 1 1]]
# ['birds', 'day', 'done', 'fiagged', 'has', 'if', 'is', 'more', 'no', 'sing', 'the', 'tired', 'wind']
tf-idf 主要思想:如果某個詞或短語在一篇文章中出現的頻率高,並且在其他文章中很少出現,則認爲此詞或者短語具有很好的類別區分能力,適合用來分類。其作用是評估一個字詞對於一個文件集或者一個語料庫中的其中一份文件的重要程度。
from sklearn.feature_extraction.text import TfidfVectorizer


def tfvec():
    """Demonstrate tf-idf text feature extraction.

    Fits a ``TfidfVectorizer`` (no stop-word filtering) on three short
    English sentences and prints the tf-idf matrix followed by the
    vocabulary.

    Returns:
        None. All results are printed.
    """
    # A list (not a set) keeps the documents in a fixed order, so row i of
    # the printed matrix always corresponds to sentence i.
    corpus = [
        'If the day is done ,',
        'If birds sing no more .',
        'If the wind has fiagged tired ',
    ]
    tf_vec = TfidfVectorizer(stop_words=None)
    data = tf_vec.fit_transform(corpus)
    print(data.toarray())
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tf_vec, "get_feature_names_out"):
        print(tf_vec.get_feature_names_out().tolist())
    else:
        print(tf_vec.get_feature_names())
    return None


if __name__ == "__main__":
    tfvec()

# Output (rows follow the order of the sentences in `corpus`):
# [[0.         0.50461134 0.50461134 0.         0.         0.29803159 0.50461134 0.         0.         0.         0.38376993 0.         0.        ]
#  [0.47952794 0.         0.         0.         0.         0.28321692 0.         0.47952794 0.47952794 0.47952794 0.         0.         0.        ]
#  [0.         0.         0.         0.45050407 0.45050407 0.26607496 0.         0.         0.         0.         0.34261996 0.45050407 0.45050407]]
# ['birds', 'day', 'done', 'fiagged', 'has', 'if', 'is', 'more', 'no', 'sing', 'the', 'tired', 'wind']
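公式:$X' = \dfrac{x - \min}{\max - \min}$,$X'' = X' \cdot (mx - mi) + mi$,其中 min、max 分別爲該列的最小值和最大值,mx、mi 分別爲指定區間的上界和下界(默認 mx = 1,mi = 0)。在特定的場景下最大值和最小值是變化的,另外最大值與最小值非常容易受異常點影響,所以這種方法魯棒性較差。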
from sklearn.preprocessing import MinMaxScaler


def mm():
    """Print a small sample matrix after min-max scaling each column.

    Returns:
        None. The scaled matrix is printed.
    """
    samples = [
        [90, 2, 10, 40],
        [60, 5, 15, 20],
        [80, 3, 12, 30],
    ]
    # feature_range sets the interval every column is rescaled into.
    scaler = MinMaxScaler(feature_range=(0, 1))
    print(scaler.fit_transform(samples))


if __name__ == "__main__":
    mm()

# Output:
# [[1.         0.         0.         1.        ]
#  [0.         1.         1.         0.        ]
#  [0.66666667 0.33333333 0.4        0.5       ]]
公式:$X' = \dfrac{x - \mu}{\sigma}$,其中 µ 爲該列的平均值,σ 爲該列的標準差。
from sklearn.preprocessing import StandardScaler


def stand():
    """Print a small sample matrix after standardizing each column.

    Returns:
        None. The standardized matrix is printed.
    """
    samples = [
        [1, -1, 3],
        [2, 3, 2],
        [4, 5, 6],
    ]
    scaler = StandardScaler()
    standardized = scaler.fit_transform(samples)
    print(standardized)


if __name__ == "__main__":
    stand()

# Output:
# [[-1.06904497 -1.33630621 -0.39223227]
#  [-0.26726124  0.26726124 -0.98058068]
#  [ 1.33630621  1.06904497  1.37281295]]
from sklearn.feature_selection import VarianceThreshold


def var():
    """Print a small sample matrix with its zero-variance columns removed.

    Returns:
        None. The reduced matrix is printed.
    """
    samples = [
        [0, 2, 0, 3],
        [0, 1, 4, 3],
        [0, 1, 1, 3],
    ]
    # threshold=0 drops the columns whose variance is 0 (constant columns).
    selector = VarianceThreshold(threshold=0)
    print(selector.fit_transform(samples))


if __name__ == "__main__":
    var()

# Output:
# [[2 0]
#  [1 4]
#  [1 1]]
from sklearn.decomposition import PCA


def pca():
    """Print a small sample matrix after PCA dimensionality reduction.

    Returns:
        None. The projected data is printed.
    """
    samples = [
        [5, 2, 0, 3],
        [9, 1, 4, 3],
        [5, 1, 1, 3],
    ]
    # A fractional n_components is the share of information to retain;
    # an integer is the number of features (components) to keep.
    reducer = PCA(n_components=0.9)
    projected = reducer.fit_transform(samples)
    print(projected)


if __name__ == '__main__':
    pca()

# Output:
# [[-2.16802239]
#  [ 3.55798483]
#  [-1.38996267]]