import numpy as np import scipy as sp import pandas as pd import matplotlib.pyplot as plt
# Hold out 20% of each frame for testing. Features = all columns but the
# last; target = the last column.
# Fixes from review: `sklearn.cross_validation` was removed in
# scikit-learn 0.20 (use `model_selection`), and pandas `.ix` was
# removed (these integer slices are positional, so `.iloc` is the
# exact replacement).
# NOTE(review): the second split immediately overwrites the first --
# the `customer` split is discarded. Presumably these were separate
# notebook cells run one at a time; kept as-is so `x_train`/... end up
# holding the `order` split, same as before.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    customer.iloc[:, :-1], customer.iloc[:, -1], test_size=0.2
)
x_train, x_test, y_train, y_test = train_test_split(
    order.iloc[:, :-1], order.iloc[:, -1], test_size=0.2
)
from scipy.stats import pearsonr


def _pearson_table(df):
    """Pearson r and p-value of each feature column vs. the last column.

    The last column of *df* is treated as the target. Returns a
    DataFrame with columns ['Features', 'Pearson', 'Pvalue']. The
    target's own row carries NaN statistics (its name is listed but it
    is never correlated with itself), matching the original
    positional-concat behaviour.
    """
    target = df.iloc[:, -1]
    stats = []
    for i in range(df.columns.size - 1):
        # Materialise as a plain (r, p) tuple so newer scipy result
        # objects still frame into two columns.
        r, p = pearsonr(df.iloc[:, i], target)
        stats.append((r, p))
    table = pd.concat(
        [pd.DataFrame(df.columns.values.tolist()), pd.DataFrame(stats)],
        axis=1,
    )
    table.columns = ['Features', 'Pearson', 'Pvalue']
    return table


# Screen every `order` feature against its target column and persist.
# (The bare `result` expression in the original only displayed the frame
# in a REPL; it is a no-op in a script and has been dropped.)
result = _pearson_table(order)
result.to_csv('result.csv', index=True, header=True)
from scipy.stats import pearsonr

# Same Pearson screen as the `order` cell above, but for `customer`.
# NOTE(review): this overwrites the result.csv written by the previous
# cell -- presumably each cell's output was inspected before the next
# was run; confirm whether distinct filenames were intended.
# (The bare `result` expression in the original only displayed the frame
# in a REPL; it is a no-op in a script and has been dropped.)
prr = []
target = customer.iloc[:, -1]
for i in range(customer.columns.size - 1):
    # Materialise as a plain (r, p) tuple so newer scipy result objects
    # still frame into two columns.
    r, p = pearsonr(customer.iloc[:, i], target)
    prr.append((r, p))
result = pd.concat(
    [pd.DataFrame(customer.columns.values.tolist()), pd.DataFrame(prr)],
    axis=1,
)
result.columns = ['Features', 'Pearson', 'Pvalue']
result.to_csv('result.csv', index=True, header=True)
# Fit both a regressor and a classifier on the training split.
# Fix from review: the original rebound `clf` from the fitted regressor
# to the classifier, silently discarding the regressor. The regressor
# now keeps its own name; `clf` still ends up as the fitted classifier,
# exactly as before.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

reg = RandomForestRegressor()
reg.fit(x_train, y_train)

# NOTE(review): n_jobs=100 requests 100 parallel workers; n_jobs=-1
# ("use all cores") is the conventional choice -- confirm 100 was
# intentional.
clf = RandomForestClassifier(n_jobs=100)
clf.fit(x_train, y_train)
from minepy import MINE

# Maximal Information Coefficient of each feature vs. column 34.
# (assumes column 34 is the target of `customer` -- TODO confirm.)
# Fixes from review: the MINE instance was never constructed (`m` was
# undefined); compute_score() returns None -- the score is read via
# m.mic() after the call; results were appended to the Pearson list
# `prr` from an earlier cell instead of `mic`; and three column names
# were assigned to a two-column frame (which raises) with labels
# borrowed from the Pearson cell.
mine = MINE()
mic = []
target = customer.iloc[:, 34]
for i in range(customer.columns.size - 1):
    mine.compute_score(customer.iloc[:, i], target)
    mic.append(mine.mic())
result = pd.concat(
    [pd.DataFrame(customer.columns.values.tolist()), pd.DataFrame(mic)],
    axis=1,
)
result.columns = ['Features', 'MIC']
result.to_csv('result.csv', index=True, header=True)
# Full pairwise correlation matrix of `customer`, persisted for
# inspection. NOTE(review): also overwrites result.csv from earlier
# cells.
corr = customer.corr()
corr.to_csv('result.csv', index=True, header=True)

# Correlation of every cus_call column with the target column `tar`.
# Fix from review: the original per-column lambda, `x.corr(x['tar'])`,
# indexed each column Series by the *row label* 'tar' (not the target
# column) and was broken; `corrwith` is the intended computation and is
# kept as the single implementation.
tar_with_features = cus_call.corrwith(cus_call.tar)
係數反映每一個特徵的影響力，越大表示該特徵在分類中起到的作用越大。
# Rank features by Random-Forest importance, largest first, and persist.
# Fix from review: the original sorted(zip(names, scores), reverse=True)
# compares the *feature name* first, producing a reverse-alphabetical
# listing rather than an importance ranking; sort on the score instead.
importances = pd.DataFrame(
    sorted(
        zip(x_train.columns, (round(v, 4) for v in clf.feature_importances_)),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
importances.columns = ['Features', 'Importance']
importances.to_csv('result.csv', index=True, header=True)