數據分析~手機價格預測

config.py


"""

文件名: config.py
功能: 配置文件

"""
import os

# Directory that holds the input dataset
dataset_path = './data'

# Directory where results are written; created on import if it is missing.
# exist_ok=True replaces the check-then-create pair (os.path.exists followed
# by os.makedirs), which fails if the directory appears between the two calls.
output_path = './output'
os.makedirs(output_path, exist_ok=True)

# Columns used as model input features
feat_cols = [
    'battery_power',
    'blue',
    'clock_speed',
    'dual_sim',
    'fc',
    'four_g',
    'int_memory',
    'm_dep',
    'mobile_wt',
    'n_cores',
    'pc',
    'px_height',
    'px_width',
    'ram',
    'sc_h',
    'sc_w',
    'talk_time',
    'three_g',
    'touch_screen',
    'wifi',
]

# Column holding the label to predict
label_col = 'price_range'


=========================================================================

main.py


"""

文件名: main.py
功能: 主程序

案例:手機價格預測
任務:使用scikit-learn創建不一樣的機器學習模型進行手機價格等級預測

數據集來源: https://www.kaggle.com/vikramb/mobile-price-eda-prediction

"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import config

# Work around matplotlib not rendering Chinese text with its default font.
# The PingFang font file referenced below ships with macOS only.
def get_chinese_font():
    """
    Return the font properties used for the Chinese plot labels.

    Returns:
        FontProperties pointing at the macOS PingFang font when that file
        exists; otherwise matplotlib's default FontProperties, so the
        plotting code still runs on other systems (Chinese glyphs may then
        not render correctly).
    """
    font_path = '/System/Library/Fonts/PingFang.ttc'
    if os.path.isfile(font_path):
        return FontProperties(fname=font_path)
    # Do not hand matplotlib a path that does not exist on this machine
    return FontProperties()

def inspect_dataset(train_data, test_data):
    """
    Print the size of both splits and plot their label distributions.

    Draws two side-by-side count plots of the label column (training set on
    the left, test set on the right) that share one y-axis, saves the figure
    to './inspect_dataset.png' and then shows it.

    Args:
        train_data: training split; must contain the column config.label_col.
        test_data: test split; must contain the column config.label_col.
    """
    print('\n===================== 數據查看 =====================')
    print('訓練數據集有{}條記錄'.format(len(train_data)))
    print('測試數據集有{}條記錄'.format(len(test_data)))

    # Build the font once instead of once per title/axis label
    font = get_chinese_font()

    plt.figure(figsize=(10, 5))

    shared_ax = None
    panels = (('訓練集', train_data), ('測試集', test_data))
    for position, (title, data) in enumerate(panels, start=1):
        # The second panel reuses the first panel's y-axis so the bar
        # heights of the two splits are directly comparable
        ax = plt.subplot(1, 2, position, sharey=shared_ax)
        if shared_ax is None:
            shared_ax = ax

        # countplot groups the rows by label value and draws one bar per group
        sns.countplot(x=config.label_col, data=data)

        plt.title(title, fontproperties=font)
        plt.xlabel('價格等級', fontproperties=font)
        plt.ylabel('數量', fontproperties=font)

    plt.tight_layout()
    plt.savefig('./inspect_dataset.png')
    plt.show()

def train_model(X_train, y_train, X_test, y_test, param_range, model_name='SVM'):
    """
    Train one model per parameter value and return the most accurate one.

    Args:
        X_train, y_train: features and labels passed to ``fit``.
        X_test, y_test: features and labels passed to ``score``. Note that
            the best model is selected on this same data.
        param_range: iterable of values for the model's tuned parameter.
        model_name: model to build, case-insensitive, default 'SVM':
            'KNN' - k-nearest neighbours, tunes n_neighbors
            'LR'  - logistic regression, tunes C
            'SVM' - SVM with a linear kernel, tunes C
            'DT'  - decision tree, tunes max_depth

    Returns:
        Tuple ``(best_model, best_acc, mean_duration)``:
        1. the fitted model with the highest score (first one on ties)
        2. that model's score
        3. the mean ``fit`` time in seconds over all parameter values

    Raises:
        ValueError: if model_name is not one of the supported names or
            param_range is empty.
    """
    # Lambdas defer the estimator construction until a parameter is known
    builders = {
        'KNN': ('訓練KNN(k={})...',
                lambda value: KNeighborsClassifier(n_neighbors=value)),
        'LR': ('訓練Logistic Regression(C={})...',
               lambda value: LogisticRegression(C=value)),
        'SVM': ('訓練SVM(C={})...',
                lambda value: SVC(kernel='linear', C=value)),
        'DT': ('訓練決策樹(max_depth={})...',
               lambda value: DecisionTreeClassifier(max_depth=value)),
    }

    key = str(model_name).upper()
    if key not in builders:
        # Previously an unknown name left `model` undefined (NameError)
        raise ValueError(
            'Unknown model_name {!r}; expected one of {}'.format(
                model_name, sorted(builders)))
    banner, build = builders[key]

    params = list(param_range)
    if not params:
        raise ValueError('param_range must contain at least one value')

    models = []
    scores = []
    durations = []

    for param in params:
        print(banner.format(param), end='')
        model = build(param)

        # Time only the fit call; perf_counter is meant for measuring intervals
        start = time.perf_counter()
        model.fit(X_train, y_train)
        duration = time.perf_counter() - start
        print('耗時{:.4f}s'.format(duration), end=', ')

        # Evaluate on the held-out data
        score = model.score(X_test, y_test)
        print('準確率:{:.3f}'.format(score))

        models.append(model)
        durations.append(duration)
        scores.append(score)

    mean_duration = np.mean(durations)
    print('訓練模型平均耗時{:.4f}s'.format(mean_duration))
    print()

    # Keep the model with the highest score
    best_idx = int(np.argmax(scores))
    best_acc = scores[best_idx]
    best_model = models[best_idx]

    return best_model, best_acc, mean_duration


def main():
    """
    Run the whole experiment: load, inspect, train, compare and plot.

    Reads 'data.csv' from config.dataset_path, splits it 2/3 train and
    1/3 test with a fixed random_state, trains every model family over its
    parameter list, writes the comparison table to config.output_path and
    saves a bar chart of accuracy and mean training time to
    './pred_results.png'.
    """
    # Load the data and split it reproducibly
    all_data = pd.read_csv(os.path.join(config.dataset_path, 'data.csv'))
    train_data, test_data = train_test_split(all_data, test_size=1/3, random_state=10)

    # Inspect the data
    inspect_dataset(train_data, test_data)

    # Build the feature matrices (2-D arrays)
    feat_names = config.feat_cols
    X_train = train_data[feat_names].values
    print('共有{}維特徵。'.format(X_train.shape[1]))
    X_test = test_data[feat_names].values

    # Build the label vectors
    y_train = train_data[config.label_col].values
    y_test = test_data[config.label_col].values

    # Modelling and validation
    print('\n===================== 數據建模及驗證 =====================')
    model_name_param_dict = {
        'KNN': [5, 10, 15],
        'LR': [0.01, 1, 100],
        'SVM': [0.01, 1, 100],
        'DT': [50, 100, 150],
    }

    # Comparison table. dtype=float keeps the columns numeric; without it
    # the empty frame has object columns, and DataFrame.plot refuses object
    # columns as non-numeric data.
    results_df = pd.DataFrame(columns=['Accuracy(%)', 'Time(s)'],
                              index=list(model_name_param_dict.keys()),
                              dtype=float)
    results_df.index.name = 'Model'
    for model_name, param_range in model_name_param_dict.items():
        # The fitted model itself is not needed here, only its metrics
        _, best_acc, mean_duration = train_model(X_train, y_train, X_test, y_test,
                                                 param_range, model_name)
        results_df.loc[model_name, 'Accuracy(%)'] = best_acc * 100
        results_df.loc[model_name, 'Time(s)'] = mean_duration

    # NOTE(review): the filename keeps its original spelling ("coparison")
    # so anything that already reads this file still finds it
    results_df.to_csv(os.path.join(config.output_path, 'model_coparison.csv'))

    # Compare the models
    print('\n===================== 模型及結果比較 =====================')

    font = get_chinese_font()

    plt.figure(figsize=(10, 4))
    ax1 = plt.subplot(1, 2, 1)
    results_df.plot(y=['Accuracy(%)'], kind='bar', ylim=[60, 100], ax=ax1, legend=False)
    plt.title('準確度(%)', fontproperties=font)

    ax2 = plt.subplot(1, 2, 2)
    results_df.plot(y=['Time(s)'], kind='bar', ax=ax2, legend=False)
    plt.title('耗時(s)', fontproperties=font)

    plt.tight_layout()
    plt.savefig('./pred_results.png')
    plt.show()


# Run the experiment only when this file is executed as a script
if __name__ == '__main__':
    main()


=========================================================================
 

/usr/local/bin/python3.6 /Users/apple/PycharmProjects/xxlec04_pro/main.py

 

===================== 數據查看 =====================
訓練數據集有1333條記錄
測試數據集有667條記錄
共有20維特徵。

 

===================== 數據建模及驗證 =====================
訓練KNN(k=5)...耗時0.0026s, 準確率:0.922
訓練KNN(k=10)...耗時0.0009s, 準確率:0.910
訓練KNN(k=15)...耗時0.0010s, 準確率:0.927
訓練模型平均耗時0.0015s

 

訓練Logistic Regression(C=0.01)...耗時0.0599s, 準確率:0.664
訓練Logistic Regression(C=1)...耗時0.1010s, 準確率:0.735
訓練Logistic Regression(C=100)...耗時0.1212s, 準確率:0.790
訓練模型平均耗時0.0940s

 

訓練SVM(C=0.01)...耗時0.3104s, 準確率:0.972
訓練SVM(C=1)...耗時13.7091s, 準確率:0.966
訓練SVM(C=100)...耗時36.8891s, 準確率:0.969
訓練模型平均耗時16.9695s

 

訓練決策樹(max_depth=50)...耗時0.0122s, 準確率:0.798
訓練決策樹(max_depth=100)...耗時0.0077s, 準確率:0.796
訓練決策樹(max_depth=150)...耗時0.0076s, 準確率:0.804
訓練模型平均耗時0.0092s

 


===================== 模型及結果比較 =====================

 

Process finished with exit code 0

 

相關文章
相關標籤/搜索