數據分析~手機價格預測

時間 2019-11-11

標籤：數據分析　手機價格預測　（简体版）

原文：原文鏈接

config.py（python）


"""

    文件名:    config.py
    功能：     配置文件

"""
import os

# Directory that holds the input dataset.
dataset_path = './data'

# Directory where results are written; it is created when this module is
# imported. exist_ok=True replaces the separate os.path.exists() check, which
# could race with another process creating the same directory.
output_path = './output'
os.makedirs(output_path, exist_ok=True)

# Feature columns used for modelling.
feat_cols = [
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
    'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
    'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
    'touch_screen', 'wifi',
]

# Label column.
label_col = 'price_range'


=========================================================================

main.py


"""

    文件名:    main.py
    功能：     主程序

    案例：手機價格預測
    任務：使用scikit-learn創建不一樣的機器學習模型進行手機價格等級預測

    數據集來源： https://www.kaggle.com/vikramb/mobile-price-eda-prediction

"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import config

# Work around matplotlib not rendering Chinese text with its default font.
# The default font path below is the macOS PingFang font; on machines where
# that file is absent the function falls back to matplotlib's default font.
def get_chinese_font(font_path='/System/Library/Fonts/PingFang.ttc'):
    """
    Return the font properties used for Chinese titles and axis labels.

    Args:
        font_path: path to a font file with Chinese glyphs. Defaults to the
            macOS PingFang font, so existing zero-argument calls are unchanged.

    Returns:
        ``FontProperties`` bound to ``font_path`` when that file exists,
        otherwise a default ``FontProperties()``. With the fallback, Chinese
        glyphs may not render, but plotting is not blocked by a missing file.
    """
    if os.path.exists(font_path):
        return FontProperties(fname=font_path)
    # NOTE(review): pointing FontProperties at a missing file presumably fails
    # only later, when text is drawn -- falling back here avoids that.
    return FontProperties()

def inspect_dataset(train_data, test_data):
    """
    Print the size of both splits and plot their per-class sample counts.

    The two count plots are drawn side by side with a shared y axis, saved to
    ``./inspect_dataset.png`` and then shown.

    Args:
        train_data: training split; must contain the ``config.label_col`` column.
        test_data: test split; must contain the ``config.label_col`` column.
    """
    print('\n===================== 數據查看 =====================')
    print('訓練數據集有{}條記錄'.format(len(train_data)))
    print('測試數據集有{}條記錄'.format(len(test_data)))

    # Build the font once instead of once per title/label.
    font = get_chinese_font()
    # Use the configured label column so this stays in sync with main().
    label = config.label_col

    def decorate(title):
        # Title and axis labels are identical for both panels except the title.
        plt.title(title, fontproperties=font)
        plt.xlabel('價格等級', fontproperties=font)
        plt.ylabel('數量', fontproperties=font)

    plt.figure(figsize=(10, 5))

    # Training split: countplot groups rows by label and draws one bar per class.
    train_axis = plt.subplot(1, 2, 1)
    sns.countplot(x=label, data=train_data)
    decorate('訓練集')

    # Test split, sharing the y axis so bar heights are directly comparable.
    plt.subplot(1, 2, 2, sharey=train_axis)
    sns.countplot(x=label, data=test_data)
    decorate('測試集')

    plt.tight_layout()
    plt.savefig('./inspect_dataset.png')
    plt.show()

def train_model(X_train, y_train, X_test, y_test, param_range, model_name='SVM'):
    """
    Train one model per hyper-parameter value and return the most accurate one.

    Args:
        X_train, y_train: training features and labels, passed to ``fit``.
        X_test, y_test: evaluation features and labels, passed to ``score``.
        param_range: iterable of hyper-parameter values to try; must not be
            empty.
        model_name: model to build (case-insensitive), default ``'SVM'``:
            'KNN' -- KNeighborsClassifier, the parameter is n_neighbors
            'LR'  -- LogisticRegression, the parameter is C
            'SVM' -- SVC with a linear kernel, the parameter is C
            'DT'  -- DecisionTreeClassifier, the parameter is max_depth

    Returns:
        A tuple ``(best_model, best_acc, mean_duration)``:
        1. the fitted model with the highest score (first one on ties),
        2. that model's score on the evaluation data,
        3. the mean ``fit`` time in seconds over all parameter values.

    Raises:
        ValueError: if ``model_name`` is not one of the supported models, or
            if ``param_range`` is empty.
    """
    # Progress message and constructor per model. The lambdas defer the class
    # lookup until a model is actually built.
    builders = {
        'KNN': ('訓練KNN（k={}）...',
                lambda value: KNeighborsClassifier(n_neighbors=value)),
        'LR': ('訓練Logistic Regression（C={}）...',
               lambda value: LogisticRegression(C=value)),
        'SVM': ('訓練SVM（C={}）...',
                lambda value: SVC(kernel='linear', C=value)),
        'DT': ('訓練決策樹（max_depth={}）...',
               lambda value: DecisionTreeClassifier(max_depth=value)),
    }

    # Fail early with a clear message instead of hitting an unbound ``model``.
    key = str(model_name).upper()
    if key not in builders:
        raise ValueError('unknown model_name {!r}; expected one of {}'.format(
            model_name, sorted(builders)))

    params = list(param_range)
    if not params:
        raise ValueError('param_range must contain at least one value')

    banner, build = builders[key]
    models = []
    scores = []
    durations = []

    for param in params:
        print(banner.format(param), end='')
        model = build(param)

        # Time only the fit call.
        start = time.time()
        model.fit(X_train, y_train)
        duration = time.time() - start
        print('耗時{:.4f}s'.format(duration), end=', ')

        # Evaluate on the held-out data.
        score = model.score(X_test, y_test)
        print('準確率：{:.3f}'.format(score))

        models.append(model)
        durations.append(duration)
        scores.append(score)

    mean_duration = np.mean(durations)
    print('訓練模型平均耗時{:.4f}s'.format(mean_duration))
    print()

    # Pick the best-scoring model; argmax returns the first index on ties.
    best_idx = int(np.argmax(scores))
    return models[best_idx], scores[best_idx], mean_duration


def main():
    """
    Run the whole pipeline: load the data, inspect it, train and compare models.

    Reads ``data.csv`` from ``config.dataset_path``, splits it 2/3 train and
    1/3 test with a fixed random state, trains every configured model via
    ``train_model``, writes the comparison table to ``config.output_path`` and
    saves/shows bar charts of accuracy and mean training time.
    """
    # Load and split the data.
    all_data = pd.read_csv(os.path.join(config.dataset_path, 'data.csv'))
    train_data, test_data = train_test_split(all_data, test_size=1/3, random_state=10)

    # Inspect the data.
    inspect_dataset(train_data, test_data)

    # Build feature arrays.
    feat_names = config.feat_cols
    X_train = train_data[feat_names].values
    print('共有{}維特徵。'.format(X_train.shape[1]))
    X_test = test_data[feat_names].values

    # Build label arrays.
    y_train = train_data[config.label_col].values
    y_test = test_data[config.label_col].values

    # Train and evaluate each model over its hyper-parameter values.
    print('\n===================== 數據建模及驗證 =====================')
    model_name_param_dict = {
        'KNN': [5, 10, 15],
        'LR': [0.01, 1, 100],
        'SVM': [0.01, 1, 100],
        'DT': [50, 100, 150]
    }

    # Comparison table, one row per model. An explicit float dtype keeps the
    # columns numeric rather than object, so they can be plotted directly.
    results_df = pd.DataFrame(columns=['Accuracy(%)', 'Time(s)'],
                              index=list(model_name_param_dict.keys()),
                              dtype=float)
    results_df.index.name = 'Model'
    for model_name, param_range in model_name_param_dict.items():
        # The fitted best model itself is not needed here, only its metrics.
        _, best_acc, mean_duration = train_model(X_train, y_train, X_test, y_test,
                                                 param_range, model_name)
        results_df.loc[model_name, 'Accuracy(%)'] = best_acc * 100
        results_df.loc[model_name, 'Time(s)'] = mean_duration

    # NOTE(review): 'model_coparison.csv' looks like a typo for
    # 'model_comparison.csv'; kept as-is so the output file name does not change.
    results_df.to_csv(os.path.join(config.output_path, 'model_coparison.csv'))

    # Compare the models visually.
    print('\n===================== 模型及結果比較 =====================')

    # Build the font once for both titles.
    font = get_chinese_font()

    plt.figure(figsize=(10, 4))
    ax1 = plt.subplot(1, 2, 1)
    results_df.plot(y=['Accuracy(%)'], kind='bar', ylim=[60, 100], ax=ax1, legend=False)
    plt.title('準確度(%)', fontproperties=font)

    ax2 = plt.subplot(1, 2, 2)
    results_df.plot(y=['Time(s)'], kind='bar', ax=ax2, legend=False)
    plt.title('耗時(s)', fontproperties=font)

    plt.tight_layout()
    plt.savefig('./pred_results.png')
    plt.show()


if __name__ == '__main__':
    main()


=========================================================================

/usr/local/bin/python3.6 /Users/apple/PycharmProjects/xxlec04_pro/main.py

===================== 數據查看 =====================
訓練數據集有1333條記錄
測試數據集有667條記錄
共有20維特徵。

===================== 數據建模及驗證 =====================
訓練KNN（k=5）...耗時0.0026s, 準確率：0.922
訓練KNN（k=10）...耗時0.0009s, 準確率：0.910
訓練KNN（k=15）...耗時0.0010s, 準確率：0.927
訓練模型平均耗時0.0015s

訓練Logistic Regression（C=0.01）...耗時0.0599s, 準確率：0.664
訓練Logistic Regression（C=1）...耗時0.1010s, 準確率：0.735
訓練Logistic Regression（C=100）...耗時0.1212s, 準確率：0.790
訓練模型平均耗時0.0940s

訓練SVM（C=0.01）...耗時0.3104s, 準確率：0.972
訓練SVM（C=1）...耗時13.7091s, 準確率：0.966
訓練SVM（C=100）...耗時36.8891s, 準確率：0.969
訓練模型平均耗時16.9695s

訓練決策樹（max_depth=50）...耗時0.0122s, 準確率：0.798
訓練決策樹（max_depth=100）...耗時0.0077s, 準確率：0.796
訓練決策樹（max_depth=150）...耗時0.0076s, 準確率：0.804
訓練模型平均耗時0.0092s

===================== 模型及結果比較 =====================

Process finished with exit code 0