R 基於樸素貝葉斯模型實現手機垃圾短信過濾

時間 2019-11-29

標籤：基於樸素貝葉斯模型實現手機垃圾短信過濾（简体版）

原文原文鏈接

# Read the raw SMS data and inspect its structure.
# stringsAsFactors = FALSE keeps the message text as character vectors.
df_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# Fail early with a clear error if the columns used below are missing.
stopifnot(all(c("type", "text") %in% names(df_raw)))
str(df_raw)
length(df_raw$type)

# Split the data into the features X (the message text) and the class-label
# vector y, converting y to a factor.
X <- df_raw$text
y <- factor(df_raw$type)
length(y)

# Inspect the structure and class composition of the label vector y.
str(y)
table(y)

# Install (once) and load the text-mining packages.
# install.packages("tm")
library(NLP)
library(tm)

# Build the corpus, one document per element of X.
X_corpus <- VCorpus(VectorSource(X))
 ######## 1 Clean the text data ########
# Load SnowballC ahead of the stemming step (1.5).
# install.packages("SnowballC")
library(SnowballC)

# Steps 1.1-1.6 are applied to the corpus strictly in the order listed:
# each step receives the corpus produced by the previous one.
X_corpus_clean <- Reduce(
  function(corpus, step) step(corpus),
  list(
    # 1.1 Convert all letters to lower case.
    function(corpus) tm_map(corpus, content_transformer(tolower)),
    # 1.2 Remove digits.
    function(corpus) tm_map(corpus, removeNumbers),
    # 1.3 Remove stop words.
    function(corpus) tm_map(corpus, removeWords, stopwords()),
    # 1.4 Remove punctuation.
    function(corpus) tm_map(corpus, removePunctuation),
    # 1.5 Reduce every word to its stem.
    function(corpus) tm_map(corpus, stemDocument),
    # 1.6 Collapse the extra whitespace left behind by the removals.
    function(corpus) tm_map(corpus, stripWhitespace)
  ),
  X_corpus
)

# 1.7 Split the documents into terms and build the document-term matrix.
X_dtm <- DocumentTermMatrix(X_corpus_clean)

############# 2 Prepare the input data #############
# 2.1 Split into a training set and a test set.
# Per the original author's note, the rows of df_raw are already in random
# order, so the first 75% are taken as TRAINING data and the remainder as test
# data. The split point is derived from the data size instead of being
# hard-coded; for the 5559-row data set this yields rows 1-4169 for training
# and rows 4170-5559 for testing, exactly as before.
n_total <- nrow(X_dtm)
stopifnot(length(y) == n_total)
n_train <- floor(0.75 * n_total)
train_idx <- seq_len(n_train)
# setdiff() stays correct (empty) even if every row ends up in training.
test_idx <- setdiff(seq_len(n_total), train_idx)

X_dtm_train <- X_dtm[train_idx, ]
X_dtm_test <- X_dtm[test_idx, ]
y_train <- y[train_idx]
y_test <- y[test_idx]

# 2.2 Check that the class proportions are similar in both subsets.
prop.table(table(y_train))
prop.table(table(y_test))

# 2.3 Filter the DTM down to frequently occurring terms.
# The threshold (5) can be tuned by trial and error to adjust model
# performance.
X_freq_words <- findFreqTerms(X_dtm_train, 5)

# Keep the same term columns in both the training and the test matrix.
X_dtm_train_freq <- X_dtm_train[, X_freq_words]
X_dtm_test_freq <- X_dtm_test[, X_freq_words]

# 2.4 Encode the term counts as categorical presence indicators.
# 2.4.1 Map counts to "Yes" (term present, count > 0) or "No" (absent).
# x: vector of term counts. Returns a character vector of "Yes"/"No".
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}

# 2.4.2 Convert the training and test matrices column by column.
X_train <- apply(X_dtm_train_freq, MARGIN = 2, convert_counts)
X_test <- apply(X_dtm_test_freq, MARGIN = 2, convert_counts)

############# 3 Train the model on the data ############
# install.packages("e1071")
library(e1071)

# Train the classifier; the Laplace estimator is left at its default of 0.
NB_classifier <- naiveBayes(X_train, y_train)

############## 4 Evaluate model performance #############
# 4.1 Predict the class of every sample in the test set.
y_pred <- predict(NB_classifier, X_test)

# Compare predicted and true labels in a cross table.
# CrossTable() comes from gmodels, so the package must be attached before the
# call; otherwise R stops with "could not find function".
# install.packages("gmodels")
library(gmodels)
CrossTable(
  x = y_test, y = y_pred,
  prop.chisq = FALSE, prop.t = FALSE, prop.c = FALSE,
  dnn = c("actural", "predict")
)

模型 NB_classifier 在測試集上進行預測的混淆矩陣爲：

準確率 = 0.864 + 0.110 = 0.974

對模型調參

################## 5 Improve model performance ##################
# 5.1 Retrain the model with a Laplace estimator of 1.
NB_classifier2 <- naiveBayes(x = X_train, y = y_train, laplace = 1)

# 5.2 Predict the class of every sample in the test set.
y_pred2 <- predict(NB_classifier2, X_test)

# 5.3 Compare predicted and true labels in a cross table.
# CrossTable() comes from gmodels; attaching it here is harmless if it is
# already attached and prevents a "could not find function" error otherwise.
library(gmodels)
CrossTable(
  x = y_test, y = y_pred2,
  prop.chisq = FALSE, prop.t = TRUE, prop.c = FALSE,
  dnn = c("actural", "predict")
)