pip install nltk==3.4.5
nltk_data holds a large amount of corpus data, including many datasets. This article uses two of them, positive_tweets and negative_tweets, to train the model.
There are two ways to install the data, offline and online. Offline is recommended: the data is large, and online downloads often fail.
In the Python interactive shell, enter:
import nltk
nltk.download()
Running this pops up a download window. If you don't need everything, select the relevant category and click download for just the items you want; a successful download is marked "installed".
Alternatively, download a specific package, again from the Python interactive shell:
import nltk
nltk.download('punkt')
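If you only want what this article actually uses, the targeted downloads below are enough. These are the standard NLTK resource identifiers for the tweet corpus, tokenizer models, POS tagger, lemmatizer dictionary, and stopword lists:

import nltk

# Just the resources the code in this article relies on
for resource in ['twitter_samples',             # the labeled tweet corpus
                 'punkt',                       # tokenizer models
                 'averaged_perceptron_tagger',  # POS tagger used by pos_tag
                 'wordnet',                     # dictionary behind WordNetLemmatizer
                 'stopwords']:                  # English stopword list
    nltk.download(resource)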
What we mainly use is the content under the packages folder.
This is the full contents of nltk_data.
After downloading, a little configuration is needed:
1. Rename the downloaded packages folder to nltk_data.
2. Put the renamed nltk_data folder somewhere NLTK can find it. To check where that is:
>>> from nltk import data
>>> data.find('.')
FileSystemPathPointer('C:\\Users\\用戶\\AppData\\Roaming\\nltk_data')
# The local load path is printed; just place the folder under Roaming
Or, if you get output like the following instead, placing the nltk_data folder in any of the listed directories also works.
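NLTK keeps its search locations in nltk.data.path, so you can also inspect or extend that list directly (the D:\ path below is only an example):

import nltk

print(nltk.data.path)  # every directory NLTK will search for nltk_data

# If you unpacked nltk_data somewhere non-standard, append it (example path)
nltk.data.path.append(r'D:\nltk_data')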
With that, the environment is ready.
import re
import string
import random
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag, pos_tag_sents
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
This article uses two files from twitter_samples: negative_tweets.json (5,000 tweets with negative sentiment) and positive_tweets.json (5,000 tweets with positive sentiment) to train the model. You can unzip the archive and inspect the contents of the JSON files.
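As a quick sanity check that the corpus is installed, fileids() lists the files it ships with; the third, unlabeled file is not used in this article:

from nltk.corpus import twitter_samples

print(twitter_samples.fileids())
# ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']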
Fetch the tweet text with:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')  # you can print these to inspect the tweets being analyzed
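The tokenization step below uses a small helper, fenci, which simply returns the corpus's pre-tokenized form of each file (the same definition appears in the complete script at the end), plus the two file-name variables:

def fenci(file):
    # twitter_samples stores each tweet pre-tokenized; just return that
    return twitter_samples.tokenized(file)

po_file_path = 'positive_tweets.json'
ne_file_path = 'negative_tweets.json'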
# Tokenize
po_fenci_res = fenci(po_file_path)[:2]
be_fenci_res = fenci(ne_file_path)[:2]  # the dataset is large, so only take the first 2 tweets here
print('Positive tokenization result: {}'.format(po_fenci_res))
print('Negative tokenization result: {}'.format(be_fenci_res))
# Positive tokenization result: [['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'], ['@Lamb2ja', 'Hey', 'James', '!', 'How', 'odd', ':/', 'Please', 'call', 'our', 'Contact', 'Centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'Many', 'thanks', '!']]
# Negative tokenization result: [['hopeless', 'for', 'tmr', ':('], ['Everything', 'in', 'the', 'kids', 'section', 'of', 'IKEA', 'is', 'so', 'cute', '.', 'Shame', "I'm", 'nearly', '19', 'in', '2', 'months', ':(']]
Data normalization involves the following steps: POS-tagging each tweet, stripping URLs and @mentions, lemmatizing each word according to its part of speech, and dropping punctuation and stopwords.
def cleaned_list_func(evert_tweet):
    """
    Data preprocessing
    :param evert_tweet: one tweet / one English sentence to analyze
    :return: the processed words, a flat list
    """
    new_text = []
    cixing_list = pos_tag(evert_tweet)  # [('', 'NN'), ('', 'NNS'), ()]
    print('POS tagging result for this tweet: {}'.format(cixing_list))
    for word, cixing in cixing_list:
        # regex that strips URLs
        word = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+', '', word)
        # strip @mentions (usernames)
        word = re.sub('(@[A-Za-z0-9_]+)', '', word)
        if cixing.startswith('NN'):  # map the Penn Treebank tag to a WordNet POS
            pos = 'n'
        elif cixing.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()  # lemmatize with WordNetLemmatizer.lemmatize
        new_word = lemmatizer.lemmatize(word, pos)
        if len(new_word) > 0 and new_word not in string.punctuation and \
                new_word.lower() not in stopwords.words('english'):
            new_text.append(new_word.lower())
    return new_text
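To see what the POS hint buys us, here is a quick standalone check of the lemmatizer (a minimal illustration, not part of the pipeline itself):

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('members', 'n'))  # -> member
print(lemmatizer.lemmatize('being', 'v'))    # -> be
print(lemmatizer.lemmatize('engaged', 'v'))  # -> engage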
# Data normalization
positive_cleaned_list = []
negative_cleaned_list = []
for i in po_fenci_res:
    positive_cleaned = cleaned_list_func(i)
    positive_cleaned_list.append(positive_cleaned)
print('Processed positive tweets: {}'.format(positive_cleaned_list))
print('Original positive data for comparison: {}'.format(positive_tweets[:2]))
The POS tagging output looks like this:
# POS tagging result for this tweet: [('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]
And the processed tweets compared with the original data:
# Processed positive tweets: [['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)'], ['hey', 'james', 'odd', ':/', 'please', 'call', 'contact', 'centre', '02392441234', 'able', 'assist', ':)', 'many', 'thanks']]
# Original positive data for comparison: ['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)', '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!']
def get_tweets_for_model(clean_tokens_list, tag):
    """
    Prepare the model data
    :param clean_tokens_list: processed tweets, a list of lists
    :param tag: the class label
    :return: a flat list whose elements are 2-tuples
    """
    li = []
    for every_tweet in clean_tokens_list:
        data_dict = dict([token, True] for token in every_tweet)  # {'token': True, ...}
        li.append((data_dict, tag))
    return li
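A toy input (hypothetical, just for illustration) makes the output shape obvious:

toy_tokens = [['good', 'day'], ['nice']]  # hypothetical mini input
print(get_tweets_for_model(toy_tokens, 'Positive'))
# [({'good': True, 'day': True}, 'Positive'), ({'nice': True}, 'Positive')]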
# Prepare the model data
po_for_model = get_tweets_for_model(positive_cleaned_list, 'Positive')
ne_for_model = get_tweets_for_model(negative_cleaned_list, 'Negative')
print('Positive data prepared for the model: {}'.format(po_for_model))
print('Negative data prepared for the model: {}'.format(ne_for_model))
At this point each element is a (feature-dict, label) pair, which is exactly the input format NLTK's NaiveBayesClassifier.train expects:
# Negative data prepared for the model: [({'hopeless': True, 'tmr': True, ':(': True}, 'Negative'), ({'everything': True, 'kid': True, 'section': True, 'ikea': True, 'cute': True, 'shame': True, "i'm": True, 'nearly': True, '19': True, '2': True, 'month': True, ':(': True}, 'Negative')]
model_data = po_for_model + ne_for_model
random.shuffle(model_data)      # shuffle so positives and negatives are mixed
train_data = model_data[:7000]  # first 7000 as the training set
test_data = model_data[7000:]   # the rest as the test set, to measure the model's accuracy
def train_model(train_data, test_data):
    """
    Train and evaluate the model
    :param train_data: the training set
    :param test_data: the test set
    :return: the trained model
    """
    from nltk import classify
    from nltk import NaiveBayesClassifier
    model = NaiveBayesClassifier.train(train_data)
    print('Model accuracy: {}'.format(classify.accuracy(model, test_data)))
    print(model.show_most_informative_features(10))
    return model
# Train and evaluate the model
model = train_model(train_data, test_data)
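If you want to reuse the trained classifier without retraining each time, one simple option is to pickle it (a minimal sketch; the file name is just an example, and persistence isn't covered in the walkthrough above):

import pickle

# Save the trained classifier to disk (example file name)
with open('sentiment_model.pickle', 'wb') as f:
    pickle.dump(model, f)

# Later, load it back instead of retraining
with open('sentiment_model.pickle', 'rb') as f:
    model = pickle.load(f)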
def test(model, test_text):
    """
    Predict on new data with the trained model
    :param model: the trained model
    :param test_text: the sentence to analyze
    :return:
    """
    from nltk.tokenize import word_tokenize
    custom_tokens = cleaned_list_func(word_tokenize(test_text))
    result = dict([token, True] for token in custom_tokens)
    yuce_res = model.classify(result)
    print('Text: {} Prediction: {}'.format(test_text, yuce_res))
test_list = [
"I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know.",
"My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood",
"I will always be there for you.",
'I fuck you fuck your mother fuck your father fuck your family',
"Don't worry when you are not recognized, but strive to be worthy of recognition.",
"The power of imagination makes us infinite.",
"The glow of one warm thought is to me worth more than money."
]
for i in test_list:
    test(model, i)
The prediction results:
Model accuracy: 0.9943333333333333
Most Informative Features
                     sad = True           Negati : Positi =     35.1 : 1.0
                follower = True           Positi : Negati =     20.6 : 1.0
                     bam = True           Positi : Negati =     20.1 : 1.0
                  arrive = True           Positi : Negati =     18.6 : 1.0
                     x15 = True           Negati : Positi =     17.3 : 1.0
                    blog = True           Positi : Negati =     16.7 : 1.0
                followed = True           Negati : Positi =     15.5 : 1.0
                    damn = True           Negati : Positi =     15.4 : 1.0
                     top = True           Positi : Negati =     15.3 : 1.0
              appreciate = True           Positi : Negati =     13.9 : 1.0
None
Text: I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know. Prediction: Negative
Text: My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood Prediction: Negative
Text: I will always be there for you. Prediction: Negative
Text: I fuck you fuck your mother fuck your father fuck your family Prediction: Negative
Text: Don't worry when you are not recognized, but strive to be worthy of recognition. Prediction: Positive
Text: The power of imagination makes us infinite. Prediction: Negative
Text: The glow of one warm thought is to me worth more than money. Prediction: Positive
Since the training set is limited in size, the predictions won't always be accurate. The complete code is below:
import random
import re
import string
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag, pos_tag_sents
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist


def fenci(file):
    return twitter_samples.tokenized(file)


def cleaned_list_func(evert_tweet):
    new_text = []
    cixing_list = pos_tag(evert_tweet)
    for word, cixing in cixing_list:
        word = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+', '', word)
        word = re.sub('(@[A-Za-z0-9_]+)', '', word)
        if cixing.startswith('NN'):
            pos = 'n'
        elif cixing.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        new_word = lemmatizer.lemmatize(word, pos)
        if len(new_word) > 0 and new_word not in string.punctuation and new_word.lower() not in stopwords.words('english'):
            new_text.append(new_word.lower())
    return new_text


def get_all_words(clean_tokens_list):
    for tokens in clean_tokens_list:
        for token in tokens:
            yield token


def get_tweets_for_model(clean_tokens_list, tag):
    li = []
    for every_tweet in clean_tokens_list:
        data_dict = dict([token, True] for token in every_tweet)
        li.append((data_dict, tag))
    return li


def train_model(train_data, test_data):
    from nltk import classify
    from nltk import NaiveBayesClassifier
    model = NaiveBayesClassifier.train(train_data)
    print('Model accuracy: {}'.format(classify.accuracy(model, test_data)))
    print(model.show_most_informative_features(10))
    return model


def test(model, test_text):
    from nltk.tokenize import word_tokenize
    custom_tokens = cleaned_list_func(word_tokenize(test_text))
    result = dict([token, True] for token in custom_tokens)
    yuce_res = model.classify(result)
    print('Text: {} Prediction: {}'.format(test_text, yuce_res))


if __name__ == '__main__':
    po_file_path = 'positive_tweets.json'
    ne_file_path = 'negative_tweets.json'
    positive_tweets = twitter_samples.strings(po_file_path)
    negative_tweets = twitter_samples.strings(ne_file_path)
    po_fenci_res = fenci(po_file_path)
    be_fenci_res = fenci(ne_file_path)
    positive_cleaned_list = []
    negative_cleaned_list = []
    for i in po_fenci_res:
        positive_cleaned = cleaned_list_func(i)
        positive_cleaned_list.append(positive_cleaned)
    for j in be_fenci_res:
        negative_cleaned = cleaned_list_func(j)
        negative_cleaned_list.append(negative_cleaned)
    po_for_model = get_tweets_for_model(positive_cleaned_list, 'Positive')
    ne_for_model = get_tweets_for_model(negative_cleaned_list, 'Negative')
    model_data = po_for_model + ne_for_model
    random.shuffle(model_data)
    train_data = model_data[:7000]
    test_data = model_data[7000:]
    model = train_model(train_data, test_data)
    test_list = [
        "I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know.",
        "My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood",
        "I will always be there for you.",
        'I fuck you fuck your mother fuck your father fuck your family',
        "Don't worry when you are not recognized, but strive to be worthy of recognition.",
        "The power of imagination makes us infinite.",
        "The glow of one warm thought is to me worth more than money."
    ]
    for i in test_list:
        test(model, i)
The above is only a simple first pass; I'm still learning, so feel free to leave a comment if you have any questions~