自然語言處理被認爲是計算機科學中的一個難題。人類語言的本質使NLP變得困難。使用自然語言傳遞信息所遵循的規則對於計算機而言並不容易理解。其中一些規則可能是高級的和抽象的,例如,當某人使用諷刺性言論傳遞信息時。另一方面,其中一些規則可能是低級的,例如,使用字符「s」表示多個項目。全面理解人類語言需要同時理解單詞和概念之間的聯繫,才能傳遞預期的信息。雖然人類能夠輕鬆掌握一種語言,但是自然語言的歧義和不精確的特徵使NLP難以實現。
本篇文章主要通過講解NLTK(自然語言處理庫)來給大家講解自然語言處理(NLP)。
import nltk.tokenize as tk

# API template: `text` is a placeholder for the sample string to tokenize
# and must be defined by the caller before this snippet runs.

# Split the sample into sentences; sent_list is a list of sentences.
sent_list = tk.sent_tokenize(text)

# Split the sample into words; word_list is a list of word tokens.
word_list = tk.word_tokenize(text)

# Alternative word split through a tokenizer object.
# This overwrites the word_list produced by word_tokenize above.
punctTokenizer = tk.WordPunctTokenizer()
word_list = punctTokenizer.tokenize(text)
import nltk.tokenize as tk

doc = (
    "Are you curious about tokenization? "
    "Let's see how it works! "
    "We need to analyze a couple of sentences "
    "with punctuations to see it in action."
)
print(doc)

# Sentence tokenization: one numbered sentence per output line.
tokens = tk.sent_tokenize(doc)
for number, token in enumerate(tokens, start=1):
    print("%2d" % number, token)
#  1 Are you curious about tokenization?
#  2 Let's see how it works!
#  3 We need to analyze a couple of sentences with punctuations to see it in action.

# Word tokenization with word_tokenize.
tokens = tk.word_tokenize(doc)
for number, token in enumerate(tokens, start=1):
    print("%2d" % number, token)
#  1 Are
#  2 you
#  3 curious
#  4 about
# ...
# 28 action
# 29 .

# Word tokenization with WordPunctTokenizer; it yields one more token
# than word_tokenize on this document (30 instead of 29).
tokenizer = tk.WordPunctTokenizer()
tokens = tokenizer.tokenize(doc)
for number, token in enumerate(tokens, start=1):
    print("%2d" % number, token)
#  1 Are
#  2 you
#  3 curious
# ...
# 27 it
# 28 in
# 29 action
# 30 .
文本樣本中單詞的詞性與時態對於語義分析並沒有太大影響,因此需要對單詞進行詞幹提取。
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

# API template showing the three available stemmers. Each assignment
# replaces the previous one, so the Snowball stemmer is the one that is
# actually used by the stem() call below.
stemmer = pt.PorterStemmer()              # Porter stemmer: lenient
stemmer = lc.LancasterStemmer()           # Lancaster stemmer: strict
stemmer = sb.SnowballStemmer('english')   # Snowball stemmer: in between

# Extract the stem of the word "playing".
r = stemmer.stem('playing')
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = ['table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the',
         'beaches', 'grounded', 'dreamt', 'envision']

pt_stemmer = pt.PorterStemmer()              # Porter stemmer: lenient
lc_stemmer = lc.LancasterStemmer()           # Lancaster stemmer: strict
sb_stemmer = sb.SnowballStemmer('english')   # Snowball stemmer: in between

# Print one row per word: the word itself followed by its Porter,
# Lancaster and Snowball stems, each right-aligned in 8 columns.
row_format = '%8s %8s %8s %8s'
for word in words:
    row = (
        word,
        pt_stemmer.stem(word),
        lc_stemmer.stem(word),
        sb_stemmer.stem(word),
    )
    print(row_format % row)
#    table     tabl     tabl     tabl
# probably  probabl     prob  probabl
#   wolves     wolv     wolv     wolv
#  playing     play     play     play
#       is       is       is       is
#      dog      dog      dog      dog
#      the      the      the      the
#  beaches    beach    beach    beach
# grounded   ground   ground   ground
#   dreamt   dreamt   dreamt   dreamt
# envision    envis    envid    envis
import nltk.stem as ns

# API template: `word` is a placeholder for the token to lemmatize and
# must be defined by the caller before this snippet runs.

# Create the lemmatizer object.
lemmatizer = ns.WordNetLemmatizer()

# Lemmatize `word` treating it as a noun.
n_lemma = lemmatizer.lemmatize(word, pos='n')

# Lemmatize `word` treating it as a verb.
v_lemma = lemmatizer.lemmatize(word, pos='v')
import nltk.stem as ns

words = ['table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the',
         'beaches', 'grounded', 'dreamt', 'envision']

lemmatizer = ns.WordNetLemmatizer()

# Print one row per word: the word, its lemma when treated as a noun,
# and its lemma when treated as a verb.
row_format = '%8s %8s %8s'
for word in words:
    n_lemma = lemmatizer.lemmatize(word, pos='n')   # noun lemma
    v_lemma = lemmatizer.lemmatize(word, pos='v')   # verb lemma
    print(row_format % (word, n_lemma, v_lemma))
#    table    table    table
# probably probably probably
#   wolves     wolf   wolves
#  playing  playing     play
#       is       is       be
#      dog      dog      dog
#      the      the      the
#  beaches    beach    beach
# grounded grounded   ground
#   dreamt   dreamt    dream
# envision envision envision
The brown dog is running. The black dog is in the black room. Running in the room is forbidden.
1 The brown dog is running
2 The black dog is in the black room
3 Running in the room is forbidden
the | brown | dog | is | running | black | in | room | forbidden |
1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
2 | 0 | 1 | 1 | 0 | 2 | 1 | 1 | 0 |
1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 |
import sklearn.feature_extraction.text as ft

# API template: `sentences` is a placeholder for an iterable of sentence
# strings and must be defined by the caller before this snippet runs.

# Build the bag-of-words model.
cv = ft.CountVectorizer()

# Fit the model and count word occurrences per sentence.
bow = cv.fit_transform(sentences)
print(bow.toarray())   # occurrence count of every word in every sentence

# All feature names (the vocabulary). get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement and
# returns an ndarray, so convert it back to a plain list.
words = cv.get_feature_names_out().tolist()
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft

doc = ('The brown dog is running. '
       'The black dog is in the black room. '
       'Running in the room is forbidden.')

# Split doc into sentences.
sents = tk.sent_tokenize(doc)

# Build and fit the bag-of-words model.
cv = ft.CountVectorizer()
bow = cv.fit_transform(sents)

# All feature names. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() replaces it and returns an ndarray, so convert
# to a list to keep the printed form shown below.
print(cv.get_feature_names_out().tolist())
# ['black', 'brown', 'dog', 'forbidden', 'in', 'is', 'room', 'running', 'the']

# One row per sentence, one column per feature name above.
print(bow.toarray())
# [[0 1 1 0 0 1 0 1 1]
#  [2 0 1 0 1 1 1 0 2]
#  [0 0 0 1 1 1 1 1 1]]
詞頻:一個單詞在一個句子中出現的頻率。與單詞的出現次數相比,詞頻可以更加客觀地評估單詞對一句話的語義的貢獻度。詞頻越高,對語義的貢獻度越大。對詞袋矩陣歸一化即可得到詞頻。
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft
import sklearn.preprocessing as sp

doc = ('The brown dog is running. The black dog is in the black room. '
       'Running in the room is forbidden.')

# Split doc into sentences.
sentences = tk.sent_tokenize(doc)

cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences)
print(bow.toarray())   # occurrence count of every word per sentence

# Feature names. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() replaces it and returns an ndarray.
words = cv.get_feature_names_out().tolist()
print(words)

# L1-normalising each row of the count matrix turns counts into term
# frequencies (each row sums to 1).
tf = sp.normalize(bow, norm='l1')

# `bow` is sparse, so `tf` is sparse as well; convert to a dense array so
# the printed output is the matrix shown below rather than a coordinate
# listing of the non-zero entries.
print(tf.toarray())
# [[0.   0.2  0.2   0.   0.   0.2   0.   0.2  0.2 ]
#  [0.25 0.   0.125 0.   0.125 0.125 0.125 0.   0.25 ]
#  [0.   0.   0.    0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]]
# API template: `ft` is sklearn.feature_extraction.text and `sentences`
# is an iterable of sentence strings; both must already be in scope.

# Bag-of-words model: dense matrix of word counts per sentence.
cv = ft.CountVectorizer()
counts = cv.fit_transform(sentences)
bow = counts.toarray()

# TF-IDF transformer fitted on the count matrix; dense TF-IDF matrix.
tt = ft.TfidfTransformer()
weighted = tt.fit_transform(bow)
tfidf = weighted.toarray()
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft
import numpy as np

doc = ('The brown dog is running. '
       'The black dog is in the black room. '
       'Running in the room is forbidden.')

# Split doc into sentences.
sents = tk.sent_tokenize(doc)

# Bag-of-words counts, one row per sentence.
cv = ft.CountVectorizer()
bow = cv.fit_transform(sents)

# Fit the TF-IDF transformer on the counts and transform them.
tt = ft.TfidfTransformer()
tfidf = tt.fit_transform(bow)

# Show the TF-IDF matrix rounded to two decimal places.
rounded = np.round(tfidf.toarray(), 2)
print(rounded)
# [[0.   0.59 0.45 0.   0.   0.35 0.   0.45 0.35]
#  [0.73 0.   0.28 0.   0.28 0.22 0.28 0.   0.43]
#  [0.   0.   0.   0.54 0.41 0.32 0.41 0.41 0.32]]
import numpy as np
import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb

# Load the documents; each sub-folder name becomes a class label.
train = sd.load_files(
    '../machine_learning_date/20news',
    encoding='latin1',
    shuffle=True,
    random_state=7,
)

# train.data: 2968 samples, each one a mail document.
print(np.array(train.data).shape)     # (2968,)
# train.target: 2968 entries, the class index of each document.
print(np.array(train.target).shape)   # (2968,)
print(train.target_names)
# ['misc.forsale', 'rec.motorcycles', 'rec.sport.baseball', 'sci.crypt', 'sci.space']

cv = ft.CountVectorizer()    # bag-of-words model
tt = ft.TfidfTransformer()   # TF-IDF transformer

bow = cv.fit_transform(train.data)   # fit the bag-of-words model
tfidf = tt.fit_transform(bow)        # fit the TF-IDF transformer
print(tfidf.shape)                   # (2968, 40605)

# Train a multinomial naive Bayes classifier on the TF-IDF features.
model = nb.MultinomialNB()
model.fit(tfidf, train.target)

# Hand-written test sentences.
test_data = [
    'The curveballs of right handed pitchers tend to curve to the left',
    'Caesar cipher is an ancient form of encryption',
    'This two-wheeler is really good on slippery roads']

# Prediction must reuse the transformers fitted during training, so only
# transform() is called here (never fit_transform()).
bow = cv.transform(test_data)
tfidf = tt.transform(bow)
pred_y = model.predict(tfidf)

for sent, index in zip(test_data, pred_y):
    label = train.target_names[index]
    print(sent, '->', label)
# The curveballs of right handed pitchers tend to curve to the left -> rec.sport.baseball
# Caesar cipher is an ancient form of encryption -> sci.crypt
# This two-wheeler is really good on slippery roads -> rec.motorcycles
import nltk.corpus as nc
import nltk.classify as cf

# API template: `train_data` and `test_data` are placeholders that must
# be defined by the caller before this snippet runs.

# Read the words of male.txt from the `names` corpus.
male_names = nc.names.words('male.txt')

# train_data is not a sample matrix. NLTK expects a list of
# (feature dict, label) pairs, for example:
# [ ({'age': 15, 'score1': 95, 'score2': 95}, 'good'),
#   ({'age': 15, 'score1': 45, 'score2': 55}, 'bad') ]

# Train a naive Bayes classifier on the training data.
model = cf.NaiveBayesClassifier.train(train_data)

# Accuracy on the test data (same format as the training data).
ac = cf.accuracy(model, test_data)

# Classify one concrete sample, given as a single feature dict.
feature = {'age': 15, 'score1': 95, 'score2': 95}
gender = model.classify(feature)
import random
import nltk.corpus as nc
import nltk.classify as cf

male_names = nc.names.words('male.txt')
female_names = nc.names.words('female.txt')

# One (feature dict, label) pair per name. The only feature is the
# lower-cased last two letters of the name. Male samples come first,
# then female samples, before shuffling.
data = [({'feature': male_name[-2:].lower()}, 'male')
        for male_name in male_names]
data += [({'feature': female_name[-2:].lower()}, 'female')
         for female_name in female_names]

# Fixed seed so the shuffle, and therefore the split, is reproducible.
random.seed(7)
random.shuffle(data)

half = len(data) // 2
train_data = data[:half]   # first half of the data: training set
test_data = data[half:]    # second half of the data: test set

model = cf.NaiveBayesClassifier.train(train_data)   # naive Bayes classifier
ac = cf.accuracy(model, test_data)

names = ['Leonardo', 'Amy', 'Sam', 'Tom', 'Katherine', 'Taylor', 'Susanne']
genders = [model.classify({'feature': name[-2:].lower()}) for name in names]

for name, gender in zip(names, genders):
    print(name, '->', gender)
# Leonardo -> male
# Amy -> female
# Sam -> male
# Tom -> male
# Katherine -> female
# Taylor -> male
# Susanne -> female
import nltk.classify as cf
import nltk.classify.util as cu

# API template: `train_data` and `test_data` are placeholders that must
# be defined by the caller before this snippet runs.
#
# Neither is a sample matrix. NLTK expects a list of
# (feature dict, label) pairs, for example:
# [ ({'How': 1, 'are': 1, 'you': 1}, 'ask'),
#   ({'fine': 1, 'Thanks': 2}, 'answer') ]

# Train a naive Bayes classifier on the training data.
model = cf.NaiveBayesClassifier.train(train_data)

# Accuracy on the labelled test data.
ac = cu.accuracy(model, test_data)
print(ac)

# classify() accepts exactly one feature dict. test_data is a list of
# (feature dict, label) pairs, so drop the labels and classify all
# feature dicts in one call; pred is a list with one label per sample.
pred = model.classify_many([features for features, _ in test_data])
import nltk.corpus as nc
import nltk.classify as cf
import nltk.classify.util as cu
import nltk.tokenize as tk


def _load_samples(category, label):
    """Return one ({word: True}, label) pair per review in `category`.

    `category` is a movie_reviews category name ('pos' or 'neg'); every
    word of a review becomes a boolean presence feature.
    """
    samples = []
    for fileid in nc.movie_reviews.fileids(category):
        features = {word: True for word in nc.movie_reviews.words(fileid)}
        samples.append((features, label))
    return samples


# All positive samples: [({word: True, ...}, 'POSITIVE'), ...]
pdata = _load_samples('pos', 'POSITIVE')
# All negative samples: [({word: True, ...}, 'NEGATIVE'), ...]
ndata = _load_samples('neg', 'NEGATIVE')

# Split per class so both sets stay balanced: 80% train, 20% test.
pnumb, nnumb = int(0.8 * len(pdata)), int(0.8 * len(ndata))
train_data = pdata[:pnumb] + ndata[:nnumb]
test_data = pdata[pnumb:] + ndata[nnumb:]

# Train a naive Bayes classifier and report accuracy on the test split.
model = cf.NaiveBayesClassifier.train(train_data)
ac = cu.accuracy(model, test_data)
print(ac)

# Simulated business scenario: classify free-text reviews.
reviews = [
    'It is an amazing movie.',
    'This is a dull movie. I would never recommend it to anyone.',
    'The cinematography is pretty great in this movie.',
    'The direction was terrible and the story was all over the place.']

# Features at prediction time must be built like the training features.
# The corpus words are lower-case with punctuation split off, whereas
# review.split() yields tokens such as 'It' or 'movie.' that never match
# a trained feature. Lower-case and tokenize with WordPunctTokenizer.
tokenizer = tk.WordPunctTokenizer()
for review in reviews:
    sample = {word: True for word in tokenizer.tokenize(review.lower())}
    pcls = model.classify(sample)
    print(review, '->', pcls)
經過分詞、單詞清洗、詞幹提取後,基於TF-IDF算法可以抽取一段文本中的核心主題詞彙,從而判斷出當前文本的主題。屬於無監督學習。gensim模塊提供了主題抽取的常用工具。
import gensim.models.ldamodel as gm
import gensim.corpora as gc

# API template with placeholder data: replace `lines_tokens` with the
# real tokenized documents and `n_topics` with the wanted topic count.

# Tokens of one document, and the list of all tokenized documents.
# Dictionary and LdaModel both work on a collection of documents, not on
# a single flat token list.
line_tokens = ['hello', 'world']
lines_tokens = [line_tokens]

# Store every word that occurs in lines_tokens in the gensim dictionary,
# which assigns an integer id to each word.
dic = gc.Dictionary(lines_tokens)

# Build the corpus: one bag-of-words (list of (word id, count)) per document.
bow = [dic.doc2bow(tokens) for tokens in lines_tokens]

# Number of topics to extract.
n_topics = 2

# Build the LDA model.
# bow:        the corpus, a list of bag-of-words documents
# num_topics: number of topics
# id2word:    the dictionary mapping ids back to words
# passes:     number of training passes over the corpus
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)

# For every topic, the 4 words that contribute most to it.
topics = model.print_topics(num_topics=n_topics, num_words=4)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

# Read the input file, one document per line.
doc = []
with open('../machine_learning_date/topic.txt', 'r') as f:
    for line in f:
        # Remove only the line terminator. Slicing with line[:-1] would
        # also cut the last real character of a final line that has no
        # trailing newline.
        doc.append(line.rstrip('\n'))

tokenizer = tk.WordPunctTokenizer()
# A set gives constant-time membership tests inside the token loop.
stopwords = set(nc.stopwords.words('english'))
signs = [',', '.', '!']
stemmer = sb.SnowballStemmer('english')

# Per line: lower-case, tokenize, drop stop words and the punctuation
# marks listed in `signs`, then stem what is left.
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = [stemmer.stem(token) for token in tokens
                   if token not in stopwords and token not in signs]
    lines_tokens.append(line_tokens)

# Store every word that occurs in lines_tokens in the gensim dictionary,
# which assigns an integer id to each word.
dic = gc.Dictionary(lines_tokens)

# Build the corpus: one bag-of-words per line.
bow = [dic.doc2bow(line_tokens) for line_tokens in lines_tokens]

n_topics = 2

# Build the LDA model from the corpus, the number of topics and the
# dictionary; passes is the number of training passes over the corpus.
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)

# For every topic, the 4 words that contribute most to it.
topics = model.print_topics(num_topics=n_topics, num_words=4)
for label, words in topics:
    print(label, '->', words)
# 0 -> 0.022*"cryptographi" + 0.022*"use" + 0.022*"need" + 0.013*"cryptograph"
# 1 -> 0.046*"spaghetti" + 0.021*"made" + 0.021*"italian" + 0.015*"19th"