NLTK Basics

A well-known natural language processing library for Python

  • Ships with corpora and part-of-speech resources
  • Built-in classification, tokenization, and other functionality
  • Strong community support
  • Plus plenty of simple wrappers built on top of it

Installing the corpora

# Option 1: the interactive downloader
import nltk
nltk.download()
 
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml

If the download is slow or fails for some other reason:

Official download page: http://www.nltk.org/nltk_data/

GitHub download: https://github.com/nltk/nltk_data

  • Download the packages directory, rename it to nltk_data, and add its location to NLTK's search path:
from nltk import data
data.path.append('D:/python3.6/nltk_data')

Feature overview
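
Roughly, that functionality lives in the sub-packages below. This import list is a summary of my own (the module names are real; the one-line descriptions are not from the source):

# the main NLTK sub-packages used later in this article
import nltk.corpus        # built-in corpora (e.g. brown, stopwords)
import nltk.tokenize      # sentence and word tokenizers (word_tokenize, ...)
import nltk.stem          # stemmers and lemmatizers (PorterStemmer, WordNetLemmatizer)
import nltk.tag           # part-of-speech tagging (pos_tag)
import nltk.probability   # frequency distributions (FreqDist)
import nltk.classify      # classifiers (NaiveBayesClassifier)
import nltk.text          # TextCollection, used below for tf-idf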

Downloading a corpus

# download the Brown corpus
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\fei\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.

NLTK's built-in corpora

# NLTK's built-in Brown corpus
from nltk.corpus import brown
brown.categories()
['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']
brown.readme()   # description of the corpus

print(brown.words()[:10])    # words
print(len(brown.words()))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']
1161192

print(brown.sents()[:10])    # sentences
print(brown.sents().__len__())
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.'], ['``', 'Only', 'a', 'relative', 'handful', 'of', 'such', 'reports', 'was', 'received', "''", ',', 'the', 'jury', 'said', ',', '``', 'considering', 'the', 'widespread', 'interest', 'in', 'the', 'election', ',', 'the', 'number', 'of', 'voters', 'and', 'the', 'size', 'of', 'this', 'city', "''", '.'], ['The', 'jury', 'said', 'it', 'did', 'find', 'that', 'many', 'of', "Georgia's", 'registration', 'and', 'election', 'laws', '``', 'are', 'outmoded', 'or', 'inadequate', 'and', 'often', 'ambiguous', "''", '.'], ['It', 'recommended', 'that', 'Fulton', 'legislators', 'act', '``', 'to', 'have', 'these', 'laws', 'studied', 'and', 'revised', 'to', 'the', 'end', 'of', 'modernizing', 'and', 'improving', 'them', "''", '.'], ['The', 'grand', 'jury', 'commented', 'on', 'a', 'number', 'of', 'other', 'topics', ',', 'among', 'them', 'the', 'Atlanta', 'and', 'Fulton', 'County', 'purchasing', 'departments', 'which', 'it', 'said', '``', 'are', 'well', 'operated', 'and', 'follow', 'generally', 'accepted', 'practices', 'which', 'inure', 'to', 'the', 'best', 'interest', 'of', 'both', 'governments', "''", '.'], ['Merger', 'proposed'], ['However', ',', 'the', 'jury', 'said', 'it', 'believes', '``', 'these', 'two', 'offices', 'should', 'be', 'combined', 'to', 'achieve', 'greater', 'efficiency', 'and', 'reduce', 'the', 'cost', 'of', 'administration', "''", '.'], ['The', 'City', 'Purchasing', 'Department', ',', 'the', 'jury', 'said', ',', '``', 'is', 'lacking', 'in', 'experienced', 'clerical', 'personnel', 'as', 'a', 'result', 'of', 'city', 'personnel', 'policies', "''", '.']]
57340

print(brown.tagged_words()[:10])    # POS-tagged words
print(brown.tagged_words().__len__())     
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN')]
1161192

2. The text-processing pipeline

  • 1. preprocess
  • 2. tokenize
  • 3. stopwords
  • 4. ...
  • 5. make features
  • 6. machine learning
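
A minimal end-to-end sketch of the steps above (my own illustration, not code from the source; it assumes the punkt, stopwords and wordnet NLTK data packages have already been downloaded, and it uses a plain bag of words as the feature step; the machine-learning step itself is covered in section 7):

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def make_features(text):
    tokens = nltk.word_tokenize(text.lower())            # tokenize
    tokens = [t for t in tokens if t.isalpha()]          # preprocess: keep alphabetic tokens only
    stops = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stops]       # drop stopwords
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]   # normalize word forms (section 2 below)
    return {t: True for t in tokens}                     # make features: simple bag of words

print(make_features('Never underestimate the heart of a champion'))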

 

 

1. Tokenize

  Splitting a sentence into meaningful units

import nltk
sentence = 'Never underestimate the heart of a champion '
tokens = nltk.word_tokenize(sentence)
tokens
['Never', 'underestimate', 'the', 'heart', 'of', 'a', 'champion']

  Chinese word segmentation

import jieba
seg_list = jieba.cut("我來到北京清華大學", cut_all=True)
print("Full mode:", "/ ".join(seg_list))  # full mode
seg_list = jieba.cut("我來到北京清華大學", cut_all=False)
print("Precise mode:", "/ ".join(seg_list))  # precise mode
seg_list = jieba.cut("他來到了網易杭研大廈")  # precise mode is the default
print("New-word detection:", ", ".join(seg_list))
seg_list = jieba.cut_for_search("小明碩士畢業於中國科學院計算所,後在日本京都大學深造")
print("Search-engine mode:", ','.join(seg_list))

Full mode: 我/ 來到/ 北京/ 清華/ 大/ 學
Precise mode: 我/ 來到/ 北京/ 清華/ 大/ 學
New-word detection: 他, 來到, 了, 網, 易, 杭研, 大, 廈
Search-engine mode: 小,明碩,士,畢業,於,中國,科學,學院,科學院,中國科學院,計算,計算所,,,後,在,日,本,京都,大,學,深造

Tokenizing social-media text

Example

# tokenizing social-media text with the default tokenizer
from nltk.tokenize import word_tokenize
 
tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'
print(word_tokenize(tweet))
['RT', '@', 'angelababy', ':', 'love', 'you', 'baby', '!', ':', 'D', 'http', ':', '//ah.love', '#', '168cm']

 Solution: filter with regular expressions

import re
emoticons_str = r"""
    (?:
        [:=;] # eyes
        [oO\-]? # nose
        [D\)\]\(\]/\\OpP] # mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hashtags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words containing - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

Regular-expression reference

http://www.regexlab.com/zh/regref.htm

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 
tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'
print(preprocess(tweet))

['RT', '@angelababy', ':', 'love', 'you', 'baby', '!', ':D', 'http://ah.love', '#168cm']

2. Word-form normalization

Words come in many different forms

  • Inflection: walk => walking => walked (does not change the part of speech)
  • Derivation: nation (noun) => national (adjective) => nationalize (verb) (changes the part of speech)

Stemming: generally speaking, chop off the inflectional endings that do not change the part of speech

  • walking minus -ing = walk
  • walked minus -ed = walk

Lemmatization: reduce all the inflected variants of a word to a single base form

  • went => go
  • are => be

1. Stemming

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('maximum'))
print(porter_stemmer.stem('presumably'))
print(porter_stemmer.stem('multiply'))
print(porter_stemmer.stem('provision'))

maximum
presum
multipli
provis

2. Lemmatization

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize('dogs'))
print(wordnet_lemmatizer.lemmatize('churches'))
print(wordnet_lemmatizer.lemmatize('aardwolves'))
print(wordnet_lemmatizer.lemmatize('abaci'))
print(wordnet_lemmatizer.lemmatize('hardrock'))

dog
church
aardwolf
abacus
hardrock

 Without a POS tag, the lemmatizer defaults to treating the word as a noun

# without a POS tag, the default is noun (NN)
wordnet_lemmatizer.lemmatize('are')
'are'
wordnet_lemmatizer.lemmatize('is')
'is'

 POS tagging

  Option 1: supply the POS tag by hand

# with an explicit POS tag
print(wordnet_lemmatizer.lemmatize('is', pos='v'))
print(wordnet_lemmatizer.lemmatize('are', pos='v'))
be
be

  Option 2: tag automatically with nltk.pos_tag

import nltk
text = nltk.word_tokenize('what does the fox say')
print(text)
print(nltk.pos_tag(text))
['what', 'does', 'the', 'fox', 'say']
[('what', 'WDT'), ('does', 'VBZ'), ('the', 'DT'), ('fox', 'NNS'), ('say', 'VBP')]
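
The two options can be combined. The sketch below is my own illustration (not from the source): it maps the Penn Treebank tags returned by nltk.pos_tag onto the single-letter codes WordNetLemmatizer expects, then lemmatizes each token accordingly (it assumes the punkt, averaged_perceptron_tagger and wordnet data packages are installed).

import nltk
from nltk.stem import WordNetLemmatizer

def penn_to_wordnet(tag):
    # map Penn Treebank tags to the codes WordNetLemmatizer understands
    if tag.startswith('J'):
        return 'a'   # adjective
    if tag.startswith('V'):
        return 'v'   # verb
    if tag.startswith('R'):
        return 'r'   # adverb
    return 'n'       # default: noun

lemmatizer = WordNetLemmatizer()
tagged = nltk.pos_tag(nltk.word_tokenize('the foxes are jumping over the fences'))
print([lemmatizer.lemmatize(word, pos=penn_to_wordnet(tag)) for word, tag in tagged])
# roughly: ['the', 'fox', 'be', 'jump', 'over', 'the', 'fence']

For the meaning of the individual Penn Treebank tags, nltk.help.upenn_tagset() prints the full reference (it needs the tagsets data package).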

 POS tag reference table

 3. Stopwords

A thousand occurrences of HE can refer to a thousand different people, and a thousand occurrences of THE can point to a thousand different things.

For applications that care about understanding the meaning of a text, such words introduce too much ambiguity.

Full stopword list: http://www.ranks.nl/stopwords

Removing stopwords with NLTK

First download the stopword corpus in the console, e.g. with nltk.download('stopwords').

from nltk.corpus import stopwords
# tokenize first to get a word_list
# ...
# then filter out the stopwords
filtered_words = [word for word in word_list if word not in stopwords.words('english')]
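
Putting it together, a runnable version (a sketch of my own; the example sentence is made up, and the punkt and stopwords data packages must already be downloaded):

import nltk
from nltk.corpus import stopwords

word_list = nltk.word_tokenize('we are learning the basics of natural language processing')
filtered_words = [word for word in word_list if word not in stopwords.words('english')]
print(filtered_words)
# roughly: ['learning', 'basics', 'natural', 'language', 'processing']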

4. Word-frequency statistics with NLTK

 

import nltk
from nltk import FreqDist
# build a tiny corpus first
corpus = 'this is my sentence ' \
           'this is my life ' \
           'this is the day'
# tokenize it; as noted above,
# any preprocessing could be applied here as needed:
# stopwords, lemmatization, stemming, etc.
tokens = nltk.word_tokenize(corpus)
print(tokens)
['this', 'is', 'my', 'sentence', 'this', 'is', 'my', 'life', 'this', 'is', 'the', 'day']

 

# use NLTK's FreqDist to count how often each word occurs
fdist = FreqDist(tokens)
# it behaves much like a dict:
# index it with a word to see how many times that word occurs in the whole text
print(fdist.most_common(50))
for k,v in fdist.items():
    print(k,v)

[('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]
this 3
is 3
my 2
sentence 1
life 1
the 1
day 1

# now take out the 50 most common words
standard_freq_vector = fdist.most_common(50)
size = len(standard_freq_vector)
print(standard_freq_vector)

[('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]

Helper function: record each word's position, ordered by frequency

def position_lookup(v):
    res = {}
    counter = 0
    for word in v:
        res[word[0]] = counter
        counter += 1
    return res
# record the standard position of each word
standard_position_dict = position_lookup(standard_freq_vector)
print(standard_position_dict)
# this gives a position lookup table
{'this': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6}

  Now suppose we have a new sentence

sentence = 'this is cool'
# create a new vector of the same size as our standard vector
freq_vector = [0] * size
# simple preprocessing
tokens = nltk.word_tokenize(sentence)
# for each word in the new sentence
for word in tokens:
    try:
        # if it appeared in our vocabulary,
        # add 1 at its "standard position"
        freq_vector[standard_position_dict[word]] += 1
    except KeyError:
        # if it is an unseen word,
        # just skip it
        continue
print(freq_vector)
# the first position corresponds to 'this', which appears once
# the second position corresponds to 'is', which also appears once
# everything else is zero
[1, 1, 0, 0, 0, 0, 0]

 5. TF-IDF with NLTK

 

 

import nltk
from nltk.text import TextCollection
sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']
sents = [nltk.word_tokenize(sent) for sent in sents]
corpus = TextCollection(sents)

# tf-idf can be computed directly
# (term: a term in a sentence; text: that sentence, as a token list)
print(corpus.idf('three'))
print(corpus.tf('four',nltk.word_tokenize('this is a sentence four')))
print(corpus.tf_idf('four',nltk.word_tokenize('this is a sentence four')))

1.0986122886681098
0.2
0.0
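
These numbers can be checked by hand. The snippet below is my own sanity check, based on the definitions TextCollection uses (tf = term count / text length, idf = natural log of the number of texts divided by the number of texts containing the term):

import math

# idf('three'): 'three' occurs in 1 of the 3 sentences
print(math.log(3 / 1))   # 1.0986122886681098

# tf('four', ...): 'four' occurs once among the 5 tokens of the new sentence
print(1 / 5)             # 0.2

# tf_idf('four', ...) comes out as 0.0 because 'four' never occurs in the corpus,
# so its idf falls back to 0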

 6. Dimensionality reduction with SVD

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
la = np.linalg
words = ['I','like','enjoy','deep','learning','NLP','flying']
# a toy word-word co-occurrence matrix over these words (plus one extra token)
X = np.array([[0,2,1,0,0,0,0,0],
              [2,0,0,1,0,1,0,0],
              [1,0,0,0,0,0,1,0],
              [0,1,0,0,1,0,0,0],
              [0,0,0,1,0,0,0,1],
              [0,1,0,0,0,0,0,1],
              [0,0,1,0,0,0,0,1],
              [0,0,0,0,1,1,1,0],
             ])
U,s,Vh = la.svd(X,full_matrices=False)
# print(U,s,Vh)
for i in range(len(words)):
    plt.text(U[i,0],U[i,1],words[i])

plt.xlim(-1,1)
plt.ylim(-1,1)
plt.show()

  

7. A classic NLTK application: sentiment analysis

 Simple sentiment analysis

import nltk
words = nltk.word_tokenize('I am very happy,i love you')
sentiment_dictionary = {}
for line in open('data/AFINN/AFINN-111.txt'):
    word, score = line.split('\t')
    sentiment_dictionary[word] = int(score)
# with the scoring table stored in a dict,
# run through the whole sentence and add up the matching scores
total_score = sum(sentiment_dictionary.get(word, 0) for word in words)
# words in the dict contribute their score, everything else contributes 0
# and you end up with a sentiment score
print(total_score)
6

  Sentiment analysis with machine learning

from nltk.classify import NaiveBayesClassifier
# make up a tiny training set
s1 = 'this is a good book'
s2 = 'this is a awesome book'
s3 = 'this is a bad book'
s4 = 'this is a terrible book'
def preprocess(s):
    # Func: preprocess a sentence
    # here we simply use split() to separate the words in the sentence
    # obviously there are many more processing methods that could be used
    return {word: True for word in s.lower().split()}
    # the return value looks like this:
    # {'this': True, 'is':True, 'a':True, 'good':True, 'book':True}
    # the key (fname) is each word that occurs in the text;
    # the value (fval) is that word's feature value.
    # here we use the simplest value, True, to mean "this word occurs in the current sentence"
    # later the function could be upgraded to produce richer fvals, e.g. word2vec vectors

# put the training set into the standard format
training_data = [[preprocess(s1), 'pos'],
                 [preprocess(s2), 'pos'],
                 [preprocess(s3), 'neg'],
                 [preprocess(s4), 'neg']]
# feed it to the model
model = NaiveBayesClassifier.train(training_data)
# print the result
print(model.classify(preprocess('this is a good book')))
pos

8. NLTK application: text similarity
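
One common way to measure text similarity builds directly on the frequency vectors from section 4. The sketch below is my own illustration of that idea (not from the source): represent each sentence as a word-count vector over a shared vocabulary and compare the vectors with cosine similarity.

import math
import nltk
from nltk import FreqDist

def cosine_similarity(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
    return dot / norm if norm else 0.0

def to_vector(sentence, vocabulary):
    freq = FreqDist(nltk.word_tokenize(sentence))
    return [freq[word] for word in vocabulary]

s1, s2 = 'this is my sentence', 'this is my life'
vocabulary = sorted(set(nltk.word_tokenize(s1 + ' ' + s2)))
print(cosine_similarity(to_vector(s1, vocabulary), to_vector(s2, vocabulary)))
# 3 shared words out of 4 in each sentence -> 0.75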
