可是對於fit和transform,你們可能仍是有點迷糊。最近又將《Applied Text Analysis with Python》讀了一遍(別驚訝,82頁過一遍很快的。以前一直覺得這本書82頁,今天才發現這本書完整版是400多頁。)我主要結合這本書的代碼和自己的理解,實現了fit和transform算法,方便你們更好地理解文本分析中的特徵抽取。
1、scikit庫代碼實例
1.1 咱們先看看fit代碼實例
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "Hey hey hey lets go get lunch today :)",
    "Did you go home?",
    "Hey!!! I need a favor",
]

vectorize = CountVectorizer()

# fit() learns every word in the corpus and builds the vocabulary.
vectorize.fit(corpus)

# Inspect the vocabulary, i.e. the feature set (11 feature words).
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() returns a NumPy array, hence the tolist() call.
print(vectorize.get_feature_names_out().tolist())
# ['did', 'favor', 'get', 'go', 'hey', 'home', 'lets', 'lunch', 'need', 'today', 'you']
1.2 transform實例
import pandas as pd

# transform() encodes every document as a row of per-feature counts.
dtm = vectorize.transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
column_names = vectorize.get_feature_names_out()

# toarray() turns the document-term matrix into a dense 2-D array so that
# pandas can label its columns with the feature words.
counts = dtm.toarray()
print(pd.DataFrame(counts, columns=column_names))
從上面的dataframe表中,行代表一個文檔,列代表特徵詞。好比第1行,hey列所對應的單元格值爲3,說明corpus中第一個document(「Hey hey hey lets go get lunch today :)」)出現了三次hey。
2、fit 與 transform算法實現
剔除中止詞,如「a」、「 the」等
2.1 分詞
from nltk.tokenize import word_tokenize

# Split a sentence into word and punctuation tokens.
sentence = "Today is a beatiful day!"
word_tokenize(sentence)
# ['Today', 'is', 'a', 'beatiful', 'day', '!']
2.2 標點符號判斷
《Applied text analysis with python》一書中判別分詞結果是否爲符號代碼爲
def is_punct(token):
    """Return True when every character of *token* is Unicode punctuation.

    A character counts as punctuation when its Unicode general category
    starts with 'P'. An empty token has no offending character, so the
    result is True.
    """
    for char in token:
        if not unicodedata.category(char).startswith('P'):
            return False
    return True
import unicodedata

# Use "!" as a quick test: its general category starts with 'P'.
unicodedata.category('!')
# 'Po'
# all() is True only when every element of the iterable is truthy.
print(all([True, False]))  # False
print(all([True, True]))   # True
2.3 中止詞
# Filled in by the first is_stopword() call; None means "not loaded yet".
_STOPWORDS = None


def is_stopword(token):
    """Return True when *token* is an English stopword (case-insensitive).

    The NLTK stopword list is fetched on first use and kept as a frozenset,
    so it is not rebuilt on every call and membership tests are O(1).
    """
    global _STOPWORDS
    if _STOPWORDS is None:
        _STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return token.lower() in _STOPWORDS
2.4 詞幹化
2.4.1 stem
import nltk


def stem(token):
    """Return the stem of *token* using NLTK's English Snowball stemmer."""
    # Named `stemmer` so the local does not shadow this function's own name.
    stemmer = nltk.stem.SnowballStemmer('english')
    return stemmer.stem(token)
2.4.2 lemmatize
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer


def lemmatize(token, pos_tag):
    """Return the WordNet lemma of *token*, or None for unsupported tags.

    Only the first letter of *pos_tag* is inspected: 'N', 'V', 'R' and 'J'
    are mapped to the WordNet noun, verb, adverb and adjective constants.
    Any other tag yields None. The token is lower-cased before lookup.
    """
    wordnet_pos = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ,
    }.get(pos_tag[0])
    if not wordnet_pos:
        return None
    return WordNetLemmatizer().lemmatize(token.lower(), wordnet_pos)


# Stemming leaves 'better' unchanged, lemmatizing as an adjective gives 'good'.
print(stem('better'))             # better
print(lemmatize('better', 'JJ'))  # good
2.5 清洗數據
def clean(document):
    """Tokenize *document*, drop punctuation and stopwords, lemmatize the rest.

    Tokens whose POS tag lemmatize() does not support come back as None and
    are kept in the result, as the example output below shows.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(word_tokenize(document)):
        if is_punct(token) or is_stopword(token):
            continue
        lemmas.append(lemmatize(token, tag))
    return lemmas


print(clean('He was a soldier 20 years ago!'))
# ['soldier', None, 'year', 'ago']
def clean(document):
    """Return the lemmas of *document*, skipping noise tokens.

    A token is skipped when it is punctuation, a stopword, or when
    lemmatize() returns a falsy value for it (e.g. None for an unsupported
    POS tag). Each kept token is lemmatized exactly once.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(word_tokenize(document)):
        if is_punct(token) or is_stopword(token):
            continue
        # Lemmatize once and reuse the result for both the filter and the
        # output value.
        lemma = lemmatize(token, tag)
        if lemma:
            lemmas.append(lemma)
    return lemmas


print(clean('He was a soldier 20 years ago!'))
# ['soldier', 'year', 'ago']
2.6 構建詞典-fit
def fit(X, y=None):
    """Build the vocabulary of the corpus *X*.

    Args:
        X: iterable of document strings.
        y: unused by this function.

    Returns:
        A list of the distinct cleaned tokens, in first-seen order.
    """
    vocab = []
    # Set of tokens already added, so membership tests stay O(1) while the
    # list keeps the first-seen order.
    seen = set()
    for doc in X:
        for token in clean(doc):
            if token not in seen:
                seen.add(token)
                vocab.append(token)
    return vocab


X = [
    "The elephant sneezed at the sight of potatoes.Its very interesting thing.\nBut at the sight of potatoes",
    "Bats can see via echolocation. See the bat sight sneeze!\nBut it is a bats",
    "Wondering, she opened the door to the studio.\nHaha!good",
]

print(fit(X))
# ['elephant', 'sneeze', 'sight', 'potatoes.its', 'interesting', 'thing', 'potato', 'bat', 'see', 'echolocation', 'wondering', 'open', 'door', 'studio', 'haha', 'good']
2.7 對待分析文本數據編碼-transform
def transform(documents):
    """Yield one term-count vector per document.

    The vocabulary is built from *documents* with fit(). Each yielded list
    holds, for every vocabulary term in order, how often that term occurs
    among the cleaned tokens of the document.

    Args:
        documents: iterable of document strings.

    Yields:
        list[int]: counts aligned with the vocabulary returned by fit().
    """
    from collections import Counter

    # Materialise once: the corpus is walked twice (fit, then encoding), which
    # would silently yield nothing for a one-shot iterator.
    documents = list(documents)
    vocab = fit(documents)
    for doc in documents:
        # Count all tokens in one pass instead of list.count() per term.
        counts = Counter(clean(doc))
        yield [counts[term] for term in vocab]


documents = [
    "The elephant sneezed at the sight of potatoes.Its very interesting thing.\nBut at the sight of potatoes",
    "Bats can see via echolocation. See the bat sight sneeze!\nBut it is a bats",
    "Wondering, she opened the door to the studio.\nHaha!good",
]

print(list(transform(documents)))
# [[1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 3, 2, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]
import nltk
import unicodedata
from collections import Counter, defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize


class TextExtractFeature(object):
    """Minimal bag-of-words feature extractor with fit/transform methods.

    fit() builds a vocabulary from cleaned (tokenized, de-punctuated,
    stopword-free, lemmatized) documents; transform() encodes documents as
    term-count vectors over that vocabulary.
    """

    def __init__(self, language='english'):
        """Load the stopword set for *language* and create the lemmatizer."""
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
        # Initialised here so get_feature_names()/transform() do not raise
        # AttributeError when called before fit().
        self.y = None
        self.vocab = []
        self.feature_names = defaultdict(int)

    def is_punct(self, token):
        """Return True when every character of *token* is punctuation."""
        return all(unicodedata.category(char).startswith('P') for char in token)

    def is_stopword(self, token):
        """Return True when *token* (case-insensitive) is a stopword."""
        return token.lower() in self.stopwords

    def lemmatize(self, token, pos_tag):
        """Return the WordNet lemma of *token*, or None for unsupported tags.

        Only the first letter of *pos_tag* is used ('N', 'V', 'R' or 'J').
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ,
        }.get(pos_tag[0])
        if not tag:
            return None
        return self.lemmatizer.lemmatize(token.lower(), tag)

    def clean(self, document):
        """Return the lower-cased lemmas of *document* without noise tokens.

        Punctuation, stopwords and tokens for which lemmatize() returns a
        falsy value are dropped.
        """
        lemmas = []
        for token, tag in nltk.pos_tag(word_tokenize(document)):
            if self.is_punct(token) or self.is_stopword(token):
                continue
            # Lemmatize once and reuse the result for filter and output.
            lemma = self.lemmatize(token, tag)
            if lemma:
                lemmas.append(lemma.lower())
        return lemmas

    def fit(self, X, y=None):
        """Build the vocabulary from the documents in *X*.

        Args:
            X: iterable of document strings.
            y: optional per-document labels; stored and appended to each
                row produced by transform().

        Returns:
            self, so calls can be chained.
        """
        self.y = y
        self.vocab = []
        self.feature_names = defaultdict(int)
        for doc in X:
            for token in self.clean(doc):
                # Membership test on the dict is O(1) and does not insert.
                if token not in self.feature_names:
                    self.feature_names[token] = len(self.vocab)
                    self.vocab.append(token)
        return self

    def get_feature_names(self):
        """Return the token -> column-index mapping built by fit().

        NOTE: the mapping is a defaultdict(int), so indexing it with an
        unknown token returns 0 and inserts that token; use `in` or .get()
        for lookups.
        """
        return self.feature_names

    def transform(self, documents):
        """Yield one count vector per document, in vocabulary order.

        When labels were passed to fit(), the label at the document's
        position is appended as the last element of its row.
        """
        # Explicit None/length check: a bare truth test is ambiguous for
        # array-like labels, while an empty label list still means "no
        # labels".
        has_labels = self.y is not None and len(self.y) > 0
        for idx, doc in enumerate(documents):
            # Count all tokens in one pass instead of list.count() per term.
            counts = Counter(self.clean(doc))
            row = [counts[term] for term in self.vocab]
            if has_labels:
                row.append(self.y[idx])
            yield row
documents = [
    "The elephant sneezed at the sight of potatoes.Its very interesting thing.\nBut at the sight of potatoes",
    "Bats can see via echolocation. See the bat sight sneeze!\nBut it is a bats",
    "Wondering, she opened the door to the studio.\nHaha!good",
]
y = [1, 1, 1]

tef = TextExtractFeature(language='english')

# Build the vocabulary.
tef.fit(documents, y)

# Print the vocabulary mapping (token -> column index), i.e. the feature words.
print(tef.get_feature_names())

# Each row holds the term counts followed by the document's label from y.
for s in tef.transform(documents):
    print(s)
defaultdict(<class 'int'>, {'elephant': 0, 'sneeze': 1, 'sight': 2, 'potatoes.its': 3, 'interesting': 4, 'thing': 5, 'potato': 6, 'bats': 7, 'see': 8, 'echolocation': 9, 'bat': 10, 'wondering': 11,'open': 12, 'door': 13, 'studio': 14, 'haha': 15, 'good': 16}) [1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] [0, 1, 1, 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]