1. Input a paragraph and split it into sentences (Punkt sentence tokenizer)
import nltk
import nltk.data

def splitSentence(paragraph):
    # Load the pre-trained Punkt sentence tokenizer for English
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(paragraph)
    return sentences

if __name__ == '__main__':
    print(splitSentence("My name is Tom. I am a boy. I like soccer!"))
The result is ['My name is Tom.', 'I am a boy.', 'I like soccer!'].
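Note that nltk.data.load raises a LookupError if the Punkt model has not been downloaded yet. A minimal sketch of fetching the resource first and then using NLTK's convenience wrapper sent_tokenize, which loads the same Punkt English model under the hood (the resource name may vary with your NLTK version, e.g. newer releases also ship 'punkt_tab'):

import nltk

# Download the Punkt model once; it is cached under ~/nltk_data afterwards
nltk.download('punkt')

# sent_tokenize wraps the same Punkt sentence tokenizer shown above
sentences = nltk.sent_tokenize("My name is Tom. I am a boy. I like soccer!")
print(sentences)  # ['My name is Tom.', 'I am a boy.', 'I like soccer!']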
2. Input a sentence and split it into words
from nltk.tokenize import WordPunctTokenizer

def wordtokenizer(sentence):
    # Split the sentence into word and punctuation tokens
    words = WordPunctTokenizer().tokenize(sentence)
    return words

if __name__ == '__main__':
    print(wordtokenizer("My name is Tom."))
The result is ['My', 'name', 'is', 'Tom', '.'].
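WordPunctTokenizer tokenizes with the regex \w+|[^\w\s]+, so every run of punctuation becomes its own token. For text containing contractions, NLTK's Treebank-based word_tokenize often gives more natural splits; a small comparison sketch (the sample sentence is made up for illustration):

import nltk
from nltk.tokenize import WordPunctTokenizer

sentence = "Don't hesitate to ask questions."

# Regex-based: the apostrophe is split out as a separate token
print(WordPunctTokenizer().tokenize(sentence))
# ['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions', '.']

# Treebank-based: the contraction is kept together as "n't"
print(nltk.word_tokenize(sentence))
# ['Do', "n't", 'hesitate', 'to', 'ask', 'questions', '.']

Which behavior is preferable depends on the downstream task; word_tokenize also requires the Punkt resource from step 1 to be downloaded.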