Contents
1. Stop Words
2. Prepositions (part of speech)
3. Named Entity Recognition (NER)
4. Bag of Words

1. Stop Words
ref: Removing stop words with NLTK in Python
ref: Remove Stop Words
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

print(stopwords.words('english'))

output:
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
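The snippet above only prints what NLTK considers a stop word; a minimal sketch of actually filtering them out of a tokenized sentence (the sample sentence is made up for illustration, not from the refs):

import nltk
# nltk.download('punkt')  # tokenizer models, if not already downloaded
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

sentence = "This is a sample sentence, showing off the stop words filtration."
tokens = word_tokenize(sentence)

# keep only tokens that are not stop words (compare in lowercase,
# since the NLTK list is all lowercase)
filtered = [w for w in tokens if w.lower() not in stop_words]
print(filtered)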
2. Prepositions (part of speech)
ref: How do I remove verbs, prepositions, conjunctions etc from my text? [closed]
ref: Alphabetical list of part-of-speech tags used in the Penn Treebank Project
>>> import nltk
>>> sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""
>>> tokens = nltk.word_tokenize(sentence)
>>> tokens
['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
>>> tagged = nltk.pos_tag(tokens)
>>> tagged[0:6]
[('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN')]
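With the tags in hand, dropping prepositions is just a filter on the Penn Treebank tag 'IN' (which also covers subordinating conjunctions); a hedged sketch along the lines of the first ref:

import nltk
# nltk.download('averaged_perceptron_tagger')  # tagger model, if needed

sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)

# drop every token tagged 'IN' (prepositions / subordinating conjunctions)
no_preps = [word for word, tag in tagged if tag != 'IN']
print(no_preps)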
3. Named Entity Recognition (NER)
ref: Introduction to Named Entity Recognition
ref: Named Entity Recognition with NLTK and SpaCy
article = '''Asian shares skidded on Tuesday after a rout in tech stocks put Wall Street to the sword, while a sharp drop in oil prices and political risks in Europe pushed the dollar to 16-month highs as investors dumped riskier assets. MSCI’s broadest index of Asia-Pacific shares outside Japan dropped 1.7 percent to a 1-1/2 week trough, with Australian shares sinking 1.6 percent. Japan’s Nikkei dived 3.1 percent led by losses in electric machinery makers and suppliers of Apple’s iphone parts. Sterling fell to $1.286 after three straight sessions of losses took it to the lowest since Nov.1 as there were still considerable unresolved issues with the European Union over Brexit, British Prime Minister Theresa May said on Monday.'''

import nltk
from nltk.tag import StanfordNERTagger

print('NLTK Version: %s' % nltk.__version__)

stanford_ner_tagger = StanfordNERTagger(
    r"D:\Twitter Data\Data\NER\stanford-ner-2018-10-16\classifiers\english.muc.7class.distsim.crf.ser.gz",
    r"D:\Twitter Data\Data\NER\stanford-ner-2018-10-16\stanford-ner-3.9.2.jar"
)

results = stanford_ner_tagger.tag(article.split())

print('Original Sentence: %s' % (article))
for result in results:
    tag_value = result[0]
    tag_type = result[1]
    if tag_type != 'O':
        print('Type: %s, Value: %s' % (tag_type, tag_value))

output:
NLTK Version: 3.4
Original Sentence: Asian shares skidded on Tuesday after a rout in tech stocks put Wall Street to the sword, while a sharp drop in oil prices and political risks in Europe pushed the dollar to 16-month highs as investors dumped riskier assets. MSCI’s broadest index of Asia-Pacific shares outside Japan dropped 1.7 percent to a 1-1/2 week trough, with Australian shares sinking 1.6 percent. Japan’s Nikkei dived 3.1 percent led by losses in electric machinery makers and suppliers of Apple’s iphone parts. Sterling fell to $1.286 after three straight sessions of losses took it to the lowest since Nov.1 as there were still considerable unresolved issues with the European Union over Brexit, British Prime Minister Theresa May said on Monday.
Type: DATE, Value: Tuesday
Type: LOCATION, Value: Europe
Type: ORGANIZATION, Value: Asia-Pacific
Type: LOCATION, Value: Japan
Type: PERCENT, Value: 1.7
Type: PERCENT, Value: percent
Type: ORGANIZATION, Value: Nikkei
Type: PERCENT, Value: 3.1
Type: PERCENT, Value: percent
Type: LOCATION, Value: European
Type: LOCATION, Value: Union
Type: PERSON, Value: Theresa
Type: PERSON, Value: May
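Because the tagger labels token by token, multi-word entities come back split ('Theresa' and 'May' are two separate PERSON rows above). A small sketch, not from the refs, that merges consecutive tokens sharing the same tag:

from itertools import groupby

# group consecutive (token, tag) pairs by tag and join the tokens,
# e.g. ('Theresa', 'PERSON') + ('May', 'PERSON') -> 'Theresa May'
for tag_type, group in groupby(results, key=lambda r: r[1]):
    if tag_type != 'O':
        entity = ' '.join(token for token, _ in group)
        print('Type: %s, Value: %s' % (tag_type, entity))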
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp(article)
for X in doc.ents:
    print('Value: %s, Type: %s' % (X.text, X.label_))

output:
Value: Asian, Type: NORP
Value: Tuesday, Type: DATE
Value: Europe, Type: LOC
Value: MSCI’s, Type: ORG
Value: Asia-Pacific, Type: LOC
Value: Japan, Type: GPE
Value: 1.7 percent, Type: PERCENT
Value: 1-1/2, Type: CARDINAL
Value: Australian, Type: NORP
Value: 1.6 percent, Type: PERCENT
Value: Japan, Type: GPE
Value: 3.1 percent, Type: PERCENT
Value: Apple, Type: ORG
Value: 1.286, Type: MONEY
Value: three, Type: CARDINAL
Value: Nov.1, Type: NORP
Value: the European Union, Type: ORG
Value: Brexit, Type: GPE
Value: British, Type: NORP
Value: Theresa May, Type: PERSON
Value: Monday, Type: DATE
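The Counter import above goes unused in this snippet; one hedged way to put it to work is summarizing how many entities of each type spaCy found:

from collections import Counter

# tally entity labels across the document
labels = [ent.label_ for ent in doc.ents]
print(Counter(labels))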
def fn_preprocess(art):
    art = nltk.word_tokenize(art)
    art = nltk.pos_tag(art)
    return art

art_processed = fn_preprocess(article)
print(art_processed)

output:
[('Asian', 'JJ'), ('shares', 'NNS'), ('skidded', 'VBN'), ('on', 'IN'), ('Tuesday', 'NNP'), ('after', 'IN'), ('a', 'DT'), ('rout', 'NN'), ('in', 'IN'), ('tech', 'JJ'), ('stocks', 'NNS'), ('put', 'VBD'), ('Wall', 'NNP'), ('Street', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('sword', 'NN'), (',', ','), ('while', 'IN'), ('a', 'DT'), ('sharp', 'JJ'), ('drop', 'NN'), ('in', 'IN'), ('oil', 'NN'), ('prices', 'NNS'), ('and', 'CC'), ('political', 'JJ'), ('risks', 'NNS'), ('in', 'IN'), ('Europe', 'NNP'), ('pushed', 'VBD'), ('the', 'DT'), ('dollar', 'NN'), ('to', 'TO'), ('16-month', 'JJ'), ('highs', 'NNS'), ('as', 'IN'), ('investors', 'NNS'), ('dumped', 'VBD'), ('riskier', 'JJR'), ('assets', 'NNS'), ('.', '.'), ('MSCI', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('broadest', 'JJS'), ('index', 'NN'), ('of', 'IN'), ('Asia-Pacific', 'NNP'), ('shares', 'NNS'), ('outside', 'IN'), ('Japan', 'NNP'), ('dropped', 'VBD'), ('1.7', 'CD'), ('percent', 'NN'), ('to', 'TO'), ('a', 'DT'), ('1-1/2', 'JJ'), ('week', 'NN'), ('trough', 'NN'), (',', ','), ('with', 'IN'), ('Australian', 'JJ'), ('shares', 'NNS'), ('sinking', 'VBG'), ('1.6', 'CD'), ('percent', 'NN'), ('.', '.'), ('Japan', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('Nikkei', 'NNP'), ('dived', 'VBD'), ('3.1', 'CD'), ('percent', 'NN'), ('led', 'VBN'), ('by', 'IN'), ('losses', 'NNS'), ('in', 'IN'), ('electric', 'JJ'), ('machinery', 'NN'), ('makers', 'NNS'), ('and', 'CC'), ('suppliers', 'NNS'), ('of', 'IN'), ('Apple', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('iphone', 'NN'), ('parts', 'NNS'), ('.', '.'), ('Sterling', 'NN'), ('fell', 'VBD'), ('to', 'TO'), ('$', '$'), ('1.286', 'CD'), ('after', 'IN'), ('three', 'CD'), ('straight', 'JJ'), ('sessions', 'NNS'), ('of', 'IN'), ('losses', 'NNS'), ('took', 'VBD'), ('it', 'PRP'), ('to', 'TO'), ('the', 'DT'), ('lowest', 'JJS'), ('since', 'IN'), ('Nov.1', 'NNP'), ('as', 'IN'), ('there', 'EX'), ('were', 'VBD'), ('still', 'RB'), ('considerable', 'JJ'), ('unresolved', 'JJ'), ('issues', 'NNS'), ('with', 'IN'), ('the', 'DT'), ('European', 'NNP'), ('Union', 'NNP'), ('over', 'IN'), ('Brexit', 'NNP'), (',', ','), ('British', 'NNP'), ('Prime', 'NNP'), ('Minister', 'NNP'), ('Theresa', 'NNP'), ('May', 'NNP'), ('said', 'VBD'), ('on', 'IN'), ('Monday', 'NNP'), ('.', '.')]
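The tokenize-then-tag output above is exactly what NLTK's built-in chunker expects; a minimal sketch of finishing the NER pipeline with nltk.ne_chunk (the download lines name the usual resource packages, included as a reminder):

import nltk
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

# feed the POS-tagged tokens to NLTK's named-entity chunker; named
# entities come back as subtrees, plain tokens as (word, tag) tuples
tree = nltk.ne_chunk(art_processed)
for subtree in tree:
    if hasattr(subtree, 'label'):
        entity = ' '.join(token for token, tag in subtree.leaves())
        print('Type: %s, Value: %s' % (subtree.label(), entity))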
4. Bag of Words
ref: An introduction to Bag of Words and how to code it in Python for NLP
import re

def word_extraction(sentence):
    ignore = ['a', "the", "is"]
    # replace every non-word character with a space, then split
    words = re.sub(r"[^\w]", " ", sentence).split()
    cleaned_text = [w.lower() for w in words if w not in ignore]
    return cleaned_text

a = "alex is. good guy."
print(word_extraction(a))

output:
['alex', 'good', 'guy']
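The ref goes all the way to count vectors, while the snippet above stops at token cleanup; a short sketch of building the actual bag-of-words vectors, assuming word_extraction() above is in scope (the sentences are made up for illustration):

def generate_bow(all_sentences):
    # vocabulary: every distinct cleaned word across all sentences, sorted
    vocab = sorted(set(w for s in all_sentences for w in word_extraction(s)))
    for sentence in all_sentences:
        words = word_extraction(sentence)
        # one count per vocabulary word, in vocabulary order
        vector = [words.count(v) for v in vocab]
        print('%s -> %s' % (sentence, vector))

generate_bow(["alex is. good guy.", "joe is not a good guy"])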