1. 歸納數據:python
import operator
import re
import string
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Very frequent English words. An n-gram containing any of them is treated
# as "common" by isCommon(). A frozenset gives O(1) membership tests and is
# built only once instead of on every call.
_COMMON_WORDS = frozenset([
    "the", "be", "and", "of", "a", "in", "to", "have", "it",
    "i", "that", "for", "you", "he", "with", "on", "do", "say", "this",
    "they", "is", "an", "at", "but", "we", "his", "from", "not",
    "by", "she", "or", "as", "what", "go", "their", "can", "who", "get",
    "if", "would", "her", "all", "my", "make", "about", "know", "will",
    "up", "one", "time", "has", "been", "there", "year", "so",
    "think", "when", "which", "them", "some", "me", "people", "take",
    "out", "into", "just", "see", "him", "your", "come", "could", "now",
    "than", "like", "other", "how", "then", "its", "our", "two", "more",
    "these", "want", "way", "look", "first", "also", "new", "because",
    "day", "use", "no", "man", "find", "here", "thing", "give",
    "many", "well",
])


def cleanInput(input):
    """Normalize raw text and split it into a list of lowercase words.

    Newline runs become a single space, the text is lowercased, bracketed
    numeric markers such as ``[12]`` are removed, repeated spaces are
    collapsed and non-ASCII characters are dropped. Each space-separated
    token then has leading/trailing punctuation stripped; tokens of length
    one are kept only if they are ``a`` or ``i``.

    Args:
        input: The raw text to clean.

    Returns:
        A list of cleaned word strings, in their original order.
    """
    input = re.sub(r'\n+', " ", input).lower()
    input = re.sub(r'\[[0-9]*\]', "", input)
    input = re.sub(r' +', " ", input)
    # Drop every character that cannot be represented in ASCII.
    input = input.encode("ascii", "ignore").decode("ascii")
    words = []
    for item in input.split(' '):
        item = item.strip(string.punctuation)
        # The text is already lowercased, so a plain comparison is enough.
        if len(item) > 1 or item in ('a', 'i'):
            words.append(item)
    return words


def ngrams(input, n):
    """Count every run of ``n`` consecutive words in ``input``.

    Args:
        input: The raw text; it is passed through ``cleanInput`` first.
        n: The number of words per n-gram.

    Returns:
        A dict mapping each n-gram (its words joined by single spaces) to
        the number of times it occurs.
    """
    words = cleanInput(input)
    output = {}
    for i in range(len(words) - n + 1):
        ngramTemp = " ".join(words[i:i + n])
        output[ngramTemp] = output.get(ngramTemp, 0) + 1
    return output


def isCommon(ngram):
    """Return True if any word of ``ngram`` is a very common English word.

    Args:
        ngram: Either a space-separated string (the key format produced by
            ``ngrams``) or an iterable of word strings.

    Returns:
        True when at least one word is in the common-word set, else False.
    """
    # Iterating a str yields single characters, which would match the
    # one-letter words "a" and "i"; split it into words first.
    words = ngram.split() if isinstance(ngram, str) else ngram
    return any(word in _COMMON_WORDS for word in words)


if __name__ == "__main__":
    # Fetch only when run as a script, and close the response when done.
    with urlopen(
            "http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
        content = str(response.read(), 'utf-8')
    # Use a separate name so the ngrams() function is not overwritten.
    bigramCounts = ngrams(content, 2)
    sortedNGrams = sorted(bigramCounts.items(), key=operator.itemgetter(1),
                          reverse=True)
    print(sortedNGrams)
2. 馬爾可夫模型:算法
對一個天氣系統創建馬爾可夫模型:api
在這個天氣系統模型中,若是今天是晴天,那麼明天有 70% 的多是晴天,20% 的可能 多雲,10% 的可能下雨。若是今天是下雨天,那麼明天有 50% 的可能也下雨,25% 的可 能是晴天,25% 的多是多雲。app
3. 廣度優先搜索算法:工具
廣度優先搜索算法的思路是優先搜尋直接鏈接到起始頁的全部連接(而不是找到一個連接 就縱向深刻搜索)。若是這些連接不包含目標頁面(你想要找的詞條),就對第二層的鏈 接——鏈接到起始頁的頁面的全部連接——進行搜索。這個過程不斷重複,直到達到搜索 深度限制或者找到目標頁面爲止。this
4.天然語言工具包:url
天然語言工具包(Natural Language Toolkit,NLTK)就是這樣一個 Python 庫,用於識別和 標記英語文本中各個詞的詞性(parts of speech)。spa