To retrieve documents about Boolean retrieval or probabilistic retrieval, but not about vector retrieval, the corresponding query expression is Q = 檢索 and (布爾 or 機率 not 向量), where the index terms are 檢索 (retrieval), 布爾 (Boolean), 機率 (probabilistic), and 向量 (vector). On the index-term vector (檢索, 布爾, 機率, 向量), Q can take the values (1,1,0,0), (1,0,1,0), or (1,1,1,0). If document Dj's term vector equals one of these, Q and Dj are considered similar; the similarity itself is also a Boolean value, i.e. sim(Q, Dj) can only be 0 or 1.
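As a minimal sketch of this model (the helper sim is hypothetical, not part of the scripts below), the query can be evaluated directly against a document's term vector; the three accepting vectors above yield 1 and everything else yields 0:

# Boolean similarity over the index-term vector (檢索, 布爾, 機率, 向量)
def sim(doc):
    retrieval, boolean, probabilistic, vector = doc
    # Q = 檢索 and (布爾 or 機率) and not 向量, spelled with explicit operators
    return int(retrieval and (boolean or probabilistic) and not vector)

for doc in [(1, 1, 0, 0), (1, 0, 1, 0), (1, 1, 1, 0), (1, 0, 0, 1)]:
    print(doc, sim(doc))  # the first three print 1, the last prints 0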
Suppose that in a web page of 1,000 words in total, the words 「原子能」 (atomic energy), 「的」 (of), and 「應用」 (application) appear 2, 35, and 5 times respectively; their term frequencies (TF) are then 0.002, 0.035, and 0.005. Adding these three numbers, the sum 0.042 is a first, naive relevance score between that page and the query 「原子能的應用」 (applications of atomic energy).
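A worked version of that arithmetic, assuming the stated raw counts and page length:

counts = {'原子能': 2, '的': 35, '應用': 5}
page_len = 1000
tf = {word: n / page_len for word, n in counts.items()}
print(tf)                # {'原子能': 0.002, '的': 0.035, '應用': 0.005}
print(sum(tf.values()))  # 0.042 (up to float rounding): the naive relevance score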
The better a word predicts the topic, the larger its weight should be; the worse, the smaller. Seeing 「原子能」 in a page tells us at least something about its topic, whereas seeing 「應用」 once leaves us essentially none the wiser, so 「原子能」 should be weighted more heavily than 「應用」. At the extreme, stop words that should simply be removed deserve a weight of zero.
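Inverse document frequency captures exactly this. A brief sketch, assuming N documents and df(w) documents containing the word w, with idf(w) = log(N / df(w)); this is the weighting that getWeight.py and TF-IDF.py below implement:

import math

N = 1000                 # assumed corpus size, for illustration only
print(math.log(N / 2))   # a rare word such as 「原子能」 gets a large weight (≈ 6.21)
print(math.log(N / N))   # a word in every document, e.g. 「的」, gets weight 0.0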
The Boolean model

def regularization(s):
    # split the raw query into tokens and collect the index terms
    ss = s.split(' ')
    expression = []
    target = {}
    for i in ss:
        if i != "and" and i != "or" and i != "not" and i != "(" and i != ")":
            if i[0] == "(":
                expression.append("(")
                expression.append(i[1:])
                target[i[1:]] = 0
            elif i[-1] == ")":
                expression.append(i[:-1])
                expression.append(")")
                target[i[:-1]] = 0
            else:
                expression.append(i)
                target[i] = 0
        else:
            expression.append(i)
    return target, expression


def analysis(line):
    output = []
    # strip the trailing newline
    t_line = line.strip('\n')
    # split the line into tokens on spaces
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        # split each token into word and POS tag on '/'
        t_word = word.split('/')
        # drop a leading '['
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        # keep each word once
        if f_word not in output:
            output.append(f_word)
    # also collect compound words bracketed as [...]
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            # split word and POS tag on '/'
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        # keep each compound word once
        if big_word not in output:
            output.append(big_word)
    return output


def getValue(target, reg):
    # convert the infix expression to Reverse Polish Notation
    RPN = []
    stack = []
    stack.append("#")
    for i in reg:
        if i in target.keys():
            RPN.append(target[i])
        elif i == "(":
            stack.append(i)
        elif i == ")":
            while stack[-1] != "(":
                RPN.append(stack.pop())
            stack.pop()
        elif i == "not":
            while stack[-1] == "not":
                RPN.append(stack.pop())
            stack.append(i)
        elif i == "and":
            while stack[-1] == "not" or stack[-1] == "and":
                RPN.append(stack.pop())
            stack.append(i)
        else:
            while stack[-1] == "not" or stack[-1] == "and" or stack[-1] == "or":
                RPN.append(stack.pop())
            stack.append(i)
    while len(stack) != 1:
        RPN.append(stack.pop())
    # evaluate the RPN expression
    ans = []
    for i in RPN:
        if i == 0 or i == 1:
            ans.append(i)
        elif i == "not":
            ans.append(1 ^ ans.pop())
        elif i == "and":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 and op2)
        elif i == "or":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 or op2)
    return ans[0]


if __name__ == '__main__':
    booltext = input("Enter a Boolean expression: ")
    target, reg = regularization(booltext)
    key_target = target.keys()
    num = 0
    with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            # print at most the first 10 matching documents
            if num >= 10:
                break
            for i in key_target:
                target[i] = 0
            if line is not None and line != "\n":
                output = analysis(line)
                for i in key_target:
                    if i in output:
                        target[i] = 1
                if getValue(target, reg):
                    print(line)
                    num = num + 1
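A usage sketch, assuming the functions above: the parser treats and/or as binary operators and not as unary, so the query from the introduction is entered with explicit operators, e.g.:

target, reg = regularization("檢索 and (布爾 or 機率) and not 向量")
target.update({'檢索': 1, '布爾': 1, '機率': 0, '向量': 0})  # a document's term vector
print(getValue(target, reg))  # 1: this document matches Q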
The TF-IDF model
getWeight.py (precompute the weights)
import sys

# count, for every word, the number of documents (lines) that contain it
output = {}
with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
    for line in f.readlines():
        if line is not None and line != "\n":
            t_line = line.strip('\n')
            words = t_line.split(' ')
            word_w = []
            for word in words[1:]:
                if word == "":
                    continue
                t_word = word.split('/')
                # drop a leading '['
                tf_word = t_word[0].split('[')
                if len(tf_word) == 2:
                    f_word = tf_word[1]
                else:
                    f_word = t_word[0]
                # count each word at most once per document
                if f_word not in word_w:
                    word_w.append(f_word)
            for f_word in word_w:
                if f_word in output.keys():
                    output[f_word] = output[f_word] + 1
                else:
                    output[f_word] = 1

# write the words in ascending order of document frequency
with open('outputWeight.txt', mode='w', encoding='UTF-8') as f:
    while output:
        minNum = sys.maxsize
        minName = ""
        for key, values in output.items():
            if values < minNum:
                minNum = values
                minName = key
        f.write(minName + ": " + str(minNum) + "\n")
        del output[minName]
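The write-out loop is in effect a selection sort on the document frequencies. An equivalent replacement for that loop, assuming the same output dictionary (same outputWeight.txt format, one "word: df" pair per line; only the order of ties may differ):

with open('outputWeight.txt', mode='w', encoding='UTF-8') as f:
    for name, num in sorted(output.items(), key=lambda kv: kv[1]):
        f.write(name + ": " + str(num) + "\n")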
TF-IDF.py
import math


def analysis(line):
    # same tokenizer as in the Boolean model above: one document per line,
    # tokens as 詞/POS, compound words bracketed as [...]
    output = []
    t_line = line.strip('\n')
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        t_word = word.split('/')
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        if f_word not in output:
            output.append(f_word)
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        if big_word not in output:
            output.append(big_word)
    return output


def getW():
    # load the document frequencies precomputed by getWeight.py
    word_list = {}
    with open('outputWeight.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list[word[0]] = word[1]
    return word_list


def BMM(origin_sentence):
    # backward maximum matching segmentation against the dictionary in output.txt
    MAX_WORD = 19
    word_list = []
    with open('output.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list.append(word[0])
    ans_word = []
    while len(origin_sentence) != 0:
        len_word = MAX_WORD
        while len_word > 0:
            # take up to MAX_WORD characters from the end; if the piece is in
            # the dictionary, record it and cut it off
            if origin_sentence[-len_word:] in word_list:
                ans_word.append(origin_sentence[-len_word:])
                len_sentence = len(origin_sentence)
                origin_sentence = origin_sentence[0:len_sentence - len_word]
                break
            # otherwise retry with a piece one character shorter
            else:
                len_word = len_word - 1
        # single characters are kept as words on their own
        if len_word == 0:
            if origin_sentence[-1:] != ' ':
                ans_word.append(origin_sentence[-1:])
            len_sentence = len(origin_sentence)
            origin_sentence = origin_sentence[0:len_sentence - 1]
    return ans_word


if __name__ == '__main__':
    w = getW()
    sentence = input("Enter a phrase: ")
    words = BMM(sentence)
    ans = []
    # count the total number of documents (one document per line)
    count = 0
    for index, line in enumerate(open('語料庫.txt', 'r', encoding='UTF-8')):
        count += 1
    with open('語料庫.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            score = 0
            if line is not None and line != "\n":
                out = analysis(line)
                for word in words:
                    # TF-IDF: tf(word, doc) * log(N / df(word)); skip words
                    # missing from the precomputed table to avoid a KeyError
                    if word in w:
                        score = score + out.count(word) / len(out) * math.log(count * 1.0 / int(w[word]))
                ans.append((line, score))
    # print the 10 highest-scoring documents
    new_ans = sorted(ans, key=lambda a: a[1], reverse=True)
    for i in range(10):
        print(new_ans[i])
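The score computed above is the standard TF-IDF sum over the query terms: score(q, d) = sum over words w in q of tf(w, d) * log(N / df(w)), where tf(w, d) = count(w, d) / len(d). A self-contained sketch of just that formula, with hypothetical names, decoupled from the corpus files:

import math

def tf_idf(query_words, doc_words, df, n_docs):
    # df maps each word to the number of documents containing it
    score = 0.0
    for word in query_words:
        if word in df:
            score += doc_words.count(word) / len(doc_words) * math.log(n_docs / df[word])
    return score

# toy check: 'x' occurs twice in a 4-word document and in 1 of 10 documents,
# so the score is (2/4) * log(10) ≈ 1.151
print(tf_idf(['x'], ['x', 'y', 'x', 'z'], {'x': 1}, 10))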