#英文預處理
# English preprocessing: clean every review, tokenize it, then build a
# frequency-ranked word -> index mapping and save it as JSON.
#
# Tokenization uses NLTK's TweetTokenizer.
# NOTE(review): the original note says @usernames are removed and emoticons
# are kept. TweetTokenizer() only strips handles when built with
# strip_handles=True, and the punctuation substitution below removes the
# characters emoticons are made of -- confirm the intended behavior.
# NOTE(review): `reviews`, `cleaned_reviews` and `path_to_IMDB` are expected
# to be defined before this point; they are not created here.
tokenizer = TweetTokenizer()

# Report progress roughly every 10% of the corpus. Clamp to 1 so that a corpus
# of fewer than five reviews does not cause a modulo-by-zero.
progress_step = max(1, round(len(reviews) / 10))

for counter, rev in enumerate(reviews):
    # remove HTML formatting
    temp = BeautifulSoup(rev)
    text = temp.get_text()
    # collapse runs of spaces
    text = re.sub(' +', ' ', text)
    # replace punctuation with a space (the result used to be stored in a
    # misspelled, never-read variable, so the substitution had no effect)
    text = re.sub(r'[()\[\]{}.,;:!?\<=>?@_^#$%"&*-]', ' ', text)
    # strip leading and trailing white space
    text = text.strip()
    # tokenize
    tokens = tokenizer.tokenize(text)
    cleaned_reviews.append(tokens)
    if counter % progress_step == 0:
        print(counter, '/', len(reviews), 'reviews cleaned')

# flatten the per-review token lists into a single list of tokens
all_tokens = [token for sublist in cleaned_reviews for token in sublist]

# count every token and rank the vocabulary from most to least frequent
counts = dict(Counter(all_tokens))
sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

# assign to each word an index based on its frequency in the corpus;
# the most frequent word gets index 1
word_to_index = {word: idx for idx, (word, _) in enumerate(sorted_counts, start=1)}

with open(path_to_IMDB + 'word_to_index_new.json', 'w') as my_file:
    json.dump(word_to_index, my_file, sort_keys=True, indent=4)
詞共現矩陣的構建
https://github.com/urgedata/pythondata/blob/master/Text%20Analytics/ericbrown.ipynb
#中文預處理正則表達式
# jieba word segmentation and stop-word removal.
# jieba can load a user-defined dictionary (per the original note, each entry
# carries the word, its part of speech and its frequency).
jieba.load_userdict('data/userdict.txt')


class keyword(object):
    """Chinese text helpers: stop-word loading, segmentation, noun extraction.

    Written for Python 3: text is handled as ``str`` throughout, so no
    explicit ``decode`` calls are needed.
    """

    def Chinese_Stopwords(self):
        """Load the stop-word list.

        Returns:
            list: every whitespace-separated entry of ``data/stopWord.txt``,
            in file order.
        """
        stopword = []
        # The encoding must be passed by keyword: the third positional
        # argument of open() is `buffering`, not the encoding.
        with open('data/stopWord.txt', 'r', encoding='utf-8') as cfp:
            for line in cfp:
                stopword.extend(line.split())
        return stopword

    def Word_cut_list(self, word_str):
        """Segment ``word_str`` with jieba and drop stop words and blanks.

        Args:
            word_str: the raw text to segment.

        Returns:
            list: the segmented words, without stop words and without
            whitespace-only tokens.
        """
        # Collapse every whitespace run (spaces, newlines, tabs) to one space.
        word_str = re.sub(r'\s+', ' ', word_str)
        # Remove punctuation, symbols and the digits listed in the pattern.
        word_str = re.sub(
            r"[\s+\.\!\/_,$%^*(+\"\']+|[+——;!,」。《》,。:「?、~@#¥%……&*()1234567①②③④)]+",
            "",
            word_str,
        )
        # A set gives constant-time membership tests inside the filter below.
        chinese_stopwords = set(self.Chinese_Stopwords())
        # Blank tokens to discard: CRLF, space, full-width space, no-break space.
        blanks = {'\r\n', ' ', '\u3000', '\xa0'}
        return [
            word
            for word in jieba.cut(word_str)
            if word not in chinese_stopwords and word not in blanks
        ]

    # Noun extraction
    def Word_pseg(self, word_str):
        """Return the custom-dictionary words and nouns found in ``word_str``.

        Args:
            word_str: the raw text to tag with ``pseg.cut``.

        Returns:
            list: words whose flag is ``'x'`` (excluding the tokens ``' '``
            and ``'ns'``) or whose flag starts with ``'n'`` but not ``'nr'``.
        """
        word_list = []
        for wds in pseg.cut(word_str):
            # Per the original note, custom-dictionary words with no part of
            # speech set default to flag 'x'.
            is_custom = wds.flag == 'x' and wds.word != ' ' and wds.word != 'ns'
            # Any noun flag ('n...') except those starting with 'nr'.
            is_noun = wds.flag.startswith('n') and not wds.flag.startswith('nr')
            if is_custom or is_noun:
                word_list.append(wds.word)
        return word_list
import tensorflow.contrib.keras as kr


def read_file(filename):
    """Read a data file with one ``label<TAB>content`` record per line.

    Lines that do not split into exactly two tab-separated fields, or whose
    label cannot be converted by ``native_content``, are skipped. Records
    with empty content are skipped as well.

    Returns:
        tuple: ``(contents, labels)`` where each content is the list of
        characters of the text and ``labels`` is the parallel list of labels.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                # Convert before appending anything so the two lists can
                # never get out of step if the conversion fails.
                label = native_content(label)
            except ValueError:
                # Wrong field count or an encoding error (UnicodeError is a
                # ValueError): best-effort loader, skip the line.
                continue
            if content:
                contents.append(list(content))  # split the text into characters
                labels.append(label)
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the character vocabulary from the training set and store it.

    The ``vocab_size - 1`` most common characters are written to
    ``vocab_dir``, one per line, preceded by ``<PAD>``.
    """
    data_train, _ = read_file(train_dir)

    counter = Counter()
    for content in data_train:
        counter.update(content)

    count_pairs = counter.most_common(vocab_size - 1)
    # <PAD> comes first so that it receives id 0 when the file is read back;
    # it is used to pad all texts to the same length.
    words = ['<PAD>'] + [word for word, _ in count_pairs]
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    """Read the vocabulary file.

    Returns:
        tuple: ``(words, word_to_id)`` where ``word_to_id`` maps each word to
        its line position in the file.
    """
    with open_file(vocab_dir) as fp:
        # native_content handles the py2/py3 string difference.
        words = [native_content(line.strip()) for line in fp]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file into padded id sequences and one-hot labels.

    Characters missing from ``word_to_id`` are dropped. A label missing from
    ``cat_to_id`` raises ``KeyError``.

    Returns:
        tuple: ``(x_pad, y_pad)`` as produced by ``pad_sequences`` and
        ``to_categorical``.
    """
    contents, labels = read_file(filename)

    data_id = [
        [word_to_id[x] for x in content if x in word_to_id]
        for content in contents
    ]
    label_id = [cat_to_id[label] for label in labels]

    # Pad every text to a fixed length with keras' pad_sequences.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # Convert the labels to a one-hot representation.
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
# Build the character vocabulary.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
# Sort the characters so the char <-> index mapping is the same on every run;
# the iteration order of a set of strings is not stable between runs.
chars = sorted(set(text))
print ('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Input generation for a Keras LSTM: with the vocabulary in place, one-hot
# encode the data. `sentences`, `next_chars` and `maxlen` are expected to be
# defined before this point.
print('Vectorization...')
# The builtin `bool` replaces the `np.bool` alias, which NumPy has removed.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
print(X)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
# Filter by word length, drop stop words, keep Chinese-only words.
_CHINESE_ONLY = re.compile(r"[\u4e00-\u9fa5]+")


def is_fine_word(word, min_length=2):
    """Tell whether ``word`` should be kept.

    A word is kept when it has at least ``min_length`` characters, consists
    solely of characters in the range U+4E00..U+9FA5, and is not listed in
    ``STOP_WORDS``.

    Args:
        word: the candidate word.
        min_length: minimum number of characters required.

    Returns:
        bool: True if the word passes all three checks.
    """
    if len(word) < min_length:
        return False
    # fullmatch instead of ``^...$``: ``$`` also matches just before a
    # trailing newline, which would let "word\n" through.
    if _CHINESE_ONLY.fullmatch(word) is None:
        return False
    return word not in STOP_WORDS
# Character-by-character splitting. Common function words are filtered out
# afterwards, using the `stopwords` string defined below.
def singCut(text):
    """Split ``text`` into its items, trimming punctuation and dropping blanks.

    Each item of ``text`` (each character when ``text`` is a string) is
    stripped of the listed punctuation and whitespace characters at both
    ends; items that end up empty are discarded.

    Returns:
        list: the remaining non-empty items, in their original order.
    """
    pieces = []
    for item in text:
        trimmed = item.strip('\n').strip('\r').strip('。')
        trimmed = trimmed.strip(',|)|:|{|}|「|」 |(|\n')
        if trimmed:  # drop empty strings
            pieces.append(trimmed)
    return pieces


text = '雲橫秦嶺家何在,雪擁藍關馬不前'
# General-purpose list of function words.
stopwords = '而|何|乎|乃|君|其|若|所|爲'.replace('君', '且')
# Strip punctuation from every character that is not a function word, and keep
# only the non-empty per-character lists.
poem = [
    kept
    for kept in (
        [ch.strip(') |: |?|{|}| 「|」 (| \n\n\r|。') for ch in segment if ch not in stopwords]
        for segment in text
    )
    if kept
]
預處理(去特殊符號、去停用詞、分詞)
把詞轉成index(word to index), 把原文都變成數值
去掉詞頻排在 top N 的詞,以及詞頻小於 top M 的詞
對每篇進行 truncation and padding
word2vec訓練 獲得 w2v_model[word] 的embedding,加入CNN做爲初始值(Keras裏面訓練須要把每一個詞轉成embedding這種)
訓練CNN模型
https://github.com/Tixierae/deep_learning_NLP
構建詞彙表
categories轉成id, 讀取詞彙表,構建word_to_id字典(字符級別)
讀入訓練數據,預處理,將文本pad到固定長度
批次訓練CNN(tensorflow內部會自動初始化embedding)
預測
https://github.com/gaussic/text-classification-cnn-rnn
引用連接:
https://www.jianshu.com/p/aea87adee163