1. Install Hugging Face's transformers
pip install transformers
2. Download the required files
Vocabulary file:
wget http://52.216.242.246/models.huggingface.co/bert/bert-base-uncased-vocab.txt
Config file:
wget http://52.216.242.246/models.huggingface.co/bert/bert-base-uncased-config.json
Model weights:
wget http://52.216.242.246/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin
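Once the three files are downloaded, a quick sanity check (a sketch of my own, assuming the files sit in the current directory under the names used by wget above) is to load them directly; this mirrors how the training code below loads them:

import torch
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased-vocab.txt")   # vocab file
config = BertConfig.from_json_file("bert-base-uncased-config.json")        # config file
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased-pytorch_model.bin", config=config)                  # weights
print(tokenizer.tokenize("hello world"))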
3. Dataset
The simplifyweibo_4_moods dataset:
Download link: https://pan.baidu.com/s/16c93E5x373nsGozyWevITg
Over 360,000 sentiment-labeled entries covering 4 emotions: roughly 200,000 labeled joy, and about 50,000 each labeled anger, disgust, and depression.
0: joy  1: anger  2: disgust  3: depression
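To verify the download, here is a short sketch (assuming the CSV has the 'label' and 'review' columns the training code below relies on) that prints the label distribution:

import pandas as pd

df = pd.read_csv("simplifyweibo_4_moods.csv")
moods = {0: "joy", 1: "anger", 2: "disgust", 3: "depression"}
print(df["label"].map(moods).value_counts())   # ~200k joy, ~50k each for the rest
print(df["review"].head())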
四、完整代碼
Directory structure (in Google Colab):
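The original screenshot of the directory tree is not reproduced here; judging from the paths used in the code below, the layout on Google Drive is roughly:

/content/drive/Shareddrives/xiximayou/
├── pretrained/bert-base-uncased/
│   ├── bert-base-uncased-vocab.txt
│   ├── bert-base-uncased-config.json
│   └── bert-base-uncased-pytorch_model.bin
└── datasets/
    ├── simplifyweibo_4_moods/simplifyweibo_4_moods.csv
    └── pt/bert-base-uncased/   (fine-tuned checkpoints are saved here)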
import os
import pprint
import random

import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizer, BertConfig, BertForSequenceClassification,
                          AdamW, AutoTokenizer, AutoModel)
from transformers import get_linear_schedule_with_warmup


class MyBertModel:
    def __init__(self, train, vocab_path, config_path, pretrain_Model_path, saveModel_path,
                 learning_rate, n_class, epochs, batch_size, val_batch_size, max_len, gpu=True):
        self.n_class = n_class            # number of classes
        self.max_len = max_len            # maximum sentence length
        self.lr = learning_rate           # learning rate
        self.epochs = epochs
        self.tokenizer = BertTokenizer.from_pretrained(vocab_path)   # load the tokenizer
        text_list, labels = self.load_data(train)                    # load the training data
        train_x, val_x, train_y, val_y = self.split_train_val(text_list, labels)
        self.train = self.process_data(train_x, train_y)
        self.validation = self.process_data(val_x, val_y)
        self.batch_size = batch_size      # training batch size
        self.val_batch_size = val_batch_size
        self.saveModel_path = saveModel_path   # where checkpoints are saved
        self.gpu = gpu                    # whether to use the GPU
        config = BertConfig.from_json_file(config_path)   # load the BERT config
        config.num_labels = n_class       # number of outputs of the classification head
        self.model = BertForSequenceClassification.from_pretrained(pretrain_Model_path,
                                                                   config=config)   # load the BERT classifier
        if self.gpu:
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            self.device = torch.device('cuda')
        else:
            self.device = 'cpu'

    def encode_fn(self, text_list):
        # Encode text_list into the inputs BERT expects,
        # e.g. text_list = ['我愛你', '貓不是狗']
        encoding = self.tokenizer(
            text_list,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'   # return PyTorch tensors
        )
        return encoding['input_ids'], encoding['token_type_ids'], encoding['attention_mask']

    def load_data(self, path):
        df = pd.read_csv(path)
        text_list = df['review'].to_list()
        labels = df['label'].to_list()
        return text_list, labels

    def process_data(self, text_list, labels):
        input_ids, token_type_ids, attention_mask = self.encode_fn(text_list)
        labels = torch.tensor(labels)
        return TensorDataset(input_ids, token_type_ids, attention_mask, labels)

    def split_train_val(self, data, labels):
        train_x, val_x, train_y, val_y = train_test_split(data, labels,
                                                          test_size=0.2, random_state=0)
        return train_x, val_x, train_y, val_y

    def flat_accuracy(self, preds, labels):
        """Accuracy over a batch; argmax of the raw logits is enough, since softmax preserves the argmax."""
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return accuracy_score(labels_flat, pred_flat)

    def train_model(self):
        if self.gpu:
            self.model.cuda()
        optimizer = AdamW(self.model.parameters(), lr=self.lr)
        trainData = DataLoader(self.train, batch_size=self.batch_size, shuffle=True)    # split into batches
        valData = DataLoader(self.validation, batch_size=self.val_batch_size, shuffle=True)
        total_steps = len(trainData) * self.epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                    num_training_steps=total_steps)
        for epoch in range(self.epochs):
            self.model.train()
            total_loss, total_val_loss = 0, 0
            total_eval_accuracy = 0
            print('epoch:', epoch, ', step_number:', len(trainData))
            # training loop
            for step, batch in enumerate(trainData):
                optimizer.zero_grad()   # clear old gradients (missing in the original; added as a fix)
                outputs = self.model(input_ids=batch[0].to(self.device),
                                     token_type_ids=batch[1].to(self.device),
                                     attention_mask=batch[2].to(self.device),
                                     labels=batch[3].to(self.device))
                # outputs holds the loss and the raw per-class scores (logits);
                # they only become probabilities after a softmax
                loss, logits = outputs.loss, outputs.logits
                total_loss += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                if step % 10 == 0 and step > 0:
                    # print the training accuracy every 10 steps
                    logits = logits.detach().cpu().numpy()
                    label_ids = batch[3].cpu().numpy()
                    avg_val_accuracy = self.flat_accuracy(logits, label_ids)
                    print('step:', step)
                    print(f'Accuracy: {avg_val_accuracy:.4f}')
                    print('\n')
            # at the end of every epoch, evaluate on the validation set
            self.model.eval()
            print('testing ....')
            for i, batch in enumerate(valData):
                with torch.no_grad():
                    outputs = self.model(input_ids=batch[0].to(self.device),
                                         token_type_ids=batch[1].to(self.device),
                                         attention_mask=batch[2].to(self.device),
                                         labels=batch[3].to(self.device))
                    loss, logits = outputs.loss, outputs.logits
                    total_val_loss += loss.item()
                    logits = logits.detach().cpu().numpy()
                    label_ids = batch[3].cpu().numpy()
                    total_eval_accuracy += self.flat_accuracy(logits, label_ids)
            avg_train_loss = total_loss / len(trainData)
            avg_val_loss = total_val_loss / len(valData)
            avg_val_accuracy = total_eval_accuracy / len(valData)
            print(f'Train loss     : {avg_train_loss}')
            print(f'Validation loss: {avg_val_loss}')
            print(f'Accuracy: {avg_val_accuracy:.4f}')
            print('\n')
            self.save_model(self.saveModel_path + '-' + str(epoch))

    def save_model(self, path):
        # save the classification model together with its tokenizer
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def load_model(self, path):
        # load the tokenizer and the classification model from a checkpoint
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = BertForSequenceClassification.from_pretrained(path)
        return tokenizer, model

    def eval_model(self, Tokenizer, model, text_list, y_true):
        # print the model's precision, recall and f1-score
        preds = self.predict_batch(Tokenizer, model, text_list)
        print(classification_report(y_true, preds))

    def predict_batch(self, Tokenizer, model, text_list):
        encoding = Tokenizer(
            text_list,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'   # return PyTorch tensors
        )
        pred_data = TensorDataset(encoding['input_ids'], encoding['token_type_ids'],
                                  encoding['attention_mask'])
        pred_dataloader = DataLoader(pred_data, batch_size=self.batch_size, shuffle=False)
        model = model.to(self.device)
        model.eval()
        preds = []
        for i, batch in enumerate(pred_dataloader):
            with torch.no_grad():
                outputs = model(input_ids=batch[0].to(self.device),
                                token_type_ids=batch[1].to(self.device),
                                attention_mask=batch[2].to(self.device))
                logits = outputs[0].detach().cpu().numpy()
                preds += list(np.argmax(logits, axis=1))
        return preds


if __name__ == '__main__':
    epoch = 3
    pretrained_path = "/content/drive/Shareddrives/xiximayou/pretrained/bert-base-uncased"
    dataset_path = "/content/drive/Shareddrives/xiximayou/datasets"
    save_path = "/content/drive/MyDrive/test_hugging face/checkpoint"
    train_path = os.path.join(dataset_path, "simplifyweibo_4_moods/simplifyweibo_4_moods.csv")
    save_model_path = os.path.join(dataset_path, "pt/bert-base-uncased")
    bert_tokenizer_path = os.path.join(pretrained_path, "bert-base-uncased-vocab.txt")
    bert_config_path = os.path.join(pretrained_path, "bert-base-uncased-config.json")
    bert_model_path = os.path.join(pretrained_path, "bert-base-uncased-pytorch_model.bin")
    model_name = "bert_weibo"
    myBertModel = MyBertModel(
        train=train_path,
        vocab_path=bert_tokenizer_path,
        config_path=bert_config_path,
        pretrain_Model_path=bert_model_path,
        saveModel_path=os.path.join(save_model_path, model_name),
        learning_rate=2e-5,
        n_class=4,
        epochs=epoch,
        batch_size=4,
        val_batch_size=4,
        max_len=100,
        gpu=True
    )
    myBertModel.train_model()
    Tokenizer, model = myBertModel.load_model(myBertModel.saveModel_path + '-' + str(epoch - 1))
    # text_list, y_true = myBertModel.load_data_predict('xxx.csv')
    # myBertModel.eval_model(Tokenizer, model, text_list, y_true)
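After training, inference can reuse predict_batch() on the checkpoint loaded above; a hedged sketch (the sample sentences are my own):

moods = {0: "joy", 1: "anger", 2: "disgust", 3: "depression"}
texts = ["今天天氣真好,心情舒暢", "真是氣死我了"]   # made-up examples
preds = myBertModel.predict_batch(Tokenizer, model, texts)
for text, pred in zip(texts, preds):
    print(text, "->", moods[pred])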
The output returned by the model looks like this:

SequenceClassifierOutput(loss=tensor(1.2209, device='cuda:0', grad_fn=<NllLossBackward>),
                         logits=tensor([[-0.0275, -0.2090, -0.1251, -0.2942],
                                        [ 0.0310, -0.2028, -0.1399, -0.3605],
                                        [-0.0671, -0.3543, -0.1225, -0.4625],
                                        [ 0.1389, -0.1244, -0.2310, -0.3664]],
                                       device='cuda:0', grad_fn=<AddmmBackward>),
                         hidden_states=None, attentions=None)

So we read outputs.loss and outputs.logits to get the loss and the raw class scores.
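For example, to turn those logits into per-class probabilities and predicted labels (a minimal sketch continuing from the outputs object above):

import torch
import torch.nn.functional as F

probs = F.softmax(outputs.logits, dim=1)   # probabilities over the 4 moods
preds = torch.argmax(probs, dim=1)         # predicted class ids, here tensor([0, 0, 0, 0])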
from transformers import (
    DataProcessor,
    InputExample,
    BertConfig,
    BertTokenizer,
    BertForSequenceClassification,
    glue_convert_examples_to_features,
)
import torch.nn as nn
import torch


class Processor(DataProcessor):
    """Data processing for entity linking."""

    def __init__(self, tokenizer):
        # the tokenizer is passed in explicitly here; the original relied on a
        # module-level `tokenizer` variable
        super(Processor, self).__init__()
        self.tokenizer = tokenizer

    def get_train_examples(self, lines, labels):
        return self._create_examples(lines, labels, set_type='train')

    def get_test_examples(self, lines, labels):
        return self._create_examples(lines, labels, set_type='test')

    def get_labels(self):
        return [0, 1]

    def _create_examples(self, lines, labels, set_type):
        # wrap every sentence of every document in an InputExample
        document_examples = []
        for i, document in enumerate(lines):
            sentence_examples = []
            for j, sentence in enumerate(document):
                guid = f'{set_type}-{i}-{j}'
                sentence_examples.append(InputExample(
                    guid=guid,
                    text_a=sentence,
                    label=labels[i][j],
                ))
            document_examples.append(sentence_examples)
        return document_examples

    def _create_features(self, document_examples):
        features = []
        for examples in document_examples:
            document_features = glue_convert_examples_to_features(
                examples,
                self.tokenizer,
                max_length=30,
                label_list=self.get_labels(),
                output_mode='classification'
            )
            features.append(document_features)
        return features

    def creat_train_dataloader(self, train_features):
        dataloader = []
        for document_features in train_features:
            input_ids = torch.LongTensor([f.input_ids for f in document_features])
            attention_mask = torch.LongTensor([f.attention_mask for f in document_features])
            token_type_ids = torch.LongTensor([f.token_type_ids for f in document_features])
            labels = torch.LongTensor([f.label for f in document_features])
            dataloader.append([input_ids, attention_mask, token_type_ids, labels])
        return dataloader

    def generate_dataloader(self, data):
        # data = (train_lines, train_labels, valid_lines, valid_labels, test_lines, test_labels)
        train_dataloader = self.creat_train_dataloader(
            self._create_features(self._create_examples(data[0], data[1], 'train')))
        valid_dataloader = self.creat_train_dataloader(
            self._create_features(self._create_examples(data[2], data[3], 'valid')))
        test_dataloader = self.creat_train_dataloader(
            self._create_features(self._create_examples(data[4], data[5], 'test')))
        return train_dataloader, valid_dataloader, test_dataloader
The Processor class above inherits from transformers' DataProcessor.
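A hedged usage sketch for the class, on toy nested data (documents made of sentences); the explicit tokenizer argument is my own addition to __init__, and the toy documents are invented:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = Processor(tokenizer)
docs = [["first sentence of doc 0", "second sentence"],
        ["only sentence of doc 1"]]      # made-up documents
labels = [[0, 1], [1]]                   # per-sentence labels
examples = processor._create_examples(docs, labels, set_type="train")
features = processor._create_features(examples)
batches = processor.creat_train_dataloader(features)
print(batches[0][0].shape)   # input_ids of document 0: (2, 30)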