from glob import glob
from tqdm import tqdm
import numpy as np
import pickle
#from sklearn.model_selection import train_test_split
import torch
import os
import ast
#from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
#from sklearn.utils import shuffle
from transformers import get_cosine_schedule_with_warmup
from torch.nn import functional as F
import random
import pandas as pd
from .datas import make_dataset, make_extract_dataset
from .utils import set_seed, accuracy_per_class, compute_metrics, model_eval, checkpoint_save, EarlyStopping, model_freeze, get_hidden
from .model import classification_model
from transformers import BigBirdTokenizer
import transformers
class NLP_classification():
    def __init__(self, model_name=None, data_file=None, max_length=None, random_state=1000, task_type='onehot',
                 freeze_layers=None, num_classifier=1, num_pos_emb_layer=1, gpu_num=0, sentence_piece=True, bertsum=False):
        self.model_name = model_name
        self.data_file = data_file
        self.max_length = max_length
        self.random_state = random_state
        self.task_type = task_type
        # BigBird needs its dedicated tokenizer; every other model goes through AutoTokenizer.
        if model_name == 'google/bigbird-roberta-base':
            self.tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
        self.config = AutoConfig.from_pretrained(model_name, num_labels=6)
        #self.pretrained_model = AutoModelForSequenceClassification.from_config(self.config)
        self.pretrained_model = AutoModel.from_config(self.config)
        self.freeze_layers = freeze_layers
        self.num_classifier = num_classifier
        self.num_pos_emb_layer = num_pos_emb_layer
        self.gpu_num = gpu_num
        self.sentence_piece = sentence_piece
        self.bertsum = bertsum
        # Pad to the longest sequence in each batch unless an explicit max_length is given.
        if self.max_length is None:
            self.padding = 'longest'
        else:
            self.padding = 'max_length'
    def training(self, epochs=50, batch_size=4, lr=1e-5, dropout=0.1, data_cut=None, early_stop_count=10,
                 wandb_log=False, wandb_project=None, wandb_group=None, wandb_name=None, wandb_memo=None):
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        #device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
        #torch.cuda.set_device(device)
        set_seed(self.random_state)
        torch.set_num_threads(10)
        if wandb_log is True:
            import wandb
            wandb.init(project=wandb_project, reinit=True, group=wandb_group, notes=wandb_memo)
            wandb.run.name = wandb_name
            wandb.run.save()
            parameters = wandb.config
            parameters.lr = lr
            parameters.batch_size = batch_size
            parameters.dropout = dropout
            parameters.train_num = data_cut
            parameters.max_length = self.max_length
            parameters.model_name = self.model_name
            parameters.task_type = self.task_type
        ''' data loading '''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length,
                                                  padding=self.padding, random_state=self.random_state, data_cut=data_cut,
                                                  sentence_piece=self.sentence_piece)
        ''' loader making '''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model = model_freeze(model, self.freeze_layers)
        model.to(device)
        ''' running setting '''
        loss_fn = torch.nn.BCEWithLogitsLoss()  # kept for reference; the loss is actually computed inside the model
        optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, eps=1e-8)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(train_loader) * epochs)
        early_stopping = EarlyStopping(patience=early_stop_count, verbose=True)
        ''' running '''
        best_epoch = None
        best_val_f1 = None
        for epoch in range(epochs):
            model.train()
            loss_all = 0
            for data in tqdm(train_loader):
                input_ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                if self.task_type == 'onehot':
                    targets = data['label_onehot'].to(device, dtype=torch.float)
                elif self.task_type == 'scalar':
                    targets = data['label'].to(device, dtype=torch.long)
                position = data['position']
                inputs = {'input_ids': input_ids, 'attention_mask': mask, 'token_type_ids': token_type_ids,
                          'labels': targets, 'position': position}
                if self.sentence_piece:
                    inputs['sentence_batch'] = data['sentence_batch'].to(device, dtype=torch.long)
                # The model returns (loss, logits); the loss is computed inside the model.
                outputs = model(inputs)
                loss = outputs[0]
                optimizer.zero_grad()
                #loss = loss_fn(outputs[1], targets)
                loss_all += loss.item()
                loss.backward()
                optimizer.step()
                scheduler.step()
                #print(optimizer.param_groups[0]['lr'])
            train_loss = loss_all / len(train_loader)
            val_loss, val_acc, val_precision, val_recall, val_f1 = model_eval(model, device, val_loader, task_type=self.task_type,
                                                                              sentence_piece=self.sentence_piece)
            if wandb_log is True:
                wandb.log({'train_loss': train_loss, 'val_loss': val_loss, 'val_acc': val_acc,
                           'val_precision': val_precision, 'val_recall': val_recall, 'val_f1': val_f1})
            # Checkpoint whenever the validation F1 matches or beats the best seen so far.
            if best_val_f1 is None or val_f1 >= best_val_f1:
                best_epoch = epoch + 1
                best_val_f1 = val_f1
                checkpoint_save(model, val_f1, wandb_name=wandb_name)
            print('Epoch: {:03d}, Train Loss: {:.7f}, Val Loss: {:.7f}, Val Acc: {:.7f}, Val Precision: {:.7f}, Val Recall: {:.7f}, Val F1: {:.7f}'.format(
                epoch + 1, train_loss, val_loss, val_acc, val_precision, val_recall, val_f1))
            early_stopping(val_f1)
            if early_stopping.early_stop:
                print("Early stopping")
                break
        # Only close the run when wandb logging was enabled; otherwise wandb was never imported.
        if wandb_log is True:
            wandb.finish()
    def prediction(self, selected_model=None, batch_size=8):
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        set_seed(self.random_state)
        torch.set_num_threads(10)
        ''' data loading '''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length,
                                                  padding=self.padding, random_state=self.random_state, data_cut=None,
                                                  sentence_piece=self.sentence_piece)
        ''' loader making '''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.load_state_dict(torch.load(selected_model))
        model.to(device)
        ''' prediction '''
        print('start trainset prediction')
        train_results = model_eval(model, device, train_loader, task_type=self.task_type, return_values=True,
                                   sentence_piece=self.sentence_piece)
        print('start evalset prediction')
        eval_results = model_eval(model, device, val_loader, task_type=self.task_type, return_values=True,
                                  sentence_piece=self.sentence_piece)
        print('train result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(train_results[1], train_results[2], train_results[3], train_results[4]))
        print('eval result: acc:{0} | precision:{1} | recall:{2} | f1:{3}'.format(eval_results[1], eval_results[2], eval_results[3], eval_results[4]))
        total_text = train_results[7] + eval_results[7]
        total_out = train_results[6] + eval_results[6]
        total_target = train_results[5] + eval_results[5]
        if self.task_type == 'onehot':
            total_out = [i.argmax() for i in total_out]
            total_target = [i.argmax() for i in total_target]
        total_data = {'text': total_text, 'label': total_target, 'predict': total_out}
        total_df = pd.DataFrame(total_data)
        ''' result return '''
        return total_df
    def get_embedding(self, selected_model=None, batch_size=8, return_hidden=True, return_hidden_pretrained=False):
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        #device = torch.device('cuda:{0}'.format(int(self.gpu_num)))
        #torch.cuda.set_device(device)
        set_seed(self.random_state)
        torch.set_num_threads(10)
        ''' data loading '''
        train_dataset, val_dataset = make_dataset(csv_file=self.data_file, tokenizer=self.tokenizer, max_length=self.max_length,
                                                  padding=self.padding, random_state=self.random_state, data_cut=None,
                                                  sentence_piece=self.sentence_piece)
        ''' loader making '''
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.return_hidden = return_hidden
        model.return_hidden_pretrained = return_hidden_pretrained
        # Without a fine-tuned checkpoint, the model's initial (pretrained/random) weights are used as-is.
        if selected_model is not None:
            model.load_state_dict(torch.load(selected_model))
        model.to(device)
        ''' get hidden '''
        print('start making hidden states (train set)')
        train_hiddens, train_targets = get_hidden(model, device, train_loader, task_type=self.task_type, sentence_piece=self.sentence_piece)
        print('start making hidden states (eval set)')
        eval_hiddens, eval_targets = get_hidden(model, device, val_loader, task_type=self.task_type, sentence_piece=self.sentence_piece)
        total_hiddens = np.array(train_hiddens + eval_hiddens)
        total_targets = np.array(train_targets + eval_targets)
        return total_hiddens, total_targets
    def label_extraction(self, paragraphs, positions, selected_model=None, batch_size=16):
        label_dict = {'Abstract': 0, 'Introduction': 1, 'Main': 2, 'Methods': 3, 'Summary': 4, 'Captions': 5}
        #os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(int(self.gpu_num))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        set_seed(self.random_state)
        torch.set_num_threads(10)
        ''' data to list '''
        # Accept a single paragraph/position as well as lists; remember which form was passed in.
        is_list = True
        if not isinstance(paragraphs, list):
            paragraphs = [paragraphs]
            is_list = False
        if not isinstance(positions, list):
            positions = [positions]
            is_list = False
        ''' data encoding '''
        dataset = make_extract_dataset(paragraphs, positions, tokenizer=self.tokenizer, max_length=self.max_length)
        ''' loader making '''
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
        ''' model load '''
        model = classification_model(self.pretrained_model, self.config, num_classifier=self.num_classifier,
                                     num_pos_emb_layer=self.num_pos_emb_layer, bertsum=self.bertsum, device=device)
        model.load_state_dict(torch.load(selected_model))
        model.to(device)
        ''' prediction '''
        model.eval()
        predicts = []
        with torch.no_grad():
            for batch in tqdm(data_loader):
                inputs = {}
                inputs['input_ids'] = batch['input_ids'].to(device)
                inputs['attention_mask'] = batch['attention_mask'].to(device)
                inputs['token_type_ids'] = batch['token_type_ids'].to(device)
                inputs['position'] = batch['position']
                outputs = model(inputs)
                logits = outputs[1]
                logits = logits.detach().cpu().numpy()
                logits = logits.argmax(axis=1).flatten()
                predicts.extend(logits.tolist())
        # Map predicted class indices back to section names.
        id_to_label = {v: k for k, v in label_dict.items()}
        predicts = [id_to_label[i] for i in predicts]
        if not is_list:
            predicts = predicts[0]
        return predicts
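
# Usage sketch (illustrative only): the model name, CSV path, checkpoint path, paragraph text,
# and position value below are hypothetical placeholders, not files or data shipped with this module.
if __name__ == '__main__':
    clf = NLP_classification(model_name='bert-base-cased',
                             data_file='paragraphs.csv',      # hypothetical labelled-paragraph CSV
                             max_length=512,
                             task_type='onehot',
                             sentence_piece=False)
    clf.training(epochs=10, batch_size=4, lr=1e-5, wandb_log=False)
    result_df = clf.prediction(selected_model='checkpoint.pt')   # hypothetical saved checkpoint
    print(result_df.head())
    sections = clf.label_extraction(['We measured the photoluminescence spectra ...'],
                                    [0.5],                       # hypothetical paragraph position
                                    selected_model='checkpoint.pt')
    print(sections)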