import pandas as pd
import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# non-augmented training dataset
# df = pd.read_csv(r".\train_set.csv")
# augmented training dataset
df = pd.read_csv(r".\cleaned_combined_aug_set.csv")
# df.info()
value_counts = df['label'].value_counts()
print(value_counts)
test_df = pd.read_csv(r".\test_set.csv")
# test_df.info()
print(test_df['label'].value_counts())
# load the pretrained BERT tokenizer and a binary sequence-classification head
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to('cuda')
# independent variable: article text
X = list(df['article'])
X_test = list(test_df['article'])
# dependent variable: binary label
y = list(df['label'])
y_test = list(test_df['label'])

# tokenize with truncation and padding to a fixed length of 512 tokens
max_length = 512
train_encodings = tokenizer(X, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
test_encodings = tokenizer(X_test, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

torch_train_dataset = CustomDataset(train_encodings, y)
torch_test_dataset = CustomDataset(test_encodings, y_test)
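# Optional sanity check (added here, not part of the original script): inspect one
# training item to confirm the tensor shapes and label the Trainer will receive.
sample = torch_train_dataset[0]
print({k: v.shape for k, v in sample.items()})
print(len(torch_train_dataset), len(torch_test_dataset))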
training_args = TrainingArguments(
    output_dir='./results/fake-news-bert-aug',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3
)
def compute_metrics(p):
    # p is an EvalPrediction: (predictions, label_ids)
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=torch_train_dataset,
    eval_dataset=torch_test_dataset,
    compute_metrics=compute_metrics
)
trainer.train()
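# Saving the fine-tuned checkpoint (an added suggestion, not in the original script):
# Trainer.save_model and the tokenizer's save_pretrained write what is needed to
# reload the model later; the output directory name here is only an assumption.
trainer.save_model('./results/fake-news-bert-aug/final')
tokenizer.save_pretrained('./results/fake-news-bert-aug/final')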
def predict(text):
    # Trainer.predict expects a Dataset, not raw text, so tokenize and wrap first
    enc = tokenizer([text], truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    return trainer.predict(CustomDataset(enc, [0]))  # dummy label; only the logits are used
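# Minimal usage sketch (added for illustration, not in the original script): run the
# predict() helper on one string and take the argmax over the output logits. The
# sample text and the meaning of the 0/1 class indices are assumptions.
output = predict("Breaking: example headline to classify")
predicted_class = int(np.argmax(output.predictions, axis=1)[0])
print(predicted_class)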