---
license: cc0-1.0
language:
- fi
base_model:
- TurkuNLP/bert-base-finnish-uncased-v1
tags:
- difficulty
- cefr
- regression
- bert
---
# Text Difficulty Regression Model
A regression model that predicts a difficulty score for an input Finnish text. Predicted scores can be mapped to CEFR levels.
## Model Details
Frozen encoder layers of TurkuNLP/bert-base-finnish-uncased-v1 with a regression head on top. Trained on a mix of manually annotated datasets (more details on the data will follow) and data translated from Russian into Finnish.
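The card does not spell out which layers are frozen. As a hedged illustration, freezing could look like the sketch below, applied to the `CustomModel` instance from the next section (freezing the embeddings plus the lower encoder layers, and the cutoff index, are assumptions, not the documented setup):
```python
# Hypothetical freezing scheme: the card only states that BERT layers are
# frozen; which layers are frozen here (embeddings + lower encoder layers)
# and the cutoff index are illustrative assumptions.
for param in model.bert.embeddings.parameters():
    param.requires_grad = False
for layer in model.bert.encoder.layer[:8]:
    for param in layer.parameters():
        param.requires_grad = False
```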
## How to Get Started with the Model
Use the code below to get started with the model.
```python
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoTokenizer, BertModel, BertPreTrainedModel


class CustomModel(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        # Regression head: hidden_size -> 128 -> 1
        self.pre_classifier = nn.Linear(config.hidden_size, 128)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(128, 1)
        self.activation = nn.ReLU()
        # He initialisation for the ReLU head, zero biases
        nn.init.kaiming_uniform_(self.pre_classifier.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.classifier.weight, nonlinearity='relu')
        if self.pre_classifier.bias is not None:
            nn.init.constant_(self.pre_classifier.bias, 0)
        if self.classifier.bias is not None:
            nn.init.constant_(self.classifier.bias, 0)

    def forward(
        self,
        input_ids,
        labels=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )
        # Regress on the pooled [CLS] representation
        pooled_output = outputs.pooler_output
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = self.activation(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(logits.view(-1), labels.view(-1))
            return loss, logits
        return None, logits


# Inference: load the custom fine-tuned model
from safetensors.torch import load_file

model_path = "path/to/this/repo"  # local clone or downloaded snapshot of this repository
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
config.num_labels = 1

model = CustomModel(config)
state_dict = load_file(f"{model_path}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

text = "..."  # the Finnish text to score
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()}
with torch.no_grad():
    _, logits = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        token_type_ids=inputs["token_type_ids"],
    )
```
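The head regresses on the pooled `[CLS]` representation (`pooler_output`) rather than mean-pooled token states, and the single output unit is trained with MSE loss, so the raw logit is directly the predicted difficulty score.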
To map the predicted score to a CEFR level, use:
```python
reg2cl2 = {
    "0.0": "A1", "1.0": "A1", "1.5": "A1-A2", "2.0": "A2",
    "2.5": "A2-B1", "3.0": "B1", "3.5": "B1-B2", "4.0": "B2",
    "4.5": "B2-C1", "5.0": "C1", "5.5": "C1-C2", "6.0": "C2"
}
# Round to the nearest 0.5 so the half levels (e.g. "A2-B1") are reachable
score = round(logits.item() * 2) / 2
print("Predicted output (logits):", logits.item(), reg2cl2[str(score)])
```
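Raw predictions can fall outside the 0–6 range, and the table has no "0.5" entry, so a direct lookup can fail. A small helper makes the mapping robust; the clamping policy and the whole-level fallback are assumptions, not part of the original card:
```python
def score_to_cefr(score: float) -> str:
    """Map a raw difficulty score to a CEFR label via reg2cl2."""
    score = min(max(score, 0.0), 6.0)  # clamp to the 0-6 scale (assumption)
    key = str(round(score * 2) / 2)    # nearest half level
    # Fall back to the nearest whole level for keys missing from the
    # table (e.g. "0.5"); this fallback is also an assumption.
    return reg2cl2.get(key, reg2cl2[str(float(round(score)))])

print(score_to_cefr(logits.item()))
```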
## Training Details
#### Training Hyperparameters
+ num_warmup_steps: int(0.1 * num_training_steps) (10% warmup)
+ num_train_epochs: 24.0
+ batch_size: 16
+ weight_decay: 0.01
+ adam_beta1: 0.9
+ adam_beta2: 0.99
+ adam_epsilon: 1e-8
+ max_grad_norm: 1.0
+ fp16: True
+ early_stopping: True
#### Learning rates
```python
# Define separate learning rates
lr_bert = 2e-5        # learning rate for the BERT layers
lr_classifier = 1e-3  # learning rate for the regression head
optimizer = torch.optim.AdamW([
    {"params": model.bert.parameters(), "lr": lr_bert},
    {"params": model.classifier.parameters(), "lr": lr_classifier},
    {"params": model.pre_classifier.parameters(), "lr": lr_classifier},
])
```
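For completeness, here is a minimal sketch of how the listed hyperparameters might fit around this optimizer. The linear warmup schedule, AMP usage, `train_dataloader`, and the early-stopping criterion are assumptions; the card does not publish the training loop. Note that `weight_decay`, the Adam betas, and epsilon from the list above would be passed when constructing the AdamW optimizer.
```python
from transformers import get_linear_schedule_with_warmup

num_train_epochs = 24
num_training_steps = len(train_dataloader) * num_train_epochs  # batch_size = 16 per step
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
scaler = torch.cuda.amp.GradScaler()  # fp16: True

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            loss, _ = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
            )
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # so clipping operates on unscaled gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # max_grad_norm
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
    # early_stopping: True -- stop when validation loss stops improving
    # (the exact patience/criterion is an assumption)
```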
## Evaluation on Test Set

## Citation
Please cite this repository when using the model.