---
license: cc0-1.0
language:
- fi
base_model:
- TurkuNLP/bert-base-finnish-uncased-v1
tags:
- difficulty
- cefr
- regression
- bert
---

# Text Difficulty Regression Model

A regression model that predicts a difficulty score for an input Finnish text. Predicted scores can be mapped to CEFR levels.

## Model Details

BERT encoder layers (TurkuNLP/bert-base-finnish-uncased-v1) with a small regression head on top. Trained on a mix of manually annotated datasets (more details on the data will follow) and data translated from Russian into Finnish.

## How to Get Started with the Model

Use the code below to get started with the model.

```
import torch
from torch import nn
from safetensors.torch import load_file
from transformers import AutoConfig, AutoTokenizer, BertModel, BertPreTrainedModel


class CustomModel(BertPreTrainedModel):
    """BERT encoder with a regression head (hidden_size -> 128 -> 1)."""

    def __init__(self, config, load_path=None, use_auth_token: str = None):
        super().__init__(config)
        self.bert = BertModel(config)
        self.pre_classifier = nn.Linear(config.hidden_size, 128)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(128, 1)
        self.activation = nn.ReLU()

        nn.init.kaiming_uniform_(self.pre_classifier.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.classifier.weight, nonlinearity='relu')
        if self.pre_classifier.bias is not None:
            nn.init.constant_(self.pre_classifier.bias, 0)
        if self.classifier.bias is not None:
            nn.init.constant_(self.classifier.bias, 0)

    def forward(
        self,
        input_ids,
        labels=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )

        # Regression head on top of the pooled [CLS] representation
        pooled_output = outputs.pooler_output
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = self.activation(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(logits.view(-1), labels.view(-1))
            return loss, logits
        else:
            return None, logits


# Inference
model_path = "path/to/this/model"  # local directory with config, tokenizer files and model.safetensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Code to load the custom fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
config.num_labels = 1

# Load your custom model
model = CustomModel(config)
state_dict = load_file(f'{model_path}/model.safetensors')
model.load_state_dict(state_dict)
model.to(device)
model.eval()

text = "Input text to score."  # replace with the text you want to score
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

with torch.no_grad():
    _, logits = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        token_type_ids=inputs["token_type_ids"],
    )
```

To map a predicted score to a CEFR level, use:

```
reg2cl2 = {
    "0.0": "A1", "1.0": "A1", "1.5": "A1-A2", "2.0": "A2",
    "2.5": "A2-B1", "3.0": "B1", "3.5": "B1-B2", "4.0": "B2",
    "4.5": "B2-C1", "5.0": "C1", "5.5": "C1-C2", "6.0": "C2"
}

# round() snaps the score to the nearest whole CEFR level; the half-step keys
# (e.g. "1.5" -> "A1-A2") cover scores rounded to the nearest 0.5 instead.
print("Predicted output (logits):", logits.item(), reg2cl2[str(float(round(logits.item())))])
```
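
For example, a hypothetical predicted score of 3.4 (an illustrative value, not a real model output) rounds to 3.0 and maps to B1:

```
score = 3.4  # hypothetical regression output, for illustration only
print(reg2cl2[str(float(round(score)))])  # -> "B1"
```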

## Training Details

#### Training Hyperparameters

+ num_warmup_steps: int(0.1 * num_training_steps) (see the scheduler sketch after the learning-rates snippet below)
+ num_train_epochs: 24.0
+ batch_size: 16
+ weight_decay: 0.01
+ adam_beta1: 0.9
+ adam_beta2: 0.99
+ adam_epsilon: 1e-8
+ max_grad_norm: 1.0
+ fp16: True
+ early_stopping: True

#### Learning rates

```
# Define separate learning rates
lr_bert = 2e-5        # Learning rate for BERT layers
lr_classifier = 1e-3  # Learning rate for the classifier

optimizer = torch.optim.AdamW([
    {"params": model.bert.parameters(), "lr": lr_bert},  # BERT layers
    {"params": model.classifier.parameters(), "lr": lr_classifier},
    {"params": model.pre_classifier.parameters(), "lr": lr_classifier},
])
```
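
Putting the pieces together, the warmup fraction, gradient clipping, and epoch count from the hyperparameters above can be combined with this optimizer roughly as follows. This is a minimal sketch, not the exact training script: `num_training_steps` and `train_dataloader` are assumed to be defined by the user, `get_linear_schedule_with_warmup` is one common warmup schedule (the schedule actually used is not documented here), and the fp16 and early-stopping logic are omitted.

```
from transformers import get_linear_schedule_with_warmup

# Warmup covers the first 10% of optimizer steps (num_warmup_steps above)
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

for epoch in range(24):  # num_train_epochs
    for batch in train_dataloader:
        loss, _ = model(**batch)  # batch holds input_ids, attention_mask, labels, ...
        loss.backward()
        # Clip gradients to max_grad_norm before each optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```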

## Evaluation on test set

![Pearson correlation on the test set](pearson_correlation_difficulty.png)

## Citation

Please cite this repository when using the model.