pbelcak
/

UltraFastBERT-1x11-long

Model card Files Files and versions

UltraFastBERT-1x11-long / config.json

pbelcak's picture

Upload 11 files

8c43fc6 almost 2 years ago

history blame contribute delete

1.6 kB

	{
	"arch": {
	"architectures": [
	"ScriptableCrammedBERT"
	],
	"attention": {
	"causal_attention": false,
	"dropout_prob": 0.1,
	"num_attention_heads": 12,
	"qkv_bias": false,
	"rotary_embedding": false,
	"seq_op_in_fp32": false,
	"sequence_op": "torch-softmax",
	"skip_output_projection": false,
	"type": "self-attention"
	},
	"classification_head": {
	"classifier_dropout": 0.1,
	"head_dim": 1024,
	"include_ff_layer": true,
	"nonlin": "Tanh",
	"pooler": "zero_index"
	},
	"decoder_bias": false,
	"embedding": {
	"dropout_prob": 0.1,
	"embedding_dim": 768,
	"max_seq_length": 128,
	"normalization": true,
	"pad_token_id": 0,
	"pos_embedding": "scaled-sinusoidal",
	"stable_low_precision": false,
	"vocab_size": 32768
	},
	"ffn_layer_frequency": 1,
	"final_norm": true,
	"hidden_dropout_prob": 0.1,
	"hidden_size": 768,
	"init": {
	"std": 0.02,
	"type": "normal"
	},
	"intermed_depth": 11,
	"intermed_size": 1,
	"intermed_type": "fff",
	"loss": "cross-entropy",
	"nonlin": "GELU",
	"norm": "LayerNorm",
	"norm_eps": 1e-12,
	"norm_scheme": "pre",
	"num_labels": null,
	"num_transformer_layers": 16,
	"objective_layout": "MLM",
	"skip_head_transform": true,
	"sparse_prediction": 0.25,
	"tie_weights": true,
	"use_bias": false
	},
	"architectures": [
	"ScriptableLMForPreTraining"
	],
	"model_type": "crammedBERT",
	"torch_dtype": "float32",
	"transformers_version": "4.34.0"
	}