ngpus: 4
type: aligned
gradient_accumulation_steps: 2
tokenizer:
  tokens: 50257  # GPT-2 vocabulary size
  model: gpt2
training:
  batch_size: 128
  accum: ${gradient_accumulation_steps}  # OmegaConf-style interpolation of the top-level value
  n_iters: 1250000
  snapshot_freq: 10000
  log_freq: 500
  eval_freq: 10000
  snapshot_freq_for_preemption: 3000
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
  loss_type: hybrid
  epsilon: 0.0
  lambda: 0.0
data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false
graph:
  type: absorb
  gamma: 1.0
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true
noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}
sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9
annealing:
  type: none
  efficient: false
  width: 1024
  tau: 1024
  eval_tau: 1024
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: sdlm
  diffusion_loss_weight: 1.0
  ce_loss_weight: 1.0
  sampling_eps: 0.0001
attention:
  context_type: block_causal
  block_type: full
  match_inference: false
eval:
  batch_size: 16
  perplexity: true
  perplexity_batch_size: 8
optim:
  weight_decay: 0.1
  optimizer: AdamW
  lr: 0.0002
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: cosine
experiment:
  name: MDLM
  wandb_project: Hybrid-SDLM-ALIGNED
model:
  name: HDLM
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  dropout: 0.1
  scale_by_sigma: false
  transformer_sigma_conditioning: false
  hybrid_sigma_embedding: false
  post_process_logits: false
  use_timestep_embedding: false
  model_type: epsilon_hybrid