# Name of this experiment configuration.
name: PDE
environment:
    image:
        # Image reference omitted for double-blind review. The key is an
        # explicit null until a real GPU image is filled in.
        gpu: null
hyperparameters:
    # Dataset and pretrained-weight selection.
    # NOTE(review): `weight` + `size` presumably pick RoBERTa-large -- confirm.
    dataset: PDE
    weight: roberta
    size: large
    add_text: cat
    # Explicit null rather than a bare empty value, so the missing setting is
    # visibly intentional.
    # NOTE(review): null presumably means "no subset filter" -- confirm
    # against the data loader.
    pde_subset: null
    maxsamples: 128
    target_seq_len: 128

    # Run label. It echoes `size` (Large), `add_text` (cat) and `batch_size`
    # (bs32), so update it whenever those settings change.
    experiment_id: Large-noRD-cat-bs32
    seed: 0
    # Epoch budgets.
    # NOTE(review): how `epochs`, `embedder_epochs` and `predictor_epochs`
    # interact is decided by the entrypoint -- confirm there.
    epochs: 100
    embedder_epochs: 20
    predictor_epochs: 0
    finetune_method: all
    drop_out: 0

    # Batch sizes for training and evaluation, plus related step settings.
    batch_size: 32
    eval_batch_size: 1000
    # NOTE(review): presumably the number of gradient-accumulation steps --
    # confirm.
    accum: 1
    # NOTE(review): -1 presumably disables gradient clipping -- confirm.
    clip: -1
    validation_freq: 1

    # Optimizer choice and its parameters.
    optimizer:
        name: Adam
        params:
            lr: 0.00005
            betas: [0.9, 0.98]
            weight_decay: 0.00001
            # NOTE(review): Adam is normally configured through `betas`;
            # `momentum` is presumably unused for this optimizer -- confirm
            # against the optimizer factory before removing it.
            momentum: 0.99

    # Learning-rate schedule named WarmupLR and its parameters.
    scheduler:
        name: WarmupLR
        params:
            warmup_epochs: 5
            decay_epochs: 100
            # NOTE(review): presumably milestone epochs at which the rate is
            # scaled by `base` -- confirm.
            sched: [20, 40, 50]
            base: 0.2

    # Alternative StepLR schedule.
    # NOTE(review): it still carries `warmup_epochs` despite the "no_warmup"
    # name, and its last `sched` entry (120) exceeds `epochs: 100` -- confirm
    # both are intended.
    no_warmup_scheduler:
        name: StepLR
        params:
            warmup_epochs: 10
            decay_epochs: 100
            sched: [40, 80, 120]
            base: 0.2

    # NOTE(review): presumably the data-loader worker count -- confirm.
    num_workers: 4
    # Boolean switches, written in canonical lowercase form so every YAML
    # loader and linter reads them the same way.
    reproducibility: false
    valid_split: false

# Minimum validation period, expressed in epochs.
min_validation_period:
    epochs: 1
# Host directories exposed inside the container. Both mounts use host /tmp,
# so /data and /root/.cache share the same host directory.
# NOTE(review): /tmp is usually ephemeral -- confirm that losing datasets and
# caches between runs is acceptable.
bind_mounts:
    - host_path: /tmp
      container_path: /data
    - host_path: /tmp
      container_path: /root/.cache
# One slot per trial.
resources:
    slots_per_trial: 1
# Number of records that make up one epoch.
# NOTE(review): 1562 is presumably derived from dataset size and batch size
# -- confirm it still matches `batch_size: 32`.
records_per_epoch: 1562
# Single-trial search that minimises the 'val score' metric.
# NOTE(review): `max_length` is 1 epoch while `hyperparameters.epochs` is 100
# -- confirm which of the two the entrypoint actually honours.
searcher:
    name: single
    metric: 'val score'
    smaller_is_better: true
    max_length:
        epochs: 1
# Restart budget is zero.
max_restarts: 0
# Command used to start the run.
entrypoint: python3 main.py