# Launch-level settings for this example run.
# Path of the config this file names as its parent.
parent: research/conditional/train/configs/baselines/gpt/dense/base.yaml
# NOTE(review): looks like the MD5 digest of the parent file above — confirm
# how/where the loader checks it. Quoted so it always stays a string.
md5_parent_hash: '7973e22f5f019acdc658ba4aa5bd4d39'
interactive_debug: true
# Two device indices are listed here, matching n_gpus below.
cuda_visible: '0,1'
n_gpus: 2
# Quoted so the colon-separated value is kept as a string.
time: '26:00:00'
# Parameters of the run. How they are merged with the parent config and passed
# on is decided by the config loader, which is not visible in this file.
params:
    # Sharding, precision and memory-related switches.
    fsdp_enabled: true
    mixed_precision: true
    mixed_precision_dtype: bfloat16
    flash_attention: true
    # The same three module names are listed for both FSDP wrapping and
    # activation checkpointing.
    fsdp_modules_to_wrap: "TransformerBlock,EmbeddingLayer,PredictionHead"
    activation_checkpointing_modules: "TransformerBlock,EmbeddingLayer,PredictionHead"
    gradient_accumulation_steps: 2

    ff_mode: vanilla

    dataset_type: wikibook
    n_steps: 50
    # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
    # read a bare `5e-4` as the string "5e-4", not as a float.
    learning_rate: 5.0e-4
    batch_size: 512
    name: fsdp_complete_example
    tags: [example]
    # NOTE(review): 0 presumably disables decoding and weight saving — confirm.
    decoding_interval: 0
    save_weights_interval: 0
    # NOTE(review): both intervals (1000) are larger than n_steps (50), so they
    # presumably never fire during this short example — confirm this is intended.
    logging_interval_heavy: 1000
    eval_interval: 1000
    init_type: truncated_normal
    init_scale: 1.0
