# the name or path of the model to use; should be something like
#   gpt2-xl or gpt-neo-2.7B or huggyllama/llama-7b
name_or_path: allenai/OLMoE-1B-7B-0924

# the name or path of the tokenizer to use; if null, the tokenizer from the
#   model is used instead
tokenizer_name_or_path: null

# optional override for the pre-trained weights (e.g., weights from SFT);
#   left as null in this file, i.e. no override is given
archive: null

# the name of the module class to wrap with FSDP,
#   e.g. GPT2Block, GPTNeoXLayer, or LlamaDecoderLayer
block_name: OlmoeDecoderLayer

# the dtype of the policy's parameters and optimizer state
policy_dtype: float32

# the mixed-precision dtype to use with FSDP; defaults to the same dtype as
#   the policy (NOTE: presumably applies when null; confirm in the loader)
fsdp_policy_mp: bfloat16

# the dtype for the reference model, which is used for inference only
reference_dtype: bfloat16
