# the name (or path) of the model to use, e.g. gpt2-xl, gpt-neo-2.7B,
#   or huggyllama/llama-7b
name_or_path: mistralai/Mixtral-8x7B-v0.1

# the name (or path) of the tokenizer to use; when null, the tokenizer that
#   belongs to the model is used instead
tokenizer_name_or_path: null

# optional archive of weights that overrides the pre-trained weights
#   (e.g., weights produced by SFT); leave null for no override
archive: null

# the class name of the module to wrap with FSDP,
#   e.g. GPT2Block, GPTNeoXLayer, LlamaDecoderLayer
block_name: MixtralDecoderLayer

# the dtype used for the policy's parameters and optimizer state
policy_dtype: bfloat16

# the mixed-precision dtype for the policy when using FSDP; when null,
#   defaults to the same dtype as the policy (see policy_dtype)
fsdp_policy_mp: null

# the dtype for the reference model, which is used for inference only
# NOTE(review): float16 here differs from policy_dtype (bfloat16) -- confirm
#   the mismatch is intentional
reference_dtype: float16
