
### model
# Base model location (absolute local path).
model_name_or_path: /home/techdata-sjgx-nryf/huangxiang73/huggingface/Llama-3-8B-Instruct
use_fast_tokenizer: false

### special tokens
# Eight comma-separated tokens, matching special_tokens_num. The list must not
# end with a comma: a consumer that splits the string on "," would otherwise
# see an empty ninth entry.
special_tokens_num: 8
new_special_tokens: '<emb_0>,<emb_1>,<emb_2>,<emb_3>,<emb_4>,<emb_5>,<emb_6>,<emb_7>'
use_same_special: false

### attention / freezing / loss switches
# NOTE(review): these keys are project-specific; their exact semantics are not
# visible from this file - confirm against the training code before changing.
if_block_diag_attn: false
add_teacher_cl: false
if_freeze_layer: true
if_freeze_teacher: true
main_loss_type: MSE
# Leftover fragment, not parsed: presumably the tail of a longer new_special_tokens list (<emb_1> through <emb_63>), kept for reference.
#,<emb_1>,<emb_2>,<emb_3>,<emb_4>,<emb_5>,<emb_6>,<emb_7>,<emb_8>,<emb_9>,<emb_10>,<emb_11>,<emb_12>,<emb_13>,<emb_14>,<emb_15>,<emb_16>,<emb_17>,<emb_18>,<emb_19>,<emb_20>,<emb_21>,<emb_22>,<emb_23>,<emb_24>,<emb_25>,<emb_26>,<emb_27>,<emb_28>,<emb_29>,<emb_30>,<emb_31>,<emb_32>,<emb_33>,<emb_34>,<emb_35>,<emb_36>,<emb_37>,<emb_38>,<emb_39>,<emb_40>,<emb_41>,<emb_42>,<emb_43>,<emb_44>,<emb_45>,<emb_46>,<emb_47>,<emb_48>,<emb_49>,<emb_50>,<emb_51>,<emb_52>,<emb_53>,<emb_54>,<emb_55>,<emb_56>,<emb_57>,<emb_58>,<emb_59>,<emb_60>,<emb_61>,<emb_62>,<emb_63>


### method
do_train: true
# Use the "mtp" training pipeline.
stage: mtp
finetuning_type: full
# DeepSpeed JSON config, given as a relative path.
deepspeed: examples/deepspeed/ds_z2_config.json

### dataset
# dataset_dir is an absolute path; dataset presumably names an entry
# registered under that directory - TODO confirm.
dataset_dir: /home/techdata-sjgx-nryf/gaotianhao/embedding/LLaMA-Factory-main/data
dataset: msmarco_instAligner
template: llama3
cutoff_len: 512
preprocessing_num_workers: 8
overwrite_cache: true

### output
# Author's run notes, kept verbatim: V1CLloss_noloss3.  NoattentionMask
# NOTE(review): the directory name below contains "lr2e-5", while
# learning_rate in this file is 5.0e-6 - confirm which one is intended.
output_dir: /media/cfs/gaotianhao1/embedding/fuxian/llama3-8b-Instruct-lr2e-5-1epoch-8Special-MSEloss-msmarco80k-leftShift-returnAttn-bz8gas1-512maxlen
overwrite_output_dir: true
save_steps: 500
save_total_limit: 4
logging_steps: 1
plot_loss: true

### train
# Batch sizing.
per_device_train_batch_size: 8
gradient_accumulation_steps: 1

# Optimisation schedule.
learning_rate: 5.0e-6
num_train_epochs: 1
lr_scheduler_type: cosine
warmup_ratio: 0.05
max_grad_norm: 1.0

# Precision, distributed timeout and logging backend.
bf16: true
ddp_timeout: 180000000
report_to: tensorboard

# plot_loss is deliberately not repeated in this section: it is already set
# under "### output", and a second occurrence would be a duplicate mapping
# key, which is invalid YAML and rejected by strict loaders.

# resume_from_checkpoint: /media/cfs/gaotianhao1/embedding/outputs/llama3-8b-Instruct-lr5e-6-1epoch-multiSpecialTokenFull-8special-MSE+(4views-teacherAblation)loss-inbedder200k-freezeTeacher+24layers-returnAttn-bz8gas6/checkpoint-200

### eval
# metric_for_best_model: eval_loss
# load_best_model_at_end: true
# greater_is_better: false
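# NOTE: save_total_limit is already set under "### output"; remove one copy before enabling the line below.
# save_total_limit: 2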
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 100
