### 1. Replace with your own file or folder path ###
# BASE_PATH must be the root of this repository. Export it before running this
# script, or uncomment the line below and fill in the path.
# BASE_PATH="[PATH_to_THIS_REPO]"

# Fail fast with a clear message: with BASE_PATH unset the `cd` below would
# target `/data_construction`, and the relative paths used afterwards would be
# resolved from whatever directory the script was started in.
: "${BASE_PATH:?BASE_PATH is not set; set it to the root of this repository}"

# Every relative path below (./data, ./template, ./1_generate_pairs.py) is
# resolved from the data_construction directory, so a failed cd is fatal.
cd "${BASE_PATH}/data_construction" || exit 1

# VLLM configuration
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export CUDA_VISIBLE_DEVICES=0,1,2,3
# Passed as --tensor_parallel_size in stage 1; keep it in sync with the number
# of devices listed in CUDA_VISIBLE_DEVICES.
GPU_NUMS=4

# data
DATA_DIR="./data"
RAW_DATA_DIR="${DATA_DIR}/my_qa/raw"
PROCESSING_DATA_DIR="${DATA_DIR}/my_qa/processing"
LLAMA_FACTORY_DATA_PATH="${DATA_DIR}/llamafactory_format"
# Create the output directories up front; stop if they cannot be created.
mkdir -p "${PROCESSING_DATA_DIR}" || exit 1
mkdir -p "${LLAMA_FACTORY_DATA_PATH}" || exit 1


### 2. Replace with your own file or folder path ###
# Checkpoint folder name and the directory that contains it. Stage 1 passes
# "${MODEL_DIR_PATH}/${MODEL_NAME}" as --model_name_or_path, and MODEL_NAME is
# also embedded in the stage output file names.
MODEL_NAME='Qwen2.5-7B-Instruct-wo-yarn'
MODEL_DIR_PATH='[PATH_TO_YOUR_MODEL]'

### 3. Replace with your own file or folder path ###
# Example value:
# RAW_MUSIQUE_TRAIN_FILE_PATH="${RAW_DATA_DIR}/musique_ans_v1.0_train_example.jsonl"
RAW_MUSIQUE_TRAIN_FILE_PATH='[PATH_TO_YOUR_MUSIQUE]'

# Long context
# Base name (without ".jsonl") of the input data file for stage 1.
# You can directly use the long text data synthesized in the previous stage,
# for example ./data_construction/data/my_qa/processing/8k/musique_short_context_example_infer_with_all_text_by_Qwen2.5-7B-Instruct-wo-yarn_all_EM_selector_success_raw.jsonl.
SHORT_DATA_FILE_NAME='8k_musique_reallong_context_example'
# Target length; used as "${TGT_LENGTH}k" for --length_list in the
# format-conversion step.
TGT_LENGTH='8'

########### 1. Sampling based on long context ###########
# Runs 1_generate_pairs.py on "${RAW_DATA_DIR}/${TEST_DATA_FILE_NAME}.jsonl"
# and saves the results under PROCESSING_DATA_DIR.
TEST_DATA_FILE_NAME="${SHORT_DATA_FILE_NAME}"
TEMPLATE_PATH="./template/cot.json"
# NOTE: the variable name INFEE_MODE (sic) is kept because later stages build
# their file names from it.
INFEE_MODE="all_text"
TEMPERATURE=0.85

# All expansions are quoted so that paths containing spaces survive intact.
# Abort on failure: the following stages read this stage's output file and
# would otherwise run against a missing or stale file.
python ./1_generate_pairs.py \
    --model_name_or_path "${MODEL_DIR_PATH}/${MODEL_NAME}" \
    --test_dataset_path "${RAW_DATA_DIR}/${TEST_DATA_FILE_NAME}.jsonl" \
    --tgt_save_dir_path "${PROCESSING_DATA_DIR}" \
    --template_path "${TEMPLATE_PATH}" \
    --inference_mode "${INFEE_MODE}" \
    --tensor_parallel_size "${GPU_NUMS}" \
    --temperature "${TEMPERATURE}" || {
    echo "Stage 1 (1_generate_pairs.py) failed; aborting." >&2
    exit 1
}

# Default output file name
STAGE_1_OUTPUT_FILE_NAME="${TEST_DATA_FILE_NAME}_infer_with_${INFEE_MODE}_by_${MODEL_NAME}.jsonl"
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing`.


############ 2. Preference pair selection ###########
# Runs 2_select_preference_pairs.py on the stage 1 output file located in
# PROCESSING_DATA_DIR, using SELECTOR as the evaluation metric.

SELECTOR="EM"
# The last argument line deliberately has no trailing backslash: a dangling
# continuation would swallow whatever non-blank line came next as extra
# arguments. Abort on failure so the conversion step never runs without the
# selected pairs.
python "${BASE_PATH}/data_construction/2_select_preference_pairs.py" \
    --src_data_dir_path "${PROCESSING_DATA_DIR}" \
    --src_file_name "${STAGE_1_OUTPUT_FILE_NAME}" \
    --evaluation_metric "${SELECTOR}" || {
    echo "Stage 2 (2_select_preference_pairs.py) failed; aborting." >&2
    exit 1
}

# Default output file name
STAGE_2_OUTPUT_FILE_NAME="${TEST_DATA_FILE_NAME}_infer_with_${INFEE_MODE}_by_${MODEL_NAME}_all_${SELECTOR}_selector_success.jsonl"
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing`.


# # Default output file name
# # You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing/8k`.
############ 3. Convert to Llama Factory format ###########
# Runs 5_convert_format.py (third step of this script, despite the "5_" file
# prefix) on the stage 2 output file and writes into LLAMA_FACTORY_DATA_PATH.
# Expansions are quoted so that paths containing spaces are passed as a single
# argument. This is the last command, so its exit status is the script's.
python "${BASE_PATH}/data_construction/5_convert_format.py" \
    --src_data_file_name "${STAGE_2_OUTPUT_FILE_NAME}" \
    --src_data_dir_path "${PROCESSING_DATA_DIR}" \
    --tgt_save_data_dir_path "${LLAMA_FACTORY_DATA_PATH}" \
    --template_path "${TEMPLATE_PATH}" \
    --length_list "${TGT_LENGTH}k" \
    --convert_type "reallong"
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/llamafactory_format`.