### 1. Replace with the path of your local checkout of this repository ###
BASE_PATH="[PATH_to_THIS_REPO]"

# The rest of this script uses paths relative to data_construction
# (e.g. ./data, ./template/cot.json), so abort instead of running every
# stage from the wrong directory when the path is invalid. The expansion is
# quoted so a path containing spaces or glob characters is used verbatim.
cd "${BASE_PATH}/data_construction" || {
    echo "ERROR: cannot cd to '${BASE_PATH}/data_construction'; set BASE_PATH at the top of this script." >&2
    exit 1
}

# vLLM runtime configuration, exported so the Python stages inherit it.
# NOTE(review): see the vLLM environment-variable documentation for the exact
# effect of the two VLLM_* switches below.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
# Comma-separated list of the GPUs this pipeline may use.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# GPU_NUMS is passed as --tensor_parallel_size in stage 1. Derive it from the
# device list above so the two settings cannot drift apart when one is edited.
IFS=',' read -r -a _VISIBLE_GPU_IDS <<< "${CUDA_VISIBLE_DEVICES}"
GPU_NUMS=${#_VISIBLE_GPU_IDS[@]}

# Data layout. All locations are relative to the current working directory.
#   RAW_DATA_DIR            - input / short-context data
#   PROCESSING_DATA_DIR     - intermediate outputs of the pipeline stages
#   LLAMA_FACTORY_DATA_PATH - final data in Llama Factory format
DATA_DIR='./data'
RAW_DATA_DIR="${DATA_DIR}/my_qa/raw"
PROCESSING_DATA_DIR="${DATA_DIR}/my_qa/processing"
LLAMA_FACTORY_DATA_PATH="${DATA_DIR}/llamafactory_format"
# Create both output directories (and any missing parents) up front.
mkdir -p "${PROCESSING_DATA_DIR}" "${LLAMA_FACTORY_DATA_PATH}"


### 2. Replace with the directory that contains your model folder ###
# The model is loaded from "${MODEL_DIR_PATH}/${MODEL_NAME}"; MODEL_NAME is
# also embedded in the names of the intermediate output files.
MODEL_DIR_PATH='[PATH_TO_YOUR_MODEL]'
MODEL_NAME='Qwen2.5-7B-Instruct-wo-yarn'

### 3. Replace with the path of your MuSiQue training file ###
RAW_MUSIQUE_TRAIN_FILE_PATH='[PATH_TO_YOUR_MUSIQUE]'
# Alternative: the example file under the raw data directory.
# RAW_MUSIQUE_TRAIN_FILE_PATH="${RAW_DATA_DIR}/musique_ans_v1.0_train_example.jsonl"

# Target long-context lengths, in units of 1024 tokens (8 -> 8k).
TGT_LENGTH_LIST=( 8 )

# Base name (without the .jsonl extension) of the short-context dataset.
SHORT_DATA_FILE_NAME='musique_short_context_example'
########### 0. Create Short Context dataset ###########
# Since the supporting texts in the MuSiQue dataset are relatively short,
# we still need to randomly add some irrelevant texts to expand the average length to 1K.
#
# Every path is quoted: RAW_MUSIQUE_TRAIN_FILE_PATH is user-supplied and may
# contain spaces or glob characters. The script stops if this stage fails,
# because every later stage depends on its output.
python ./0_create_short_context_data.py \
    --tokenizer_path "${MODEL_DIR_PATH}/${MODEL_NAME}" \
    --src_data_path "${RAW_MUSIQUE_TRAIN_FILE_PATH}" \
    --tgt_save_path "${RAW_DATA_DIR}/${SHORT_DATA_FILE_NAME}.jsonl" \
    || { echo "ERROR: stage 0 (create short-context dataset) failed; aborting." >&2; exit 1; }

# You will find file `musique_short_context_example.jsonl` in path `code_for_SoLoPO/data_construction/data/my_qa/raw`.


########### 1. Sampling based on short context ###########
# Runs 1_generate_pairs.py on the short-context dataset produced by stage 0.
TEST_DATA_FILE_NAME=${SHORT_DATA_FILE_NAME}
TEMPLATE_PATH="./template/cot.json"
# NOTE: INFEE_MODE keeps its original spelling because later stages use this
# variable when building file names.
INFEE_MODE="all_text"
TEMPERATURE=0.85

# Stop on failure: stages 2-5 all consume the file written here, so carrying
# on after a failed sampling run only produces misleading follow-up errors.
python ./1_generate_pairs.py \
    --model_name_or_path "${MODEL_DIR_PATH}/${MODEL_NAME}" \
    --test_dataset_path "${RAW_DATA_DIR}/${TEST_DATA_FILE_NAME}.jsonl" \
    --tgt_save_dir_path "${PROCESSING_DATA_DIR}" \
    --template_path "${TEMPLATE_PATH}" \
    --inference_mode "${INFEE_MODE}" \
    --tensor_parallel_size "${GPU_NUMS}" \
    --temperature "${TEMPERATURE}" \
    || { echo "ERROR: stage 1 (sampling on short context) failed; aborting." >&2; exit 1; }

# Default output file name
STAGE_1_OUTPUT_FILE_NAME="${TEST_DATA_FILE_NAME}_infer_with_${INFEE_MODE}_by_${MODEL_NAME}.jsonl"
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing`.


############ 2. Preference pair selection ###########
# Runs 2_select_preference_pairs.py on the stage-1 output, using SELECTOR as
# the evaluation metric.
SELECTOR="EM"
# BASE_PATH is user-supplied, so the script path is quoted. The original
# command ended in a dangling line continuation; it is replaced here by an
# explicit failure check.
python "${BASE_PATH}/data_construction/2_select_preference_pairs.py" \
    --src_data_dir_path "${PROCESSING_DATA_DIR}" \
    --src_file_name "${STAGE_1_OUTPUT_FILE_NAME}" \
    --evaluation_metric "${SELECTOR}" \
    || { echo "ERROR: stage 2 (preference pair selection) failed; aborting." >&2; exit 1; }

# Default output file name
STAGE_2_OUTPUT_FILE_NAME="${TEST_DATA_FILE_NAME}_infer_with_${INFEE_MODE}_by_${MODEL_NAME}_all_${SELECTOR}_selector_success.jsonl"
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing`.


############ 3. Synthesize long context data ###########
# 3.1 Match the original MuSiQue data (containing more irrelevant text) for the data which successfully obtained preference pairs to synthesize long contexts
REF_FILE_PATH="${PROCESSING_DATA_DIR}/${STAGE_2_OUTPUT_FILE_NAME}"
SRC_FILE_PATH=${RAW_MUSIQUE_TRAIN_FILE_PATH}

# Both BASE_PATH and SRC_FILE_PATH come from user-edited settings, so they are
# quoted to survive spaces and glob characters. Abort on failure because
# stage 3.2 counts the lines of the file written here.
python "${BASE_PATH}/data_construction/3_1_prepare_raw_data.py" \
    --ref_file_path "${REF_FILE_PATH}" \
    --src_file_path "${SRC_FILE_PATH}" \
    || { echo "ERROR: stage 3.1 (prepare raw data) failed; aborting." >&2; exit 1; }

# Default output file name
STAGE_3_1_OUTPUT_FILE_NAME="${TEST_DATA_FILE_NAME}_infer_with_${INFEE_MODE}_by_${MODEL_NAME}_all_${SELECTOR}_selector_success_raw.jsonl"
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing`.

# 3.2 Synthesize long-context data based on the approach of RULER
# The number of samples to generate equals the number of lines in the
# stage-3.1 output. Verify that file exists first: otherwise NUM_SANMPLES
# would be empty and `--num_samples` would be passed without a value.
STAGE_3_1_OUTPUT_FILE_PATH="${PROCESSING_DATA_DIR}/${STAGE_3_1_OUTPUT_FILE_NAME}"
if [[ ! -f "${STAGE_3_1_OUTPUT_FILE_PATH}" ]]; then
    echo "ERROR: stage 3.1 output not found: ${STAGE_3_1_OUTPUT_FILE_PATH}" >&2
    exit 1
fi
# NOTE: the variable keeps its original (misspelled) name for compatibility.
NUM_SANMPLES=$(wc -l < "${STAGE_3_1_OUTPUT_FILE_PATH}") || {
    echo "ERROR: cannot count lines in ${STAGE_3_1_OUTPUT_FILE_PATH}" >&2
    exit 1
}
# Some `wc` implementations pad the count with blanks; strip them so the
# value stays a bare integer now that the expansion below is quoted.
NUM_SANMPLES=${NUM_SANMPLES//[[:space:]]/}
# echo "Total samples: $NUM_SANMPLES"

for TGT_LENGTH in "${TGT_LENGTH_LIST[@]}"; do
    # TGT_LENGTH is expressed in units of 1024 tokens (8 -> 8192).
    MAX_LENGTH=$((TGT_LENGTH * 1024))
    SAVE_PATH="${PROCESSING_DATA_DIR}/${TGT_LENGTH}k"
    python "${BASE_PATH}/data_construction/3_2_expand_context.py" \
        --save_dir "${SAVE_PATH}" \
        --save_name "${STAGE_3_1_OUTPUT_FILE_NAME}" \
        --tokenizer_path "${MODEL_DIR_PATH}/${MODEL_NAME}" \
        --tokenizer_type "hf" \
        --max_seq_length "${MAX_LENGTH}" \
        --tokens_to_generate 32 \
        --num_samples "${NUM_SANMPLES}" \
        --src_data_dir_path "${PROCESSING_DATA_DIR}" \
        --dataset "${STAGE_3_1_OUTPUT_FILE_NAME}" \
        --template="Write an accurate and concise answer for the given question using only the provided search results (some of which might be irrelevant). Start with an accurate, engaging, and concise explanation based only on the provided documents. Must end with 'The answer is:'. Use an unbiased and journalistic tone. If the question cannot be answered, end with 'The answer is: No answer'.\n\n{context}\n\nQuestion: {query}  Think step-by-step.\n\nAnswer:" \
        || { echo "ERROR: stage 3.2 (expand context to ${TGT_LENGTH}k) failed; aborting." >&2; exit 1; }
done
# Default output file name: ${STAGE_3_1_OUTPUT_FILE_NAME}
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing/8k`.

############ 4. Create Short-to-Long Datasets ###########
# For every target length, runs 4_create_short2long_samples.py with the
# long-context data from stage 3.2 and the short-context preference pairs
# from stage 2. BASE_PATH is user-supplied, hence the quoting; a failure
# aborts the script because stage 5 converts the files written here.
for TGT_LENGTH in "${TGT_LENGTH_LIST[@]}"; do
    python "${BASE_PATH}/data_construction/4_create_short2long_samples.py" \
        --long_data_dir "${PROCESSING_DATA_DIR}" \
        --long_data_name "${STAGE_3_1_OUTPUT_FILE_NAME}" \
        --short_data_path "${PROCESSING_DATA_DIR}/${STAGE_2_OUTPUT_FILE_NAME}" \
        --long_text_length "${TGT_LENGTH}k" \
        || { echo "ERROR: stage 4 (create short-to-long samples, ${TGT_LENGTH}k) failed; aborting." >&2; exit 1; }
done

# Default output file name
STAGE_4_OUTPUT_FILE_NAME="supporting_random_shor2long_${STAGE_3_1_OUTPUT_FILE_NAME}"
# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/processing/8k`.
############ 5. Convert to Llama Factory format ###########
# For every target length, runs 5_convert_format.py on the stage-4 output and
# writes the result to LLAMA_FACTORY_DATA_PATH. BASE_PATH is user-supplied,
# hence the quoting; a failed conversion stops the script with a non-zero
# exit status instead of being silently ignored.
for TGT_LENGTH in "${TGT_LENGTH_LIST[@]}"; do
    python "${BASE_PATH}/data_construction/5_convert_format.py" \
        --src_data_file_name "${STAGE_4_OUTPUT_FILE_NAME}" \
        --src_data_dir_path "${PROCESSING_DATA_DIR}" \
        --tgt_save_data_dir_path "${LLAMA_FACTORY_DATA_PATH}" \
        --template_path "${TEMPLATE_PATH}" \
        --length_list "${TGT_LENGTH}k" \
        || { echo "ERROR: stage 5 (convert to Llama Factory format, ${TGT_LENGTH}k) failed; aborting." >&2; exit 1; }
done

# You will find this file in path `code_for_SoLoPO/data_construction/data/my_qa/llamafactory_format`.