#!/bin/bash
#
# Data-preparation driver: splits the ShareGPT dataset, then reformats the raw
# datasets into the processed directory.
#
# Environment:
#   MODELS_PATH                   Directory that contains meta_llama_Llama_2_7b_hf;
#                                 the joined path is passed to the split script
#                                 as --model-name-or-path.
#   RESEARCH_INTERACTIVE_TOOLKIT  Selects which data root is used (see below).
#
# The Python scripts are referenced by relative path, so run this from the
# directory that contains scripts/.

# Stop at the first failing step so the reformat stage never runs on a missing
# or stale ShareGPT split; also treat unset variables as errors.
set -euo pipefail

# Take the models directory from the caller's environment instead of always
# overwriting it with an empty value.
MODELS_PATH=${MODELS_PATH:-}
if [[ -z "$MODELS_PATH" ]]; then
    # Warn rather than abort: an empty value was the previous default and
    # resolves to a path directly under the filesystem root.
    echo "Warning: MODELS_PATH is empty; model path resolves to /meta_llama_Llama_2_7b_hf" >&2
fi

# Choose the data root depending on whether RESEARCH_INTERACTIVE_TOOLKIT is set.
# NOTE(review): this first branch runs when the variable is unset or empty, yet
# it was previously labelled "In toolkit" -- confirm which path belongs to which
# environment.
if [[ -z "${RESEARCH_INTERACTIVE_TOOLKIT:-}" ]]; then
    DATA_PATH=/datadrive/data
else
    DATA_PATH=/mnt/cl_llm_data/datasets/open_instruct
fi

echo "Splitting the ShareGPT dataset..."
python scripts/split_sharegpt_conversations.py \
    --in-files "${DATA_PATH}/raw_train/sharegpt/sg_90k_part1_html_cleaned.json" "${DATA_PATH}/raw_train/sharegpt/sg_90k_part2_html_cleaned.json" \
    --out-file "${DATA_PATH}/raw_train/sharegpt/sharegpt_html_cleaned_and_split.json" \
    --model-name-or-path "${MODELS_PATH}/meta_llama_Llama_2_7b_hf"

echo "Reformatting the datasets..."
python scripts/reformat_datasets.py --raw_data_dir "${DATA_PATH}/raw_train/" --output_dir "${DATA_PATH}/processed/"
