#!/bin/bash

<<<<<<< Updated upstream
# Check if env var is set
if [ -z "$RESEARCH_INTERACTIVE_TOOLKIT" ]; then
    # In toolkit
    DATA_PATH=/datadrive/data
else
    DATA_PATH=/mnt/cl_llm_data/datasets/open_instruct
fi

echo "Downloading the OpenAssistant data (oasst1)..."
wget -P ${DATA_PATH}/raw_train/oasst1/ https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.trees.jsonl.gz
gzip -d ${DATA_PATH}/raw_train/oasst1/2023-04-12_oasst_ready.trees.jsonl.gz
=======
DATA_PATH=/datadrive/data
>>>>>>> Stashed changes

echo "Downloading Super-NaturalInstructions dataset..."
wget -P ${DATA_PATH}/raw_train/super_ni/ https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
unzip ${DATA_PATH}/raw_train/super_ni/master.zip -d ${DATA_PATH}/raw_train/super_ni/ && rm data/raw_train/super_ni/master.zip
mv ${DATA_PATH}/raw_train/super_ni/natural-instructions-master/* ${DATA_PATH}/raw_train/super_ni/ && rm -r ${DATA_PATH}/raw_train/super_ni/natural-instructions-master

echo "Downloading the flan_v2 chain-of-thought submix..."
wget -P ${DATA_PATH}/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ52K2Q932H6KZY499A7FE8/files/cot_zsopt.jsonl
wget -P ${DATA_PATH}/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ51ZV283RAZW7J3ECM4S58/files/cot_fsopt.jsonl

echo "Downloading the flan_v2 collection, here we subsampled only 100K instances..."
wget -P ${DATA_PATH}/raw_train/flan_v2/ https://beaker.org/api/v3/datasets/01GZTTS2EJFPA83PXS4FQCS1SA/files/flan_v2_resampled_100k.jsonl

echo "Downloading self-instruct data..."
wget -P ${DATA_PATH}/raw_train/self_instruct/ https://raw.githubusercontent.com/yizhongw/self-instruct/main/data/gpt3_generations/batch_221203/all_instances_82K.jsonl

echo "Downloading unnatural-instructions data..."
wget -P ${DATA_PATH}/raw_train/unnatural_instructions/ https://github.com/orhonovich/unnatural-instructions/raw/main/data/core_data.zip
unzip ${DATA_PATH}/raw_train/unnatural_instructions/core_data.zip -d ${DATA_PATH}/raw_train/unnatural_instructions/

echo "Downloading Stanford alpaca data..."
wget -P ${DATA_PATH}/raw_train/stanford_alpaca/ https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json

echo "Downloading the dolly dataset..."
wget -P ${DATA_PATH}/raw_train/dolly/ https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl


echo "Downloading the code alpaca dataset..."
wget -P ${DATA_PATH}/raw_train/code_alpaca/ https://github.com/sahil280114/codealpaca/raw/master/data/code_alpaca_20k.json

echo "Downloading the gpt4-llm dataset..."
wget -P ${DATA_PATH}/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data.json
wget -P ${DATA_PATH}/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data_zh.json

echo "Downloading the baize dataset..."
wget -P ${DATA_PATH}/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/alpaca_chat_data.json
wget -P ${DATA_PATH}/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/medical_chat_data.json
wget -P ${DATA_PATH}/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/quora_chat_data.json
wget -P ${DATA_PATH}/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/stackoverflow_chat_data.json

echo "Downloading ShareGPT dataset..."
wget -P ${DATA_PATH}/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
wget -P ${DATA_PATH}/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json
