#!/bin/bash

# Script to align gpt2-base dataset with gpt2-xl dataset order
# This ensures both datasets have the same sample ordering for proper comparison

# Fail fast: abort on unhandled command failures, on use of unset variables,
# and when any stage of a pipeline fails.
set -euo pipefail

# Project root. Taken from the first positional argument; the default is used
# when the argument is missing OR empty, so an empty string can never turn the
# dataset paths below into paths rooted at "/".
BASE_PATH=${1:-"/home/spectrumKD"}

# Paths to the datasets
# REFERENCE_DATASET: gpt2-xl file whose sample order is the one to match.
# TARGET_DATASET:    gpt2-base file to be reordered.
# OUTPUT_PATH:       destination passed to the alignment tool; it sits next to
#                    the target file and carries an "_aligned" suffix.
REFERENCE_DATASET="${BASE_PATH}/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.jsonl"
TARGET_DATASET="${BASE_PATH}/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.jsonl"
OUTPUT_PATH="${BASE_PATH}/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base_aligned.jsonl"

# Show the resolved configuration before doing any work.
echo "Aligning gpt2-base dataset with gpt2-xl dataset order..."
echo "Reference dataset: ${REFERENCE_DATASET}"
echo "Target dataset: ${TARGET_DATASET}"
echo "Output path: ${OUTPUT_PATH}"

# Check if datasets exist
#
# require_dataset LABEL PATH
#   Exits the script with status 1 unless PATH is an existing regular file.
#   The diagnostic is written to stderr so it stays visible when stdout is
#   redirected or piped.
require_dataset() {
    local label=$1
    local path=$2
    if [ ! -f "$path" ]; then
        echo "Error: ${label} dataset not found: ${path}" >&2
        exit 1
    fi
}

require_dataset "Reference" "$REFERENCE_DATASET"
require_dataset "Target" "$TARGET_DATASET"

# Run the alignment script
#
# PYTHONPATH is set to BASE_PATH for this one command only (presumably so the
# tool can import project modules from the repository root -- confirm against
# align_datasets.py). The script path is quoted so a BASE_PATH containing
# spaces or glob characters is passed as a single argument.
#
# The exit status is captured instead of being ignored: previously a failed
# run still printed "Alignment complete!" and the script exited 0.
align_status=0
PYTHONPATH="${BASE_PATH}" python3 "${BASE_PATH}/scripts/gpt2/tools/align_datasets.py" \
    --reference-dataset "$REFERENCE_DATASET" \
    --target-dataset "$TARGET_DATASET" \
    --output-path "$OUTPUT_PATH" \
    --verify-alignment || align_status=$?

if [ "$align_status" -ne 0 ]; then
    echo "Error: alignment script failed with exit status ${align_status}" >&2
    exit "$align_status"
fi

# Success summary; only reached when the alignment tool exited with status 0.
echo ""
echo "Alignment complete!"
echo "Original gpt2-base dataset: $TARGET_DATASET"
echo "Aligned gpt2-base dataset: $OUTPUT_PATH"
echo ""
echo "Now both datasets have the same sample ordering for proper comparison."
