#!/bin/bash
# Static configuration: where the model repository YAML lives, which model
# entry to read from it, and the name given to the Docker container.
yaml_file="./config/model_repo.yaml"
model_name="llava-next-1.6"
container_name="tgi-llava-next"

# Fail early with a clear message instead of letting empty values leak into
# the docker commands further down.
if ! command -v yq >/dev/null 2>&1; then
    echo "Error: yq is required but was not found in PATH." >&2
    exit 1
fi
if [ ! -f "$yaml_file" ]; then
    echo "Error: configuration file not found: $yaml_file" >&2
    exit 1
fi

# Extract information from the YAML file
hf_endpoint=$(yq e ".models.\"${model_name}\".model_path" "$yaml_file")
quant=$(yq e ".models.\"${model_name}\".quant" "$yaml_file")
hf_models_location=$(yq e ".general.hf_models_location" "$yaml_file")
# Expand a leading "~" by hand: tilde expansion does not happen on values
# that come out of a command substitution.
hf_models_location="${hf_models_location/#\~/$HOME}"

# yq prints the literal string "null" for a missing key, so treat that the
# same as an empty value. quant is optional and is checked where it is used.
if [ -z "$hf_endpoint" ] || [ "$hf_endpoint" = "null" ]; then
    echo "Error: .models.\"${model_name}\".model_path is not set in $yaml_file" >&2
    exit 1
fi
if [ -z "$hf_models_location" ] || [ "$hf_models_location" = "null" ]; then
    echo "Error: .general.hf_models_location is not set in $yaml_file" >&2
    exit 1
fi

echo "Hugging face models location: $hf_models_location"
echo "Model path: $hf_endpoint"



# --- Container / server settings ---
port=9555                     # published by docker and passed to the launcher
num_shards=1
num_cpus=4
cuda_memory_fraction=0.98

# --- Token and batch limits ---
# Important: set this to 1 because tgi will stress test and you can run out of GPU memory. See TGI notes.
max_batch_size=1
max_input_tokens=4096
# The total budget is exactly one token above the input budget.
max_total_tokens=$(( max_input_tokens + 1 ))
# With a batch size of 1 the per-batch budget equals the per-request budget.
max_batch_total_tokens="${max_total_tokens}"

# Start the Docker container, mounting to the model repository.
# The image entrypoint is replaced with bash and the container is started
# detached, so it stays up until the launcher is exec'd into it below.

# SECURITY: the Hugging Face token must come from the caller's environment
# (export HF_TOKEN=...), never from this file. Any token that was previously
# committed here should be considered leaked and revoked.
if [ -z "${HF_TOKEN:-}" ]; then
    echo "Warning: HF_TOKEN is not set; gated or private models will not be downloadable." >&2
fi

docker_run_args=(
    --rm --entrypoint /bin/bash -itd
    --name "$container_name"
    -v "$hf_models_location:/data"
    --gpus all
    --cpus="$num_cpus"
    -p "$port:$port"
    # Name-only form: docker copies the value from the host environment, so
    # the secret never appears on the command line.
    -e HF_TOKEN
)

# Check if the container started successfully
if docker run "${docker_run_args[@]}" ghcr.io/huggingface/text-generation-inference:latest; then
    echo "Container started successfully!"
else
    echo "Failed to start the container." >&2
    exit 1
fi

# Deploy TGI server on the container

# Arguments shared by both the quantized and the non-quantized launch.
# Built as an array so every value reaches the launcher as a single word,
# without a second round of shell parsing inside the container.
# NOTE(review): --master-port reuses the HTTP port; confirm this is still safe
# if num_shards is ever raised above 1.
launcher_args=(
    --model-id "$hf_endpoint"
    --num-shard "$num_shards"
    --tokenizer-config-path "$hf_endpoint"
    --max-batch-size "$max_batch_size"
    --max-input-tokens "$max_input_tokens"
    --max-total-tokens "$max_total_tokens"
    --max-batch-total-tokens "$max_batch_total_tokens"
    --trust-remote-code
    --cuda-memory-fraction "$cuda_memory_fraction"
    --port "$port"
    --master-port "$port"
    --master-addr localhost
)

# obs: cannot use quantize and dtype at the same time
# An empty value or yq's literal "null" means no quantization was configured.
if [ -z "$quant" ] || [ "$quant" = "null" ]; then
    launcher_args+=(--dtype float16)
else
    launcher_args+=(--quantize "$quant")
fi

# docker exec stays in the foreground for as long as the launcher runs, so
# the branches below are only reached once the launcher has exited.
# Check if the text-generation-launcher command was successful
if docker exec "$container_name" text-generation-launcher "${launcher_args[@]}"; then
    echo "text-generation-launcher ran successfully!"
else
    echo "Failed to run text-generation-launcher." >&2
    echo "Stopping the Docker container..." >&2
    docker stop "$container_name"
    exit 1
fi



