# %%
import mlflow

from ml_utils.proxies import set_proxies
from ml_utils.tracking import get_runs

# %% Benchmark
# MLflow tracking URI of the Azure ML workspace; adjacent literals are joined
# at compile time, so the value is one uninterrupted string.
uri = (
    "azureml://db17de6f-6cb8-4996-849f-3fdf0d10a4b9.workspace.westeurope.api.azureml.ms"
    "/mlflow/v2.0"
    "/subscriptions/dec849c6-3664-4372-9569-749eb6820434"
    "/resourceGroups/rg-VLProd"
    "/providers/Microsoft.MachineLearningServices"
    "/workspaces/mlw-deeplearning1h7w"
)
job_name = "250725124713_aaai26"
# NOTE(review): exp_name is not passed to any call in this cell; the run is
# looked up through job_name alone.
exp_name = "paper-aaai26_eval"

# Both MLflow calls are issued inside the set_proxies() context.
with set_proxies():
    mlflow.set_tracking_uri(uri)
    run = mlflow.get_run(job_name)

# Metrics stored on the fetched benchmark run.
metrics = run.data.metrics


# %% Experiment
# Export the runs of the training experiment and their metric history to disk.
exp_name = "clcp_hybrid-encoders"
# get_runs returns a per-run frame and a per-metric frame; with_hist=True
# presumably requests the full metric history -- confirm against ml_utils.
df_run, df_metrics = get_runs(uri, exp_name, with_hist=True)

# De-duplicate within each run only. Without "run_id" in the subset, rows from
# different runs that share a metric key and step collapse into a single row,
# and the survivor can belong to the run that is filtered out below.
df_metrics = df_metrics.drop_duplicates(subset=["run_id", "key", "step"], keep="last")

# Leave out the run whose id starts with 250624125651 from both frames.
df_run = df_run[~df_run["run_id"].str.startswith("250624125651")]
df_metrics = df_metrics[
    ~df_metrics["run_id"].str.startswith("250624125651")
].sort_values(["run_id", "key", "step"])

# NOTE(review): the files are written without a ".csv" extension although the
# prompt below refers to them as CSVs; names are kept as-is for any consumers.
df_run.to_csv("experiment_run", index=False)
df_metrics.to_csv("experiment_metrics", index=False)


# %% Prompt
# Free-text request for an analysis of the training run. It describes the
# exported files (benchmark, experiment_run, experiment_metrics) and the
# training/evaluation setup to whoever receives it.
# NOTE(review): the text mentions a benchmark CSV, but the visible part of this
# script never writes `metrics` to a file -- confirm where that export happens.
prompt = """
I need help analyzing the results of a training run for a zero-shot classifier.

The benchmark CSV contains results from off-the-shelf models (cross-encoders and both trained and untrained dual encoders).

The experiment_run CSV contains MLflow logs from my training run with general info and the final metrics, whereas the experiment_metrics CSV contains the full history.

Training details:

The model is trained on NLI datasets reformulated to binary classification (neutral labels collapsed into contradiction).

Evaluation is performed on both NLI and classification (clf) test sets.

Clf test sets are binarized by turning each label into a premise-hypothesis pair and later cast back into a multi-class setup during evaluation.

During training, I track MNLI-MM performance for early stopping. Other metrics (on all test sets) are evaluated only at the start and end of training.
"""
