from .utility import load_dataframe, extract_instruction_torchhub, extract_repo_model_name_torchhub, remove_fields_from_api_data, combine_repo_model, get_explanation_torchhub
import pandas as pd

def _prepare_torchhub_split(split_df):
    """Bring one raw TorchHub APIBench split into the processed layout.

    Steps, in order:
      1. Parse ``api_call`` into ``repo_name`` and ``model_name`` columns.
      2. Drop rows whose parsed ``model_name`` is missing.
      3. Replace ``model_name`` with
         ``combine_repo_model(repo_name, model_name)``.
      4. Add ``created_at``, ``domain``, ``instruction``, ``explanation``,
         ``original_dataset`` and ``model_source``.
      5. Pass ``api_data`` through ``remove_fields_from_api_data``.

    Note: the two parsed columns are written onto ``split_df`` itself before
    the filtered copy is taken, so the argument is modified in place.

    Returns the filtered, augmented copy.
    """
    # The model name is taken from api_call rather than from api_data:
    # most of the time the api_name field does not contain the repo owner id,
    # and the repo id is needed to check which APIBench models are present
    # in the MLLM dataset.
    def parse_call(api_call):
        return pd.Series(extract_repo_model_name_torchhub(api_call))

    parsed = split_df["api_call"].apply(parse_call)
    split_df[["repo_name", "model_name"]] = parsed
    rows = split_df.dropna(subset=["model_name"]).copy()

    def qualified_name(row):
        return combine_repo_model(row["repo_name"], row["model_name"])

    rows["model_name"] = rows.apply(qualified_name, axis=1)

    # An earlier, now-disabled variant looked up each repository's creation
    # date through the GitHub API; every row gets the same fixed timestamp
    # instead.
    fixed_created_at = pd.Timestamp("2023-05-25", tz="UTC")
    rows["created_at"] = fixed_created_at

    # "domain" is read from api_data before api_data is rewritten below.
    def domain_of(api_data):
        return api_data.get("domain")

    rows["domain"] = rows["api_data"].apply(domain_of)

    code_column = rows["code"]
    rows["instruction"] = code_column.apply(extract_instruction_torchhub)
    rows["explanation"] = code_column.apply(get_explanation_torchhub)

    rows["original_dataset"] = "APIBench"
    rows["model_source"] = "TorchHub"
    rows["api_data"] = rows["api_data"].apply(remove_fields_from_api_data)
    return rows


def process_apibench_th(raw_data_dir):
    """Load and process the TorchHub train and eval splits of APIBench.

    Reads ``apibench/torchhub_train.json`` and ``apibench/torchhub_eval.json``
    from under ``raw_data_dir`` (both loaded with ``lines=True``) and processes
    each split independently.

    Args:
        raw_data_dir: Directory that contains the ``apibench`` folder.

    Returns:
        A two-element list of processed DataFrames: train first, then eval.
    """
    print("Processing TorchHub API benchmark data...")

    split_paths = (
        f"{raw_data_dir}/apibench/torchhub_train.json",
        f"{raw_data_dir}/apibench/torchhub_eval.json",
    )
    # Both files are loaded up front, before any processing starts.
    raw_splits = [load_dataframe(path, lines=True) for path in split_paths]

    processed_dfs = [_prepare_torchhub_split(split) for split in raw_splits]

    print("Finished processing TorchHub API benchmark data")
    return processed_dfs