from .utility import load_dataframe, extract_instruction_apibench, extract_model_name_from_api_call, remove_fields_from_api_data
import pandas as pd
import re

# from utils.utility import (
#     load_json_file, save_json_file, validate_data_structure, 
#     create_metadata, get_data_stats
# )

def process_apibench_tf(raw_data_dir):
    """Load and normalise the TensorFlow Hub portion of APIBench.

    Reads ``apibench/tensorflow_train.json`` and
    ``apibench/tensorflow_eval.json`` under ``raw_data_dir`` (via
    ``load_dataframe`` with ``lines=True``) and derives the same set of
    columns on each split.

    Args:
        raw_data_dir: Directory that contains the ``apibench`` folder.

    Returns:
        A two-element list ``[train, eval]`` of processed DataFrames. Rows
        whose ``api_call`` holds no versioned ``https://tfhub.dev/...`` URL
        are dropped. Each frame gains the columns ``model_name``,
        ``created_at``, ``domain``, ``instruction``, ``explanation``,
        ``original_dataset`` and ``model_source``; ``api_data`` is replaced
        by the output of ``remove_fields_from_api_data``.
    """
    print("Processing TensorFlow API benchmark data...")

    # The model identifier is taken from the hub URL inside ``api_call``
    # instead of ``api_data["api_name"]``: api_name usually lacks the repo
    # owner id, and the owner-qualified name is needed to check which
    # APIBench models are present in the MLLM dataset.
    # The group captures the path between the host and the numeric version.
    hub_model_regex = r"https://tfhub\.dev/([a-zA-Z0-9/_\-]+)/[0-9]+"

    # Text following the explanation marker, up to <<<code>>> or end of string.
    explanation_regex = r"<<<explanation>>>:(.*?)(?=<<<code>>>|$)"

    # Every row receives the same fixed timestamp. An earlier approach that
    # scraped the "Published On" date from each tfhub.dev page with a browser
    # driver is disabled.
    # NOTE(review): the reason for choosing 2023-05-25 is not recorded here
    # -- confirm and document it.
    fixed_created_at = pd.Timestamp("2023-05-25", tz="UTC")

    def read_domain(api_entry):
        # Must run before ``api_data`` is overwritten further down.
        return api_entry.get("domain")

    def prepare(split):
        """Derive the output columns for one split and return the new frame."""
        # expand=False yields a Series for the single capture group.
        model_names = split["api_call"].str.extract(hub_model_regex, expand=False)
        # .copy() detaches the filtered frame so the column writes below act
        # on an independent object.
        kept = (
            split.assign(model_name=model_names)
            .dropna(subset=["model_name"])
            .copy()
        )

        kept["created_at"] = fixed_created_at
        kept["domain"] = kept["api_data"].apply(read_domain)
        kept["instruction"] = kept["code"].apply(extract_instruction_apibench)

        # DOTALL lets the explanation span several lines; rows without the
        # marker end up with an empty string.
        explanations = kept["code"].str.extract(
            explanation_regex, flags=re.DOTALL, expand=False
        )
        kept["explanation"] = explanations.fillna("")

        kept["original_dataset"] = "APIBench"
        kept["model_source"] = "TensorflowHub"
        kept["api_data"] = kept["api_data"].apply(remove_fields_from_api_data)
        return kept

    train_split = load_dataframe(
        f"{raw_data_dir}/apibench/tensorflow_train.json", lines=True
    )
    eval_split = load_dataframe(
        f"{raw_data_dir}/apibench/tensorflow_eval.json", lines=True
    )

    results = [prepare(split) for split in (train_split, eval_split)]

    print("Finished processing TensorFlow API benchmark data")
    return results