import pandas as pd
import yaml
import json
from datetime import datetime
import os
import sys

# The only command-line argument is the path to the YAML experiment config.
# Exit with a readable message instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("Error: expected the path to a YAML config file as the first argument.")
config_path = sys.argv[1]

# Load YAML configuration
with open(config_path, "r") as f:
    config = yaml.safe_load(f)

# Required config keys (a missing key raises KeyError here):
#   benchmarks  - mapping of benchmark name -> CSV file name (appended to base_dir)
#   base_dir    - string prefix put in front of every benchmark file name
#   target_logp - value the metric functions compare the clogp_value column against
#   datasets    - mapping of dataset name -> Parquet file path
benchmarks = config["benchmarks"]
base_dir = config["base_dir"]
target_logp = config["target_logp"]
datasets = config["datasets"]

# Load data
experiment_name = config.get("experiment_name", "default")
# base_dir and the file name are joined by plain string concatenation, so
# base_dir must already end with a path separator if it names a directory.
benchmark_data = {name: pd.read_csv(base_dir + file) for name, file in benchmarks.items()}
# From here on `datasets` maps dataset name -> loaded DataFrame (it no longer holds paths).
datasets = {name: pd.read_parquet(file) for name, file in datasets.items()}
# None when the config has no "training_data" dataset entry.
training_data = datasets.get("training_data")

# Create a unique log filename using experiment name and timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_filename = f"logs/{experiment_name}_{timestamp}.yaml"
os.makedirs("logs", exist_ok=True)  # Ensure logs directory exists


def compute_mae(df, target_logp):
    """Return the mean absolute difference between target_logp and df['clogp_value']."""
    signed_error = target_logp - df['clogp_value']
    absolute_error = signed_error.abs()
    return absolute_error.mean()

def compute_min_mae(df, target_logp):
    """Return the smallest absolute difference between target_logp and df['clogp_value']."""
    signed_error = target_logp - df['clogp_value']
    absolute_error = signed_error.abs()
    return absolute_error.min()

def compute_best_100_mae(df, target_logp, n=100):
    """Return the mean absolute logP error over the n rows closest to the target.

    Args:
        df: DataFrame with a 'clogp_value' column. It is not modified.
        target_logp: Value the 'clogp_value' column is compared against.
        n: Number of smallest absolute errors to average (default 100). When
            the frame has fewer rows, all of them are used.

    Returns:
        Mean of the n smallest values of |target_logp - clogp_value|.
    """
    # Work on a temporary Series so the caller's frame does not gain a
    # 'logp_diff' scratch column as a side effect of computing the metric.
    logp_diff = (target_logp - df['clogp_value']).abs()
    return logp_diff.nsmallest(n).mean()

def compute_generative_efficiency(df, training_data):
    """Return the fraction of rows that are valid, novel and unique.

    A row counts as efficient when all three hold:
      * its 'valid_smiles' value is truthy,
      * its 'smiles_string' does not occur in training_data['smiles'],
      * its 'smiles_string' occurs exactly once in df.

    Args:
        df: DataFrame with 'smiles_string' and 'valid_smiles' columns.
            It is not modified.
        training_data: DataFrame with a 'smiles' column of known molecules.

    Returns:
        The generative efficiency rate: the mean of the per-row flag.
    """
    smiles = df['smiles_string']
    # isin performs one vectorized membership test instead of scanning the
    # whole training array once per generated row.
    is_novel = ~smiles.isin(training_data['smiles'])
    # keep=False marks every member of a duplicate group, so a repeated
    # string never counts, not even its first occurrence.
    is_unique = ~smiles.duplicated(keep=False)
    is_valid = df['valid_smiles'].astype(bool)
    return (is_valid & is_novel & is_unique).mean()

# Helpers for serializing and saving results
def convert_numpy_types(obj):
    """Recursively convert NumPy/pandas values into plain Python objects.

    Handling by type:
      * dict: values are converted recursively (keys are left as they are).
      * list / tuple: elements are converted recursively; a tuple comes back
        as a plain tuple.
      * pandas Series / DataFrame: converted with ``to_dict()``.
      * anything exposing ``.item()`` (e.g. NumPy scalars and single-element
        arrays): replaced by the result of ``.item()``.
      * array-likes whose ``.item()`` raises ValueError, such as arrays with
        zero or several elements: converted through ``.tolist()`` instead.
      * everything else is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) value.

    Returns:
        The converted value.
    """
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(value) for value in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(value) for value in obj)
    if isinstance(obj, (pd.Series, pd.DataFrame)):
        return obj.to_dict()
    if hasattr(obj, "item"):
        try:
            return obj.item()
        except ValueError:
            # .item() only accepts exactly one element; fall back to a list
            # for larger or empty arrays instead of aborting the whole dump.
            if hasattr(obj, "tolist"):
                return convert_numpy_types(obj.tolist())
            raise
    return obj

def save_results(results, config, log_file):
    """Write the evaluation results and the config that produced them to a YAML file.

    The file holds four top-level entries: the current timestamp, the
    experiment name (taken from the config, "default_experiment" if absent),
    the config itself, and the results after convert_numpy_types().
    A confirmation line is printed once the file has been written.
    """
    written_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    run_name = config.get("experiment_name", "default_experiment")
    # Convert NumPy types so the dump contains plain Python values.
    plain_results = convert_numpy_types(results)

    payload = {}
    payload["timestamp"] = written_at
    payload["experiment_name"] = run_name
    payload["config"] = config
    payload["results"] = plain_results

    with open(log_file, "w") as handle:
        yaml.dump(payload, handle, default_flow_style=False)
    print(f"\nResults logged in {log_file}")

# Compute results
# Each dict below maps benchmark name -> metric value for that benchmark's
# DataFrame; the three MAE metrics use target_logp from the config.
min_mae_results = {name: compute_min_mae(df, target_logp) for name, df in benchmark_data.items()}
best_100_mae_results = {name: compute_best_100_mae(df, target_logp) for name, df in benchmark_data.items()}
mae_results = {name: compute_mae(df, target_logp) for name, df in benchmark_data.items()}
# Generative efficiency is measured against the training dataset.
generative_efficiency_results = {name: compute_generative_efficiency(df, training_data) for name, df in benchmark_data.items()}

# Print results
# One section per metric; benchmark names are shown via str.capitalize().
print("\n------------------------- MinMAE Results -------------------------")
for name, min_mae in min_mae_results.items():
    print(f"{name.capitalize()} MinMAE of LOGP: {min_mae}")

print("\n------------------------- Best100MAE Results -------------------------")
for name, best_100_mae in best_100_mae_results.items():
    print(f"{name.capitalize()} Best100MAE of LOGP: {best_100_mae}")

print("\n------------------------- MAE Results -------------------------")
for name, mae in mae_results.items():
    print(f"{name.capitalize()} MAE of LOGP: {mae}")

print("\n------------------------- Generative Efficiency Results -------------------------")
for name, generative_efficiency in generative_efficiency_results.items():
    print(f"{name.capitalize()} Generative Efficiency Rate: {generative_efficiency}")

# Prepare structured log data
# Keys are the metric names used in the YAML log; each value is the
# per-benchmark dict computed above.
log_results = {
    "MinMAE": min_mae_results,
    "Best100MAE": best_100_mae_results,
    "MAE": mae_results,
    "GenerativeEfficiency": generative_efficiency_results
}

# Save results to log file
# Writes the results together with the full config to log_filename.
save_results(log_results, config, log_filename)