library(jsonlite)
library(data.table)
library(ggplot2)
library(extrafont)
loadfonts(device = "pdf")

# Define the parameters: every model x perspective x prompt combination below
# is processed independently.
models <- c("gpt-4o", "claude-3-5-sonnet-20241022", "llama")
perspectives <- c("first", "user")
prompts <- c("cot", "direct")

# Additional column kept alongside the last 9 columns of each unnested table;
# `value` is correlated against it.
extra_col <- "params.dist.X..a0..1..a1..0..prob"

# Columns that identify one parameter setting of the grid search.
grid_cols <- c("alphaL0", "alphaS", "w")

# Pearson correlation of `x` and `y` over the pairs where both are non-missing.
# Returns NA_real_ when fewer than two complete pairs exist, whereas
# cor(..., use = "complete.obs") raises an error when there are none.
complete_pair_cor <- function(x, y) {
  keep <- !is.na(x) & !is.na(y)
  if (sum(keep) < 2L) {
    return(NA_real_)
  }
  cor(x[keep], y[keep])
}

# Loop over all combinations of model, perspective, and prompt
for (model in models) {
  for (perspective in perspectives) {
    for (prompt in prompts) {
      # Construct the JSON file path dynamically
      file_path <- sprintf("REDACTED",
                            model, perspective, prompt)

      # Check if the file exists before attempting to load it
      if (!file.exists(file_path)) {
        print(paste("Skipping missing file:", file_path))
        next
      }

      # Load the JSON data
      js1a <- fromJSON(file_path)

      # Convert to data.table, unnesting the nested `results` column
      d1a <- data.table(tidyr::unnest(data.frame(js1a), cols = c(results)))

      # Ensure the additional column exists in the dataset
      if (!extra_col %in% names(d1a)) {
        print("Error: Specified column does not exist in dataset. Skipping...")
        next
      }

      # Keep the last 9 columns plus the additional column. unique() avoids
      # selecting the additional column twice when it is already among the
      # last 9 (duplicate names would make the position lookups ambiguous).
      last_9_cols <- tail(names(d1a), 9)
      d1a_filtered <- d1a[, unique(c(last_9_cols, extra_col)), with = FALSE]

      # The grid search filters the reduced table by the parameter columns,
      # so they must have survived the column selection above.
      if (!all(grid_cols %in% names(d1a_filtered))) {
        print(paste("Error: Parameter columns (alphaL0, alphaS, w) not found",
                    "among the kept columns. Skipping..."))
        next
      }

      # Extract unique parameter combinations
      unique_combinations <- unique(d1a[, grid_cols, with = FALSE])

      # Initialize tracking variables for the best correlation
      best_params <- NULL
      best_correlation <- -Inf

      # Loop over each unique combination of parameters; seq_len() yields an
      # empty sequence (rather than c(1, 0)) when there are no combinations.
      for (i in seq_len(nrow(unique_combinations))) {

        # Extract current parameter values
        current_alphaL0 <- unique_combinations[["alphaL0"]][i]
        current_alphaS <- unique_combinations[["alphaS"]][i]
        current_w <- unique_combinations[["w"]][i]

        # Extract corresponding subtable
        subtable <- d1a_filtered[alphaL0 == current_alphaL0 &
                                    alphaS == current_alphaS &
                                    w == current_w]

        # Ensure the necessary columns exist before calculating correlation
        if ("value" %in% names(subtable) && extra_col %in% names(subtable)) {

          # Convert to numeric (in case columns are not already). Working on
          # extracted vectors leaves the table itself untouched, so the data
          # written out later keeps its original column types.
          observed <- as.numeric(subtable[["value"]])
          predicted <- as.numeric(subtable[[extra_col]])

          # Compute correlation over complete pairs; NA if there are too few
          correlation <- complete_pair_cor(observed, predicted)

          # Print correlation for verification
          print(paste("Correlation for", model, perspective, prompt,
                      current_alphaL0, current_alphaS, current_w, ":", correlation))

          # Update best correlation if this one is higher
          if (!is.na(correlation) && correlation > best_correlation) {
            best_correlation <- correlation
            best_params <- list(model = model, perspective = perspective, prompt = prompt,
                                alphaL0 = current_alphaL0, alphaS = current_alphaS, w = current_w)
          }
        } else {
          print("Error: Required columns not found or not numeric in the subtable.")
        }
      }

      # Nothing to save when no parameter setting produced a usable correlation
      if (is.null(best_params)) {
        print(paste("No valid best parameters found for", model, perspective, prompt))
        next
      }

      # Print the best correlation for this model-perspective-prompt combination
      print(paste("Best correlation for", model, perspective, prompt, ":", best_correlation,
                  "achieved with alphaL0 =", best_params$alphaL0,
                  "alphaS =", best_params$alphaS, "w =", best_params$w))

      # Filter the table to keep only rows matching the best parameters
      d1a_best_filtered <- d1a_filtered[alphaL0 == best_params$alphaL0 &
                                          alphaS == best_params$alphaS &
                                          w == best_params$w]

      # Keep only the columns from val_vr to the additional column.
      # match() returns the first position, or NA when the name is absent.
      start_col <- match("val_vr", names(d1a_best_filtered))
      end_col <- match(extra_col, names(d1a_best_filtered))
      if (is.na(start_col) || is.na(end_col)) {
        print("Error: Column 'val_vr' not found in filtered data. Skipping output...")
        next
      }
      d1a_best_filtered <- d1a_best_filtered[, start_col:end_col, with = FALSE]

      # Construct the output directory and ensure it exists
      output_dir <- sprintf("REDACTED", model)
      if (!dir.exists(output_dir)) {
        dir.create(output_dir, recursive = TRUE)
      }

      # Format correlation to 4 decimal places
      cor_str <- formatC(best_correlation, format = "f", digits = 4)

      # Construct output file paths with correlation in name.
      # NOTE(review): output_dir is joined to the file name with "_" rather
      # than a path separator, so unless output_dir ends in a separator the
      # files are written next to the created directory, not inside it.
      # Confirm this is intended before switching to file.path().
      output_csv <- sprintf("%s_%s_%s_best_webppl_corr%s.csv", output_dir, perspective, prompt, cor_str)
      output_json <- sprintf("%s_%s_%s_best_webppl_corr%s.json", output_dir, perspective, prompt, cor_str)

      # Flatten any nested data-frame columns, then save to CSV and JSON
      d1a_best_filtered <- data.table(jsonlite::flatten(as.data.frame(d1a_best_filtered)))
      fwrite(d1a_best_filtered, output_csv)
      write_json(as.data.frame(d1a_best_filtered), output_json, pretty = TRUE, auto_unbox = TRUE)

      print(paste("Filtered data saved to:", output_csv))
      print(paste("Filtered data saved to:", output_json))
    }
  }
}



