import csv
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.path as mpath
from collections import defaultdict
import matplotlib.pyplot as plt
import json
def postprocess(filename):
    """Repair a quoting sequence in *filename*, rewriting the file in place.

    The file is read fully as text, every occurrence of the broken
    sequence is swapped for the repaired one, and the result is written
    back to the same path.  Returns ``None``.
    """
    # Seven characters: three quotes, a comma, three quotes.
    broken_seq = '""","""'
    # Five characters: two quotes, a comma, two quotes.
    repaired_seq = '"",""'

    with open(filename, 'r') as src:
        original_text = src.read()

    repaired_text = original_text.replace(broken_seq, repaired_seq)

    with open(filename, 'w') as dst:
        dst.write(repaired_text)

# Repair the quoting of every CSV result file, in place, before any of them
# is parsed further down.
for csv_name in (
    "size_larger.csv",
    "probe_larger.csv",
    "new-gpt4-10.csv",
    "new-gpt4-20.csv",
    "new-gpt4-50.csv",
    "new-gpt4-100.csv",
):
    postprocess(csv_name)

def process_benchmark_data(file_path):
    """Read benchmark timings from a CSV file.

    Each non-blank line that does not contain the text ``Benchmark`` is
    parsed as one CSV record, and the value in its third column (index 2)
    is converted to ``float``.  Lines with fewer than three columns or an
    empty third column are skipped silently; values that cannot be
    converted are reported on stdout and skipped.

    Lines are parsed with the ``csv`` module rather than a plain
    ``split(',')`` so that a quoted field containing commas does not shift
    the time out of column 2.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of ``(time, 1)`` tuples sorted by ascending time.
    """
    times_and_counts = []
    with open(file_path, 'r') as f:
        for line in f:
            # Skip blank lines and any line mentioning 'Benchmark'
            # (presumably the header row).
            if not line.strip() or 'Benchmark' in line:
                continue

            # Parse one physical line at a time so a stray quote can never
            # swallow the following rows.
            try:
                parts = next(csv.reader([line]), [])
            except csv.Error:
                # Best effort: fall back to the plain split for lines the
                # csv module refuses to parse.
                parts = line.split(',')

            if len(parts) <= 2 or not parts[2].strip():
                continue

            try:
                elapsed = float(parts[2])
            except ValueError:
                # If there is an issue with conversion, report it and skip this line
                print(f"Could not convert to float: {parts[2]}")
                continue
            times_and_counts.append((elapsed, 1))

    times_and_counts.sort(key=lambda pair: pair[0])
    return times_and_counts

def prepend_data_if_necessary(dataset):
    """Insert a ``[0, 0]`` point at the front of *dataset* when needed.

    Nothing happens for an empty dataset, or when the first entry's time
    (its element 0) is not greater than 0.  The list is modified in place
    and nothing is returned.
    """
    if not dataset:
        return
    earliest_time = dataset[0][0]
    if earliest_time > 0:
        # Anchor the series at the origin so the plot starts at x=0.
        dataset.insert(0, [0, 0])

def process_json_benchmark_data(file_path):
    """Load per-entry results from a JSON file as time-sorted pairs.

    The file must contain a JSON object whose values each provide a
    ``"time_ms"`` and a ``"fit_count_of_1"`` entry.  The path is echoed on
    stdout once the file has been loaded.

    Returns:
        A list of ``(time_in_seconds, solved)`` tuples ordered by ascending
        time, where ``solved`` is 1 if ``fit_count_of_1`` is greater than 0
        and 0 otherwise.  Entries with equal times keep their file order.
    """
    with open(file_path, 'r') as handle:
        results = json.load(handle)
    print("file:", file_path)

    pairs = [
        (
            record["time_ms"] / 1000,  # milliseconds -> seconds
            1 if record["fit_count_of_1"] > 0 else 0,
        )
        for record in results.values()
    ]
    pairs.sort(key=lambda pair: pair[0])
    return pairs
#file_path = 'string-grammar-completions_modified.json'

def prepend_data_if_necessary(dataset):
    """Prepend ``[0, 0]`` to a non-empty *dataset* whose first time is > 0.

    Mutates *dataset* in place and returns ``None``.

    NOTE(review): a function with this same name is also defined earlier
    in the file; this later definition is the one in effect at the call
    sites below it.
    """
    starts_after_zero = bool(dataset) and dataset[0][0] > 0
    if starts_after_zero:
        dataset.insert(0, [0, 0])

# Load the two *_larger.csv result files, announcing each on stdout first.
print("processing-probe-data:")
probe_data = process_benchmark_data('probe_larger.csv')
print("processing-size-data:")
size_data = process_benchmark_data('size_larger.csv')

# Load the four new-gpt4-<N>.csv result files, in the same order as before.
llm_10, llm_20, llm_50, llm_100 = [
    process_benchmark_data(csv_path)
    for csv_path in (
        'new-gpt4-10.csv',
        'new-gpt4-20.csv',
        'new-gpt4-50.csv',
        'new-gpt4-100.csv',
    )
]

#print("processing-init-string-data:")
#init_llm = process_benchmark_data('init_string.csv')
#print("processing-init-string-probe-data:")
#llm_probe = process_benchmark_data('init_probe_string.csv')
#print("only-constants data:")

# Close every series with a [600, 0] point: its count of 0 leaves the
# cumulative total unchanged, so each curve runs flat out to x=600
# (presumably the timeout in seconds -- confirm).
for series in (size_data, probe_data, llm_10, llm_20, llm_50, llm_100):
    series.append([600, 0])

# Give every series a [0, 0] starting point unless it already begins at 0.
for series in (probe_data, size_data, llm_10, llm_20, llm_50, llm_100):
    prepend_data_if_necessary(series)
# Flat 70-point placeholder series: x is 0 everywhere except a final 600,
# and y is 0 throughout.  Only the commented-out 'GPT4o' plot uses them.
secs = [0] * 69 + [600]
print(secs)
total = [0] * 70
# Draw the cumulative number of solved benchmarks against time, one dashed
# curve per series, and save the figure as a PDF.
plt.xlim([0, 604])
plt.ylim(0, 100)

# (series, matplotlib format string, legend label), in drawing/legend order.
curves = [
    (llm_100, 'r--', 'HySynth-STRING-100'),
    (llm_50, 'm--', 'HySynth-STRING-50'),
    (llm_20, 'b--', 'HySynth-STRING-20'),
    (llm_10, 'g--', 'HySynth-STRING-10'),
    # Disabled curves; uncomment to add them to the figure.
    # (probe_data, 'c--', 'Probe'),
    # (size_data, 'k--', 'Unguided'),
]
for series, line_format, curve_label in curves:
    elapsed = [point[0] for point in series]
    # Each point carries a 0/1 count, so the running sum is the number of
    # benchmarks solved by that time.
    solved = np.cumsum(np.asarray([point[1] for point in series]))
    plt.plot(elapsed, solved, line_format, linewidth=2, label=curve_label, markersize=4)
#plt.plot(secs, total, 'y--', linewidth=3, label='GPT4o', markersize = 3)

# A single legend call after all curves exist: every call replaces the
# previous legend, so only the last one ever appeared in the figure.
plt.legend(loc='best', fontsize=12)

plt.xlabel('Time (Seconds)', fontsize=12)
plt.ylabel('Number of Benchmarks Solved', fontsize=13)
plt.title('Number of Problems Solved Against Time', fontsize=13)
plt.savefig('string_samples.pdf')