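# Collect per-(query strategy, dataset) detail rows from detail/*-detail.csv, keep the
# experiment numbers listed in the uniform aubc/*-aubc.csv files, and pickle the result.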
import pickle
import pandas as pd
from io import StringIO
import numpy as np
from tqdm import tqdm

def _repair_fields(fields, expected_fields):
    """Return the well-formed rows recoverable from one split line.

    A line that already has ``expected_fields`` fields is returned as a single
    row. Otherwise the known malformed layouts are tried in a fixed order;
    layouts that are deliberately ignored yield an empty list.

    Raises:
        ValueError: if the layout is not recognised, or if a repair rule
            produces a row whose width differs from ``expected_fields``.
    """
    width = len(fields)
    if width == expected_fields:
        return [fields]

    if fields[1:3] == fields[4:6]:
        # Fields 1-2 repeat at positions 4-5 (presumably a duplicated row
        # prefix): keep everything from field 3 onwards.
        rows = [fields[3:]]
    elif width == 12:
        # Split into two rows: the first six fields padded with a trailing
        # empty field, and the last six prefixed with an empty field.
        rows = [fields[:6] + [''], [''] + fields[6:]]
    elif width == 10:
        rows = []  # ignored layout
    elif width == 15:
        # Two rows again; fields 6 and 7 are discarded.
        rows = [fields[:6] + [''], fields[8:]]
    elif 'INFO' in fields[0]:
        # Blank out the first field and pad one empty field at the end.
        rows = [[''] + fields[1:] + ['']]
    elif width == 5 or set(fields) == {''}:
        rows = []  # ignored layouts: five fields, or nothing but separators
    else:
        # Fail loudly instead of dropping into a debugger or silently
        # discarding data that nobody has looked at.
        raise ValueError(f'unrecognised layout with {width} fields: {fields!r}')

    # Explicit check (not ``assert``) so it still runs under ``python -O``.
    for row in rows:
        if len(row) != expected_fields:
            raise ValueError(
                f'repairing a {width}-field line produced {len(row)} fields, '
                f'expected {expected_fields}')
    return rows


def read_csv_with_format_check(file_path, expected_fields=7, sep='|', header=None):
    """Load a delimited detail file, repairing known malformed lines.

    Blank lines are skipped. Every other line is stripped, split on ``sep``
    and either kept as is, repaired into one or two rows, or dropped (see
    ``_repair_fields``). Empty fields become NaN, column 3 is converted to
    float, rows whose column 3 is outside ``[0.5, 1]`` (or missing) are
    removed, and column 2 is converted to int.

    Args:
        file_path: Path of the file to read.
        expected_fields: Number of fields a well-formed line has.
        sep: Field delimiter.
        header: Unused; kept so existing callers that pass it keep working.

    Returns:
        pandas.DataFrame with integer column labels and the original row
        positions as index (filtered rows leave gaps).

    Raises:
        ValueError: if a line has an unrecognised layout or a repair yields a
            row of the wrong width (message includes file and line number),
            or if column 2/3 cannot be converted.
        KeyError: if no rows were recovered, because the frame then has no
            column 3.
    """
    rows = []
    with open(file_path, 'r') as file:
        # Enumerate the raw lines so reported numbers match the file itself.
        for line_num, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                rows.extend(_repair_fields(line.split(sep), expected_fields))
            except ValueError as err:
                raise ValueError(f'{file_path}, line {line_num}: {err}') from err

    frame = pd.DataFrame(rows)
    # Use NaN explicitly: replace('', None) is version-dependent in pandas
    # (older releases interpret a None value as "pad"/forward-fill).
    frame = frame.replace('', np.nan)
    frame[3] = frame[3].astype(float)
    # Keep only rows whose column 3 lies in [0.5, 1]; NaN never matches.
    # Copy so the assignment below does not write into a slice of ``frame``.
    frame = frame[frame[3].between(0.5, 1)].copy()
    frame[2] = frame[2].astype(int)
    return frame


# Query-strategy identifiers mapped to their display names. The trailing
# comments name the library each group is attributed to.
qs_dict = {
    'uniform': 'Uniform',
    # libact
    'qbc': 'QBC',
    'hintsvm': 'HintSVM',
    'quire': 'QUIRE',
    'albl': 'ALBL',
    'dwus': 'DWUS',
    'kcenter': 'Core-Set',
    # google
    'margin': 'US',
    'graph': 'Graph',
    'hier': 'Hier',
    'mcm': 'MCM',
    # alipy
    'lal': 'LAL',
    'bmdr': 'BMDR',
    # scikit-activeml
    'skactiveml_bald': 'BALD',
}

# Display names in a fixed presentation order.
ordered_qs = [
    'Uniform', 'US', 'QBC', 'BALD', 'Hier', 'Graph', 'Core-Set',
    'HintSVM', 'QUIRE', 'DWUS', 'MCM', 'BMDR', 'ALBL', 'LAL',
]

# Dataset names, split into three groups.
small_data_list = [
    "appendicitis", "sonar", "parkinsons", "ex8b", "heart",
    "haberman", "ionosphere", "clean1", "breast", "wdbc",
    "australian", "diabetes", "mammographic", "ex8a", "tic",
    "german", "splice", "gcloudb", "gcloudub", "checkerboard",
]
large_data_list = [
    "spambase", "banana", "phoneme", "ringnorm", "twonorm", "phishing",
]
real_data_list = ['covertype', 'bioresponse', 'pol']

# All datasets (small, then large, then real) and their capitalised labels.
data_list = [*small_data_list, *large_data_list, *real_data_list]
data_dict = {name: name.capitalize() for name in data_list}

# Location and common file-name tail of the per-run detail files.
file_prefix = 'detail/'
file_suffix = '-zhan-RandomForest-RandomForest-RS_noFix_scale-detail.csv'

# For every dataset, read the index of its uniform-strategy AUBC file and keep
# the leading entries (100 for small datasets, 10 otherwise). These values are
# later matched against the experiment-number column of the detail files.
# NOTE: the index is taken in file order; nothing is sorted here.
valid_res_expno_dict = {}
for data in data_dict:
    valid_res_expno_dict[data] = None
    uniform_data_aubc_curr = pd.read_csv(
        f'aubc/{data}-uniform-zhan-RandomForest-RandomForest-RS_noFix_scale-aubc.csv',
        index_col=0,
    )
    keep_count = 100 if data in small_data_list else 10
    valid_res_expno_dict[data] = uniform_data_aubc_curr.index.values[:keep_count]

# Load the detail table for every (query strategy, dataset) pair, keep only
# columns 1-3 and the experiment numbers selected in valid_res_expno_dict, and
# store it under the key (qs, data). Pairs whose detail file is missing or
# cannot be parsed keep the value None.
qs_data_detail_dict = {}
for qs in tqdm(qs_dict):
    for data in data_dict:
        qs_data_detail_dict[(qs, data)] = None
        details_curr = None
        try:
            details_curr = read_csv_with_format_check(
                f'{file_prefix}{data}-{qs}{file_suffix}', expected_fields=7, sep='|', header=None)
        except FileNotFoundError:
            print(f'No result {qs}, {data}.')
            continue
        except Exception as err:
            # Still best effort (skip this pair), but say why, and let
            # KeyboardInterrupt/SystemExit through so the run can be stopped.
            print(f'No result {qs}, {data}. ({type(err).__name__}: {err})')
            continue
        # Copy so the dtype conversion below writes to an independent frame
        # rather than to a column slice of the parsed table.
        details_curr = details_curr[[1, 2, 3]].copy()
        details_curr[1] = details_curr[1].astype(int)
        details_curr = details_curr.loc[details_curr[1].isin(
            valid_res_expno_dict[data])]
        details_curr.columns = ['expno', 'round', 'res_tst_score']
        qs_data_detail_dict[(qs, data)] = details_curr

# Write the collected (qs, data) -> detail table mapping to disk as a pickle,
# using the newest protocol this interpreter supports.
with open('qss_datas_details_dict.pickle', 'wb') as pickle_file:
    pickle.dump(
        qs_data_detail_dict,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
