'''
Database for laminate open hole test example.

Description of the specimen:

- The specimen is a Carbon Fibre Reinforced Plastic (CFRP) plate with a hole in the center.
- The plate is placed in the x-y plane, the thickness is in the z-direction.
- The dimensions of the plate are l_x * l_y * t_z.
- The hole's radius is r_h, its center is at (l_x*rx_h, l_y*ry_h), where rx_h and ry_h are the relative coordinates of the hole's center.
- The plate is clamped at the y=0 face and loaded in compression at the y = l_y face.

Design variables:

- the relative x-coordinate of the hole's center: rx_h;
- the radius of the hole: r_h;
- percentage of the plies in the 0-degree orientation and (+/-)45-degree orientation: p_0, p_45;

Random variables (source of uncertainty):

- detailed stacking sequence of the laminate: delta;

Outputs (quantity of interest):

- stress components of the critical point in the laminate: S11, S22, S33, S12, S13, S23;
- LaRC05 maximum failure index: UVARM6;

Other outputs (not of interest):

- coordinates of the critical point: X, Y, Z;
- LaRC05 failure indices: UVARM1, UVARM2, UVARM3, UVARM4, UVARM5;
- failure mode: UVARM7;
- index of critical point: index;

'''
import os
import sys
# Make modules located in the current working directory importable.
sys.path.append('.')
# Directory of the launched script (sys.argv[0]); the data and summary paths are built from it.
# NOTE(review): this is the entry script's directory, not necessarily this file's — confirm intended.
path0 = os.path.dirname(sys.argv[0])
# Folder that holds the per-model summary CSV files.
path_summary = os.path.join(path0, 'summary')

import pandas as pd
import numpy as np
import torch
from typing import Tuple, List
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


# Default path of the Open Hole Test database file, located under `<path0>/data`.
FNAME_DATABASE = os.path.join(path0, 'data', 'OHT-complete-database-4000.dat')

# Column names (as written in the database header) selected as model inputs.
# Alternative selection kept for reference:
# NAME_INPUT = ['xr_hole_center', 'r_hole', 'p45', 'p0']
NAME_INPUT = ['xr_hole_center', 'r_hole']
# Column names (as written in the database header) selected as model outputs.
# Alternative selection kept for reference:
# NAME_OUTPUT = ['S11', 'S22', 'S33', 'S12', 'S13', 'S23', 'UVARM6']
NAME_OUTPUT = ['UVARM6']

# Switch for PyTorch autograd anomaly detection; applied globally when this module is imported.
CHECK_NAN_GRAD = False
torch.autograd.set_detect_anomaly(CHECK_NAN_GRAD)


class OHTDataset(Dataset):
    '''
    In-memory torch dataset holding laminate Open Hole Test samples.

    Parameters
    ----------
    xs, ys : np.ndarray
        Input and output samples, one row per sample. Both arrays are copied.

    gpu_id : int | None
        GPU ID. None or a negative integer selects the CPU.

    x_min, x_max : float | np.ndarray | None
        Range of the input data. When None, the column-wise minimum / maximum
        of `xs` is used.

    y_min, y_max : float | np.ndarray | None
        Range of the output data. When None, the column-wise minimum / maximum
        of `ys` is used. The range is only stored; the outputs are not scaled.

    scale_x : bool
        If True, the inputs are replaced by
        (xs - x_min) / clip(x_max - x_min, 1e-8, None).

    Attributes
    ----------
    name : str
        Always 'OHTDataset'.

    num_samples : int
        Number of rows in `xs`.

    dim_input, dim_output : int
        Number of columns in `xs` and `ys`.

    name_x, name_y : List[str]
        The module-level NAME_INPUT and NAME_OUTPUT lists.

    X_cpu, Y_cpu : np.ndarray
        NumPy copies of the data (X_cpu is scaled when `scale_x` is True).

    X : torch.Tensor (num_samples, dim_input)
        Float tensor of the inputs.

    Y : torch.Tensor (num_samples, dim_output)
        Float tensor of the outputs.
    '''
    def __init__(self, xs: np.ndarray, ys: np.ndarray, gpu_id=None, 
                    x_min=None, x_max=None, y_min=None, y_max=None,
                    scale_x = False) -> None:

        self.name = 'OHTDataset'

        #* Device selection: a missing or negative ID means CPU
        run_on_cpu = gpu_id is None or gpu_id < 0
        self.gpu_id = None if run_on_cpu else gpu_id

        #* Keep private copies so the caller's arrays are never modified
        inputs = xs.copy()
        outputs = ys.copy()
        self.X_cpu = inputs
        self.Y_cpu = outputs

        self.num_samples, self.dim_input = inputs.shape[0], inputs.shape[1]
        self.dim_output = outputs.shape[1]

        #* Data ranges: fall back to the ranges of the given samples
        if x_min is None:
            x_min = np.min(inputs, axis=0)
        if x_max is None:
            x_max = np.max(inputs, axis=0)
        if y_min is None:
            y_min = np.min(outputs, axis=0)
        if y_max is None:
            y_max = np.max(outputs, axis=0)
        self.x_min, self.x_max = x_min, x_max
        self.y_min, self.y_max = y_min, y_max

        self.name_x = NAME_INPUT
        self.name_y = NAME_OUTPUT

        if scale_x:
            shifted = inputs - self.x_min
            # The lower bound avoids a division by zero for constant columns
            span = np.clip(self.x_max - self.x_min, 1e-8, None)
            self.X_cpu = shifted / span

        #* Torch copies of the (possibly scaled) data
        self.X = torch.tensor(self.X_cpu).float()
        self.Y = torch.tensor(self.Y_cpu).float()

        if self.gpu_id is not None and torch.cuda.is_available():
            self.X = self.X.to(self.gpu_id)
            self.Y = self.Y.to(self.gpu_id)

    def __len__(self):
        '''Number of samples in the dataset.'''
        return self.num_samples

    def __getitem__(self, idx):
        '''Return the (input, output) tensors selected by `idx`.'''
        sample_x = self.X[idx]
        sample_y = self.Y[idx]
        return sample_x, sample_y


def load_database(fname_database, name_input=NAME_INPUT, name_output=NAME_OUTPUT) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    '''
    Read a whitespace-separated database file and extract the selected columns.

    The first line of the file is the header: its first token is dropped and
    the remaining tokens are taken as the column names. Every following
    non-blank line is parsed as one row of floats.

    Parameters
    ----------
    fname_database : str
        Path of the database file.

    name_input, name_output : List[str]
        Column names to extract as inputs and outputs.

    Returns
    -------
    X_raw, Y_raw : np.ndarray
        The input and output columns, in the order of the given names.

    NAME_DATA : List[str]
        All column names found in the header.
    '''
    with open(fname_database, 'r') as f:
        raw_lines = f.readlines()

    # The first header token is not a column name
    column_names = raw_lines[0].split()[1:]

    # Blank lines produce an empty token list and are skipped
    rows = [
        [float(token) for token in tokens]
        for tokens in map(str.split, raw_lines[1:])
        if tokens
    ]
    table = np.array(rows)

    input_columns = [column_names.index(name) for name in name_input]
    output_columns = [column_names.index(name) for name in name_output]

    return table[:, input_columns], table[:, output_columns], column_names

def get_OHT_datasets(num_total_samples=None, ratio_train_samples=0.8, 
                        seed=None, gpu_id=None, scale_x=False, fname_database=FNAME_DATABASE) -> Tuple[OHTDataset, OHTDataset | None]:
    '''
    Load the Open Hole Test database and split it randomly into train/test datasets.
    
    Parameters
    ----------
    num_total_samples : int | None
        Number of samples used in total (train + test). If None, or larger
        than the number of samples in the database, all samples are used.
        
    ratio_train_samples : float
        Fraction of the used samples assigned to the training set. It is
        clamped to [0, 1]; the training-set size is rounded down.
        
    seed : int | None
        If not None, NumPy's global random generator is seeded with it
        (`np.random.seed`) before the samples are shuffled.
        
    gpu_id : int | None
        GPU ID passed to `OHTDataset`. If None or a negative integer, use CPU.
        
    scale_x : bool
        If True, scale the input data.

    fname_database : str
        Path of the database file read by `load_database`.

    Returns
    -------
    train_set : OHTDataset
        The training dataset.
        
    test_set : OHTDataset | None
        The testing dataset, or None when `ratio_train_samples` (after
        clamping) is 1.0.
    '''
    
    #* Load the OHT dataset
    
    X_raw, Y_raw, _ = load_database(fname_database)
    num_instances = X_raw.shape[0]

    #* Calculate attributes of the dataset
    # The ranges come from the complete database (not from the sampled subset),
    # so the train and test sets share the same scaling.
    
    x_min = np.min(X_raw, axis=0)
    x_max = np.max(X_raw, axis=0)
    y_min = np.min(Y_raw, axis=0)
    y_max = np.max(Y_raw, axis=0)

    #* Split the dataset [X_raw, Y_raw] into [X_train, Y_train] and [X_test, Y_test]
    
    if seed is not None:
        np.random.seed(seed)
    
    if num_total_samples is None:
        num_total_samples = num_instances
    else:
        num_total_samples = min(num_total_samples, num_instances)
    
    ratio_train_samples = min(1.0, max(0.0, ratio_train_samples))
    num_train_samples = int(num_total_samples * ratio_train_samples)
    
    # Shuffle all instances; the first `num_total_samples` shuffled indexes are
    # used, the leading part for training and the rest for testing.
    indexes = np.random.permutation(num_instances)
    train_indexes = indexes[:num_train_samples]
    test_indexes = indexes[num_train_samples:num_total_samples]
    
    X_train = X_raw[train_indexes]
    Y_train = Y_raw[train_indexes]
    X_test = X_raw[test_indexes]
    Y_test = Y_raw[test_indexes]
    
    #* Create the OHT datasets
    
    train_set = OHTDataset(X_train, Y_train, gpu_id, x_min, x_max, y_min, y_max, scale_x)
    
    if ratio_train_samples < 1.0:
        test_set = OHTDataset(X_test, Y_test, gpu_id, x_min, x_max, y_min, y_max, scale_x)
    else:
        test_set = None

    return train_set, test_set

def prepare_case( 
                batch_size: int=512,
                seed: int|None = None,
                GPU_ID: int = 0,
                max_samples: int = None,
                ) -> dict:
    '''
    Set up everything needed to run one test case.

    Steps:
    - Create the summary folder (`path_summary`) if it does not exist
    - Load the database `FNAME_DATABASE` with scaled inputs
    - Split it into train (80%) and test sets
    - Wrap the training set in a shuffling DataLoader

    Parameters
    ----------
    batch_size : int
        Batch size of the training DataLoader.

    seed : int | None
        Random seed forwarded to `get_OHT_datasets`.

    GPU_ID : int
        GPU ID forwarded to `get_OHT_datasets`.

    max_samples : int | None
        Maximum number of samples used in total (train + test).

    Returns
    -------
    dict
        The datasets, the training DataLoader, the train/test tensors and
        the input/output dimensions.
    '''

    os.makedirs(path_summary, exist_ok=True)

    split_options = dict(
        num_total_samples=max_samples,
        ratio_train_samples=0.8,
        seed=seed,
        gpu_id=GPU_ID,
        scale_x=True,
        fname_database=FNAME_DATABASE,
    )
    train_set, test_set = get_OHT_datasets(**split_options)

    case = {'train_set': train_set, 'test_set': test_set}
    case['dataloader'] = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=False)

    # Direct handles to the full train/test tensors
    case['X_train_tensor'] = train_set.X
    case['y_train_tensor'] = train_set.Y
    case['X_test_tensor'] = test_set.X
    case['y_test_tensor'] = test_set.Y

    case['dim_input'] = train_set.dim_input
    case['dim_output'] = train_set.dim_output

    return case


#* ==============================
#* Functions for parallel processing
#* ==============================

def set_seed(seed: int):
    '''
    Seed NumPy and PyTorch and configure cuDNN for reproducible runs.

    The NumPy global generator and the torch generator are seeded (the CUDA
    generators too when CUDA is available). cuDNN is set to deterministic
    mode with benchmarking off, and is then disabled.
    '''
    #* Random number generators
    np.random.seed(seed)
    torch.manual_seed(seed)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)

    #* cuDNN backend flags
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False

def fname_summary(model_name: str) -> str:
    '''
    Return the path of the summary CSV file of `model_name` inside `path_summary`.
    '''
    csv_name = 'summary-%s-laminate.csv' % model_name
    return os.path.join(path_summary, csv_name)

def assign_gpu(idx, num_gpus):
    """
    Pick a GPU ID for job `idx` in round-robin fashion.

    Returns `idx` modulo `num_gpus`, or 0 when `num_gpus` is 0.
    """
    return idx % num_gpus if num_gpus != 0 else 0

def case_already_run(model_name, i_seed, n_train_sample):
    """
    Check if a specific case has already been run by looking at the summary file.

    A case counts as already run when the summary CSV of the model contains a
    row whose 'seed' column equals `i_seed` and whose 'n_sample' column equals
    `n_train_sample`. Any problem while reading the file is reported on stdout
    and treated as "not run yet".
    
    Args:
        model_name: Name of the model (selects the summary file)
        i_seed: Seed index
        n_train_sample: Number of training samples of the case
        
    Returns:
        bool: True if the case has been run, False otherwise
    """
    fname = fname_summary(model_name)
    
    if not os.path.exists(fname):
        return False
    
    try:
        # Check if file is empty
        if os.path.getsize(fname) == 0:
            return False
            
        # Read the file as text first to check content
        with open(fname, 'r') as f:
            content = f.read().strip()

        # If file is blank or doesn't have CSV format
        if not content or ',' not in content:
            return False
        
        # Use pandas to read the CSV safely with specific settings
        # Note: 'error_bad_lines' was renamed to 'on_bad_lines' in newer pandas versions
        try:
            # First try with the newer parameter name
            df = pd.read_csv(fname, sep=',', skipinitialspace=True, comment='#', 
                         skip_blank_lines=True, on_bad_lines='warn')
        except TypeError:
            # Fall back to older pandas versions
            df = pd.read_csv(fname, sep=',', skipinitialspace=True, comment='#', 
                         skip_blank_lines=True, error_bad_lines=False)
        
        # Check if required columns exist
        if 'n_sample' not in df.columns or 'seed' not in df.columns:
            print(f"Warning: Missing required columns in {fname}")
            return False
        
        # Check if this specific case exists in the results
        mask = (df['seed'] == i_seed) & (df['n_sample'] == n_train_sample)
        # `Series.any()` yields a numpy.bool_; convert it so that every return
        # path gives a plain Python bool
        case_exists = bool(mask.any())
        
        if case_exists:
            print(f"Found existing run for {model_name}, seed={i_seed}")
            
        return case_exists
        
    except Exception as e:
        # Deliberate best-effort: an unreadable summary must not stop the run
        print(f"Error checking if case already run: {e} (file: {fname})")
        # If there's any error reading the file, assume the case hasn't been run
        return False

