{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert COMSOL data to PARC data\n",
    "\n",
    "This notebook is used to convert COMSOL simulation data into tensors suitable for deep learning.\n",
    "It aggregates the data from a parametric sweep into tensors of shape (N_trajectories, N_timesteps, X, Y) for \n",
    "each feature.\n",
    "\n",
    "The input data (exported from COMSOL) is in the form of a csv file with the following columns:\n",
    "\n",
    "x0 | y0 | t1 | t2 | t3 | ... | tN\n",
    "\n",
    "x0 | y1 | t1 | t2 | t3 | ... | tN\n",
    "\n",
    "x0 | y2 | t1 | t2 | t3 | ... | tN\n",
    "\n",
    "x1 | y0 | t1 | t2 | t3 | ... | tN\n",
    "\n",
    "x1 | y1 | t1 | t2 | t3 | ... | tN\n",
    "\n",
    "x1 | y2 | t1 | t2 | t3 | ... | tN\n",
    "\n",
    "...\n",
    "\n",
    "xM | yN | t1 | t2 | t3 | ... | tN\n",
    "\n",
    "where t1, t2, t3, ... are the time steps.\n",
    "\n",
    "The tensor data is then stored as hdf5 files with the [format used by the well](https://polymathic-ai.org/the_well/data_format/).\n",
    "\n",
    "Depending on the simulation, different fields and parameters must be stored.\n",
    "If the geometry is randomized (e.g. different obstacles for fluid flow), it is considered as initial conditions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "from typing import Dict, Tuple, Any, Optional\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import h5py\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def search_comsol_data(data_path: Path, params: list[str], fields: list[str], seed_name: Optional[int]=None) -> Dict[Tuple[int, int, float], Dict[str, Any]]:\n",
    "    \"\"\"\n",
    "    Search through velocities, pressure, and phase_boundary folders to find and group\n",
    "    files with matching parameter combinations.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_path : Path\n",
    "        Base path containing the subdirectories\n",
    "    params : list[str]\n",
    "        List of parameter names to extract from the filenames (e.g. strucID, p_cap, theta, reynolds)\n",
    "    fields : list[str]\n",
    "        List of field names (subdirs in folder), e.g. vel_x, vel_y, pressure\n",
    "    seed_name: str\n",
    "        Name of the random seed variable, if present\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Dict[Tuple[int, int, float], Dict[str, Any]]\n",
    "        Dictionary with parameter combinations as keys and file information as values.\n",
    "        The keys are tuples of (p_cap, theta) and values contain paths to field files.\n",
    "    \"\"\"\n",
    "    data = {}\n",
    "\n",
    "    # Function to extract parameters from filename\n",
    "    def extract_params(filename: str, params: list[str]) -> dict:\n",
    "        param_values = {}\n",
    "        for param in params:\n",
    "            param_value = re.search(rf\"{param}_(-?\\d+\\.?\\d*)\", filename).group(1)\n",
    "            # Convert parameter values to appropriate types based on parameter name\n",
    "            if param.endswith('theta'):  # Angle parameters\n",
    "                param_values[param] = float(round(np.rad2deg(float(param_value)), 2))\n",
    "            else:  # Default to float for other parameters\n",
    "                param_values[param] = float(param_value)\n",
    "\n",
    "        return param_values\n",
    "\n",
    "    # Search through each subdirectory\n",
    "    for subdir in fields:\n",
    "        subdir_path = data_path / subdir\n",
    "        if not subdir_path.exists():\n",
    "            print(f\"Warning: Directory {subdir} not found\")\n",
    "            continue\n",
    "\n",
    "        files = sorted(subdir_path.glob(\"*.csv\"))\n",
    "        for file in files:\n",
    "            # Extract parameters\n",
    "            # print(f\"Processing: {file}\")\n",
    "            param_values = extract_params(file.stem, params)\n",
    "            # Create parameter combination key from all params except strucID\n",
    "            # strucID is not included since we want to aggregate over all strucIDs\n",
    "            param_key = tuple(param_values[p] for p in params if p != seed_name)\n",
    "\n",
    "            # Initialize nested dictionaries if they don't exist\n",
    "            if param_key not in data:\n",
    "                data[param_key] = {field: {} for field in fields}\n",
    "\n",
    "\n",
    "            # Add file path under appropriate category\n",
    "            for field in fields:\n",
    "                if field in str(file):\n",
    "                    if seed_name is not None:\n",
    "                        seed = param_values[seed_name]\n",
    "                        data[param_key][field][seed] = file\n",
    "                    else:\n",
    "                        data[param_key][field] = file\n",
    "\n",
    "    return data\n",
    "\n",
    "\n",
    "def load_comsol_csv(file_path: Path, num_timesteps: int) -> tuple[np.ndarray, dict]:\n",
    "    \"\"\"\n",
    "    Load COMSOL CSV data and reshape it into (time, x, y) format.\n",
    "    Also return some metadata about the data.\n",
    "    \"\"\"\n",
    "    # Read CSV and sort by x then y coordinates\n",
    "    print(file_path)\n",
    "    df = pd.read_csv(file_path, sep=\";\", dtype=np.float32, header=None, skiprows=0)\n",
    "    df = df.sort_values(by=[df.columns[0], df.columns[1]]).reset_index(drop=True)\n",
    "\n",
    "    # Get coordinate information\n",
    "    x_coords = df[0].unique()\n",
    "    y_coords = df[1].unique()\n",
    "\n",
    "    # Calculate expected grid size and validate\n",
    "    grid_size = len(x_coords) * len(y_coords)\n",
    "    if len(df) != grid_size:\n",
    "        raise ValueError(\n",
    "            f\"Data grid mismatch: {len(df)} points vs expected {grid_size} ({len(x_coords)}x{len(y_coords)})\",\n",
    "            f\"Occured in file {file_path}\"\n",
    "\n",
    "        )\n",
    "\n",
    "    # Reshape directly to (x, y, time) then transpose to (time, x, y)\n",
    "    time_steps = df.columns[2:]\n",
    "    time_data = df[time_steps].values\n",
    "    data = time_data.reshape(len(x_coords), len(y_coords), len(time_steps))\n",
    "    if data.shape[2] != num_timesteps:\n",
    "        print(f\"Number of timesteps mismatch: {data.shape[2]} vs expected {num_timesteps}\")\n",
    "        return None, None\n",
    "    data = data.transpose(2, 0, 1)\n",
    "    \n",
    "    metadata = {\n",
    "        \"x_coords\": x_coords,\n",
    "        \"y_coords\": y_coords,\n",
    "        \"time_steps\": np.arange(len(time_steps)),\n",
    "    }\n",
    "\n",
    "    return data, metadata\n",
    "\n",
    "def load_comsol_csv_grid(file_path: Path) -> tuple[np.ndarray, dict]:\n",
    "    \"\"\"\n",
    "    Load COMSOL CSV data and reshape it into (time, x, y) format.\n",
    "    Also return some metadata about the data.\n",
    "\n",
    "    This function expects the grid layout of comsol outputs\n",
    "    \"\"\"\n",
    "\n",
    "    with open(file_path) as f:\n",
    "\n",
    "        # Get coordinate information\n",
    "        # Can't index file object like a list, need to read lines\n",
    "        lines = f.readlines()\n",
    "\n",
    "    if \"vel_y\" in str(file_path):\n",
    "        delim = \",\"\n",
    "    else:\n",
    "        delim = \";\"\n",
    "    x_coords = np.loadtxt([lines[9]], delimiter=delim, dtype=np.float32)\n",
    "    y_coords = np.loadtxt([lines[10]], delimiter=delim, dtype=np.float32)\n",
    "\n",
    "    # Find the line indices where blocks start (lines containing \"% Data\")\n",
    "    block_start_indices = []\n",
    "    for i, line in enumerate(lines):\n",
    "        if \"% Data\" in line:\n",
    "            block_start_indices.append(i)\n",
    "    \n",
    "    # Add the end of file as the last boundary\n",
    "    block_start_indices.append(len(lines))\n",
    "    \n",
    "    # Create blocks by chunking the lines\n",
    "    blocks = []\n",
    "    for i in range(len(block_start_indices) - 1):\n",
    "        start_idx = block_start_indices[i] + 2 # use +2 to get the first line with data\n",
    "        end_idx = block_start_indices[i + 1]\n",
    "        block = lines[start_idx:end_idx]\n",
    "        blocks.append(block)\n",
    "    \n",
    "    # Now we have the file chunked into blocks separated by \"% Data\" markers\n",
    "    data = []\n",
    "    for block in blocks:\n",
    "        arr = np.loadtxt(block, delimiter=delim, dtype=np.float32)\n",
    "        data.append(arr)\n",
    "    time_data = np.array(data)\n",
    "    # transpose x, y\n",
    "    time_data  = np.transpose(time_data, (0,2,1))\n",
    "\n",
    "\n",
    "    time_steps = len(blocks)\n",
    "    \n",
    "    metadata = {\n",
    "        \"x_coords\": x_coords,\n",
    "        \"y_coords\": y_coords,\n",
    "        \"time_steps\": np.arange(time_steps),\n",
    "    }\n",
    "\n",
    "    return time_data, metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_hdf5_dataset_porous_media(output_path: Path, param_key: tuple, data_dict: dict, split_id: int):\n",
    "    \"\"\"\n",
    "    Create HDF5 file with the specified format.\n",
    "    \"\"\"\n",
    "    p_cap, theta = param_key\n",
    "    filename = f\"porous_twophase_flow_p_cap_{p_cap}_theta_{theta}_split_{split_id}.hdf5\"\n",
    "\n",
    "    with h5py.File(output_path / filename, \"w\") as f:\n",
    "        # Root attributes\n",
    "        f.attrs[\"simulation_parameters\"] = [\"p_cap\", \"theta\"]\n",
    "        f.attrs[\"p_cap\"] = p_cap\n",
    "        f.attrs[\"theta\"] = theta\n",
    "        f.attrs[\"dataset_name\"] = \"COMSOL_TwoPhaseFlow\"\n",
    "        f.attrs[\"grid_type\"] = \"cartesian\"\n",
    "        f.attrs[\"n_spatial_dims\"] = 2\n",
    "        f.attrs[\"n_trajectories\"] = data_dict[\"n_trajectories\"]\n",
    "\n",
    "        # Load data from first file to get dimensions\n",
    "        x_coords = data_dict[\"x_coords\"]\n",
    "        y_coords = data_dict[\"y_coords\"]\n",
    "        time_steps = data_dict[\"time_steps\"]\n",
    "\n",
    "        # Create dimensions group\n",
    "        dims = f.create_group(\"dimensions\")\n",
    "        dims.attrs[\"spatial_dims\"] = [\"x\", \"y\"]\n",
    "\n",
    "        time_dset = dims.create_dataset(\"time\", data=time_steps)\n",
    "        time_dset.attrs[\"sample_varying\"] = False\n",
    "\n",
    "        x_dset = dims.create_dataset(\"x\", data=x_coords)\n",
    "        x_dset.attrs[\"sample_varying\"] = False\n",
    "        x_dset.attrs[\"time_varying\"] = False\n",
    "\n",
    "        y_dset = dims.create_dataset(\"y\", data=y_coords)\n",
    "        y_dset.attrs[\"sample_varying\"] = False\n",
    "        y_dset.attrs[\"time_varying\"] = False\n",
    "\n",
    "        # Create boundary conditions group\n",
    "        bc = f.create_group(\"boundary_conditions\")\n",
    "        \n",
    "        x_bc = bc.create_group(\"x_open\")\n",
    "        x_bc.attrs[\"associated_dims\"] = [\"x\"]\n",
    "        x_bc.attrs[\"associated_fields\"] = []\n",
    "        x_bc.attrs[\"bc_type\"] = \"open\"\n",
    "        x_bc.attrs[\"sample_varying\"] = False\n",
    "        x_bc.attrs[\"time_varying\"] = False\n",
    "\n",
    "        mask = np.zeros_like(x_coords, dtype=np.bool)\n",
    "        mask[0] = True\n",
    "        mask[-1] = True\n",
    "        x_bc.create_dataset(\"mask\", data=mask)\n",
    "        x_bc.create_dataset(\"values\", data=np.zeros_like(x_coords))\n",
    "\n",
    "        # y-boundary\n",
    "        y_bc = bc.create_group(\"y_wall\")\n",
    "        y_bc.attrs[\"associated_dims\"] = [\"y\"]\n",
    "        y_bc.attrs[\"associated_fields\"] = []\n",
    "        y_bc.attrs[\"bc_type\"] = \"wall\"\n",
    "        y_bc.attrs[\"sample_varying\"] = False\n",
    "        y_bc.attrs[\"time_varying\"] = False\n",
    "        mask = np.zeros_like(y_coords, dtype=np.bool)\n",
    "        mask[0] = True\n",
    "        mask[-1] = True\n",
    "        y_bc.create_dataset(\"mask\", data=mask)\n",
    "        y_bc.create_dataset(\"values\", data=np.zeros_like(y_coords))\n",
    "\n",
    "\n",
    "        # Create scalars group\n",
    "        scalars = f.create_group(\"scalars\")\n",
    "        scalars.attrs[\"field_names\"] = [\"p_cap\", \"theta\"]\n",
    "\n",
    "        p_cap_dset = scalars.create_dataset(\"p_cap\", data=p_cap)\n",
    "        p_cap_dset.attrs[\"sample_varying\"] = False\n",
    "        p_cap_dset.attrs[\"time_varying\"] = False\n",
    "\n",
    "        theta_dset = scalars.create_dataset(\"theta\", data=theta)\n",
    "        theta_dset.attrs[\"sample_varying\"] = False\n",
    "        theta_dset.attrs[\"time_varying\"] = False\n",
    "\n",
    "        # Create t0_fields group for pressure\n",
    "        t0_fields = f.create_group(\"t0_fields\")\n",
    "        t0_fields.attrs[\"field_names\"] = [\"pressure\", \"density\"]\n",
    "\n",
    "        # Load and store pressure field\n",
    "        pressure_dset = t0_fields.create_dataset(\n",
    "            \"pressure\", data=data_dict[\"pressure\"]\n",
    "        )\n",
    "        pressure_dset.attrs[\"dim_varying\"] = [True, True]\n",
    "        pressure_dset.attrs[\"sample_varying\"] = True\n",
    "        pressure_dset.attrs[\"time_varying\"] = True\n",
    "\n",
    "        density_dset = t0_fields.create_dataset(\n",
    "            \"density\", data=data_dict[\"phase_boundary\"]\n",
    "        )\n",
    "        density_dset.attrs[\"dim_varying\"] = [True, True]\n",
    "        density_dset.attrs[\"sample_varying\"] = True\n",
    "        density_dset.attrs[\"time_varying\"] = True\n",
    "\n",
    "        # Create t1_fields group for velocities\n",
    "        t1_fields = f.create_group(\"t1_fields\")\n",
    "        t1_fields.attrs[\"field_names\"] = [\"velocity\"]\n",
    "\n",
    "        # Load velocity components\n",
    "        velocity_dset = t1_fields.create_dataset(\n",
    "            \"velocity\", data=data_dict[\"velocity\"]\n",
    "        )\n",
    "        velocity_dset.attrs[\"dim_varying\"] = [True, True]\n",
    "        velocity_dset.attrs[\"sample_varying\"] = True\n",
    "        velocity_dset.attrs[\"time_varying\"] = True\n",
    "\n",
    "        # Create empty t2_fields group\n",
    "        t2_fields = f.create_group(\"t2_fields\")\n",
    "        t2_fields.attrs[\"field_names\"] = []\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data\n",
    "raw_data_path = Path(r\"data/raw_comsol\")\n",
    "data_path = Path(r\"data/datasets/twophase_flow/data\")\n",
    "data_path.mkdir(parents=True, exist_ok=True)\n",
    "param_names = [\"p_cap\", \"theta\", \"strucID\"]\n",
    "seed_name = \"strucID\"\n",
    "fields = [\"vel_x\", \"vel_y\", \"phase_boundary\", \"pressure\"]\n",
    "\n",
    "# if larger than 1, split one hdf5 file into multiple\n",
    "split = -1\n",
    "\n",
    "data_files: dict[tuple[int, int], dict[str, dict[int, Path]]] = search_comsol_data(raw_data_path, param_names, fields, seed_name)\n",
    "\n",
    "for params_key, features in data_files.items():\n",
    "    print(\"Loading data for\", params_key)\n",
    "    data = {}\n",
    "    data[\"params\"] = params_key\n",
    "    for feature_name, trajectories in features.items():\n",
    "        print(\"\\tLoading\", feature_name)\n",
    "        trajectory_data = []\n",
    "        metadata_list = []\n",
    "        for trajectory_id, file_path in trajectories.items():\n",
    "            feature_array, metadata = load_comsol_csv(file_path, num_timesteps = 401)\n",
    "            if feature_array is None and metadata is None:\n",
    "                continue\n",
    "            trajectory_data.append(feature_array)\n",
    "            metadata_list.append(metadata)\n",
    "\n",
    "\n",
    "        if len(trajectory_data) == 0:\n",
    "            continue\n",
    "        trajectory_data = np.stack(trajectory_data, axis=0) # (n_trajectories, n_timesteps, x, y)\n",
    "        data[feature_name] = trajectory_data\n",
    "    if len(data) == 0:\n",
    "        continue\n",
    "\n",
    "    # Add metadata from the last trajectory\n",
    "    data[\"n_trajectories\"] = trajectory_data.shape[0]\n",
    "    data[\"x_coords\"] = metadata_list[-1][\"x_coords\"]\n",
    "    data[\"y_coords\"] = metadata_list[-1][\"y_coords\"]\n",
    "    data[\"time_steps\"] = metadata_list[-1][\"time_steps\"]\n",
    "\n",
    "    # join vel_u and vel_v\n",
    "    data[\"velocity\"] = np.stack([data[\"vel_x\"], data[\"vel_y\"]], axis=-1)\n",
    "    # remove vel_u and vel_v\n",
    "    del data[\"vel_x\"]\n",
    "    del data[\"vel_y\"]\n",
    "\n",
    "    if split > 1:\n",
    "        num_traj = data[\"n_trajectories\"] // split\n",
    "        for i in range(split):\n",
    "            part_data = {\n",
    "                \"params\": data[\"params\"],\n",
    "                \"x_coords\": data[\"x_coords\"],\n",
    "                \"y_coords\": data[\"y_coords\"],\n",
    "                \"time_steps\": data[\"time_steps\"]\n",
    "            }\n",
    "            \n",
    "            # Split each feature array\n",
    "            start_idx = i * num_traj\n",
    "            \n",
    "            # For the last split, take all remaining trajectories\n",
    "            if i == split - 1:\n",
    "                end_idx = data[\"n_trajectories\"]\n",
    "                part_data[\"n_trajectories\"] = end_idx - start_idx\n",
    "            else:\n",
    "                end_idx = (i + 1) * num_traj\n",
    "                part_data[\"n_trajectories\"] = num_traj\n",
    "            \n",
    "            for feature_name in [\"pressure\", \"rho\", \"temp\", \"velocity\"]:\n",
    "                part_data[feature_name] = data[feature_name][start_idx:end_idx]\n",
    "                \n",
    "            # Create HDF5 file for this part\n",
    "            create_hdf5_dataset_porous_media(data_path, params_key, part_data, i)\n",
    "            print(f\"Created {params_key} part {i+1}/{split} hdf5 file\")\n",
    "\n",
    "    else:\n",
    "        create_hdf5_dataset_porous_media(data_path, params_key, data, split_id = -1)\n",
    "        print(f\"Created {params_key} hdf5 file\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "import random\n",
    "from pathlib import Path\n",
    "\n",
    "# data_path = Path(\"C:/Users/zsa8rk/Coding/Large-Physics-Foundation-Model/data/datasets/cooled_object_pipe_flow_air/data\")\n",
    "def split_datasets(data_path: Path, train_ratio: float = 0.8, val_ratio: float = 0.1, test_ratio: float = 0.1):\n",
    "    \"\"\"Split hdf5 files into train/val/test directories.\n",
    "    \n",
    "    Parameters\n",
    "    ----------\n",
    "    data_path : Path\n",
    "        Path to directory containing hdf5 files\n",
    "    train_ratio : float, optional\n",
    "        Ratio of files to use for training, by default 0.8\n",
    "    val_ratio : float, optional\n",
    "        Ratio of files to use for validation, by default 0.1\n",
    "    test_ratio : float, optional\n",
    "        Ratio of files to use for testing, by default 0.1\n",
    "    \"\"\"\n",
    "    # Create subdirectories\n",
    "    train_dir = data_path / \"train\"\n",
    "    val_dir = data_path / \"valid\" \n",
    "    test_dir = data_path / \"test\"\n",
    "    \n",
    "    for dir in [train_dir, val_dir, test_dir]:\n",
    "        dir.mkdir(exist_ok=True)\n",
    "    \n",
    "    # Get list of hdf5 files\n",
    "    hdf5_files = list(data_path.glob(\"*.hdf5\"))\n",
    "    \n",
    "    # Shuffle files\n",
    "    random.shuffle(hdf5_files)\n",
    "    \n",
    "    # Calculate split indices\n",
    "    n_files = len(hdf5_files)\n",
    "    n_train = int(n_files * train_ratio)\n",
    "    n_val = int(n_files * val_ratio)\n",
    "    \n",
    "    # Split files\n",
    "    train_files = hdf5_files[:n_train]\n",
    "    val_files = hdf5_files[n_train:n_train + n_val]\n",
    "    test_files = hdf5_files[n_train + n_val:]\n",
    "    \n",
    "    # Move files to respective directories\n",
    "    for file in train_files:\n",
    "        shutil.move(file, train_dir / file.name)\n",
    "    \n",
    "    for file in val_files:\n",
    "        shutil.move(file, val_dir / file.name)\n",
    "        \n",
    "    for file in test_files:\n",
    "        shutil.move(file, test_dir / file.name)\n",
    "    print(f\"Split {n_files} files into:\")\n",
    "    print(f\"Train: {len(train_files)} files\")\n",
    "    print(f\"Validation: {len(val_files)} files\") \n",
    "    print(f\"Test: {len(test_files)} files\")\n",
    "\n",
    "# Split the datasets\n",
    "split_datasets(data_path)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
