{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b4c06ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "from multiprocessing import Pool, cpu_count\n",
    "import math\n",
    "import shutil\n",
    "from scipy.stats import skew, kurtosis # Import for calculating skewness and kurtosis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42a0975b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equipment name -> numeric edge ID (Edge 0 through 8).\n",
    "# The keys are compared against the last '_'-separated token of each source folder name,\n",
    "# so they must stay exactly as written (including the Korean names).\n",
    "EQUIPMENT_MAP = {\n",
    "    'ACSR-OC': 0,\n",
    "    'CNCV-W': 1,\n",
    "    'TFR-CV': 2,\n",
    "    '계기용변압기': 3,  # instrument transformer\n",
    "    '단상유입변압기': 4,  # single-phase oil-immersed transformer\n",
    "    '전력용유입변압기': 5,  # oil-immersed power transformer\n",
    "    '7.2kV배전반': 6,  # 7.2 kV distribution panel\n",
    "    '22.9kV배전반': 7,  # 22.9 kV distribution panel\n",
    "    '25.8kVGIS': 8,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05896da5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_features(data_array):\n",
    "    \"\"\"\n",
    "    Summarise a one-dimensional signal with six statistics.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_array : array-like\n",
    "        One-dimensional sequence of samples. The caller in this notebook passes rows of\n",
    "        7680 values, but nothing in this function depends on that length.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        [mean, standard deviation, maximum, minimum, skewness, kurtosis], in that order.\n",
    "        np.std, skew and kurtosis are all called with their library-default settings.\n",
    "    \"\"\"\n",
    "    average = np.mean(data_array)\n",
    "    spread = np.std(data_array)\n",
    "    peak = np.max(data_array)\n",
    "    trough = np.min(data_array)\n",
    "    asymmetry = skew(data_array)\n",
    "    tailedness = kurtosis(data_array)\n",
    "    return [average, spread, peak, trough, asymmetry, tailedness]\n",
    "\n",
    "def process_file_pair_feature_extraction(file_pair):\n",
    "    \"\"\"\n",
    "    Convert one (CSV, JSON) file pair into a feature matrix, its label and its edge ID.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file_pair : tuple\n",
    "        (csv_path, json_path, edge_id). The CSV must parse to exactly 20 rows x 7680 columns\n",
    "        of numbers; the JSON must contain label -> PD_type.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple or None\n",
    "        (features, label, edge_id) where features is a float32 array of shape (20, 6)\n",
    "        (one row of extract_features output per CSV row) and label is an np.int64.\n",
    "        None is returned when the label is missing, when the CSV does not have shape\n",
    "        (20, 7680), or when any exception occurs (the exception is printed, not raised,\n",
    "        so one bad file does not abort a whole parallel run).\n",
    "    \"\"\"\n",
    "    csv_path, json_path, edge_id = file_pair\n",
    "    try:\n",
    "        # Read the label first: when it is missing the CSV never needs to be parsed.\n",
    "        # 'utf-8-sig' accepts files with or without a leading byte-order mark.\n",
    "        with open(json_path, 'r', encoding='utf-8-sig') as f:\n",
    "            label_content = json.load(f)\n",
    "        label = label_content.get('label', {}).get('PD_type')\n",
    "        if label is None:\n",
    "            return None\n",
    "\n",
    "        # header=None: every row is data. float32 halves memory compared with the float64 default.\n",
    "        df = pd.read_csv(csv_path, header=None, dtype=np.float32)\n",
    "        if df.shape != (20, 7680):\n",
    "            return None\n",
    "\n",
    "        # extract_features is called once per row; result_type='expand' spreads the six\n",
    "        # returned values over columns, so to_numpy() yields a (20, 6) array.\n",
    "        features = df.apply(extract_features, axis=1, result_type='expand').to_numpy()\n",
    "        return (features.astype(np.float32), np.int64(label), edge_id)\n",
    "    except Exception as e:\n",
    "        # Name the offending files: without them a failure cannot be traced back\n",
    "        # when many pairs are processed by pool workers.\n",
    "        print(f'Error processing {csv_path} / {json_path}: {type(e).__name__}: {e}')\n",
    "        return None\n",
    "\n",
    "def create_dataset_parallel(base_path, is_training=True, chunk_size=10000):\n",
    "    \"\"\"\n",
    "    Build the feature dataset for one split from matched CSV/JSON file pairs.\n",
    "\n",
    "    Source folders are read from ``<base_path>/<split>/01.원천데이터`` and label folders from\n",
    "    ``<base_path>/<split>/02.라벨링데이터``, where ``<split>`` is 'Training' or 'Validation'.\n",
    "    A source folder is used only if the last '_'-separated token of its name is a key of\n",
    "    EQUIPMENT_MAP and the label folder obtained by renaming 'TS_' -> 'TL_' / 'VS_' -> 'VL_' exists.\n",
    "    Pairs are processed in a multiprocessing pool, written as compressed chunk files under\n",
    "    ``<base_path>/processed_chunks/<split>`` (cleared first if it already exists), merged, and\n",
    "    saved as ``X_*.npy``, ``y_*.npy`` and ``edge_ids_*.npy`` under ``<base_path>/processed``.\n",
    "\n",
    "    Folders and files are visited in sorted order and results are collected in task order, so\n",
    "    repeated runs over the same files produce the rows in the same order.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    base_path : str\n",
    "        Parent directory of the 'Training' / 'Validation' folders.\n",
    "    is_training : bool, default True\n",
    "        True selects the 'Training' split, False the 'Validation' split.\n",
    "    chunk_size : int, default 10000\n",
    "        Number of file pairs processed (and held in memory) per intermediate chunk file.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple or None\n",
    "        (X, y, edge_ids) as numpy arrays, or None when no file pair matched or no pair\n",
    "        could be processed.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If chunk_size is smaller than 1.\n",
    "\n",
    "    NOTE(review): the worker function is sent to pool processes; under the 'spawn' start\n",
    "    method, functions defined only in a notebook cell may not be importable by the workers.\n",
    "    Confirm on the target platform.\n",
    "    \"\"\"\n",
    "    if chunk_size < 1:\n",
    "        raise ValueError(f\"chunk_size must be >= 1, got {chunk_size}\")\n",
    "\n",
    "    data_type = 'Training' if is_training else 'Validation'\n",
    "    source_base_dir = os.path.join(base_path, data_type, '01.원천데이터')\n",
    "    label_base_dir = os.path.join(base_path, data_type, '02.라벨링데이터')\n",
    "\n",
    "    print(f\"--- Starting {data_type} data processing (Feature Extraction Version) ---\")\n",
    "\n",
    "    # 1. Collect (csv_path, json_path, edge_id) tasks\n",
    "    tasks = []\n",
    "    # Sorted so the task list does not depend on the order os.listdir happens to return.\n",
    "    source_folders = sorted(\n",
    "        d for d in os.listdir(source_base_dir)\n",
    "        if os.path.isdir(os.path.join(source_base_dir, d))\n",
    "    )\n",
    "\n",
    "    for source_folder_name in source_folders:\n",
    "        equipment_name = source_folder_name.split('_')[-1]\n",
    "        if equipment_name not in EQUIPMENT_MAP:\n",
    "            continue\n",
    "        edge_id = EQUIPMENT_MAP[equipment_name]\n",
    "\n",
    "        label_folder_name = source_folder_name.replace('TS_', 'TL_').replace('VS_', 'VL_')\n",
    "        source_folder_path = os.path.join(source_base_dir, source_folder_name)\n",
    "        label_folder_path = os.path.join(label_base_dir, label_folder_name)\n",
    "        if not os.path.exists(label_folder_path):\n",
    "            continue\n",
    "\n",
    "        # Cut off only the trailing extension. str.replace would also remove '.csv' / '.json'\n",
    "        # from the middle of a name, and the path rebuilt below would then not exist.\n",
    "        source_files = {f[:-len('.csv')] for f in os.listdir(source_folder_path) if f.endswith('.csv')}\n",
    "        label_files = {f[:-len('.json')] for f in os.listdir(label_folder_path) if f.endswith('.json')}\n",
    "        for base_filename in sorted(source_files & label_files):\n",
    "            tasks.append((os.path.join(source_folder_path, f\"{base_filename}.csv\"),\n",
    "                          os.path.join(label_folder_path, f\"{base_filename}.json\"),\n",
    "                          edge_id))\n",
    "\n",
    "    if not tasks:\n",
    "        print(f\"No matched file pairs found for {data_type}.\")\n",
    "        return\n",
    "\n",
    "    # 2. Create a fresh directory for the intermediate chunk files\n",
    "    temp_chunk_dir = os.path.join(base_path, 'processed_chunks', data_type)\n",
    "    if os.path.exists(temp_chunk_dir):\n",
    "        print(f\"'{temp_chunk_dir}' already exists. Clearing it before starting.\")\n",
    "        shutil.rmtree(temp_chunk_dir)\n",
    "    os.makedirs(temp_chunk_dir)\n",
    "    print(f\"Intermediate chunk files will be stored in: {temp_chunk_dir}\")\n",
    "\n",
    "    # 3. Process the tasks chunk by chunk\n",
    "    num_chunks = math.ceil(len(tasks) / chunk_size)\n",
    "\n",
    "    print(f\"Found {len(tasks)} file pairs. Processing in {num_chunks} chunks of size {chunk_size}...\")\n",
    "\n",
    "    chunk_files = []  # chunk files actually written, in processing order\n",
    "    for i in range(num_chunks):\n",
    "        chunk_tasks = tasks[i * chunk_size : (i + 1) * chunk_size]\n",
    "\n",
    "        with Pool(processes=cpu_count()) as pool:\n",
    "            # imap (ordered) keeps results in task order so the output is reproducible.\n",
    "            chunk_results = list(tqdm(pool.imap(process_file_pair_feature_extraction, chunk_tasks),\n",
    "                                      total=len(chunk_tasks),\n",
    "                                      desc=f\"Processing chunk {i+1}/{num_chunks}\"))\n",
    "\n",
    "        # Workers return None for pairs that were skipped or failed.\n",
    "        valid_results = [res for res in chunk_results if res is not None]\n",
    "\n",
    "        if valid_results:\n",
    "            temp_filename = os.path.join(temp_chunk_dir, f\"chunk_{i}.npz\")\n",
    "            chunk_data, chunk_labels, chunk_edge_ids = zip(*valid_results)\n",
    "            np.savez_compressed(temp_filename,\n",
    "                                data=np.array(chunk_data),\n",
    "                                labels=np.array(chunk_labels),\n",
    "                                edge_ids=np.array(chunk_edge_ids, dtype=np.int64))\n",
    "            chunk_files.append(temp_filename)\n",
    "\n",
    "    # 4. Merge the chunk files that were written, in the order they were written\n",
    "    print(\"\\nAll chunks processed. Merging temporary files into final dataset...\")\n",
    "    all_data = []\n",
    "    all_labels = []\n",
    "    all_edge_ids = []\n",
    "\n",
    "    for temp_file in tqdm(chunk_files, desc=\"Merging chunks\"):\n",
    "        with np.load(temp_file) as loaded:\n",
    "            all_data.append(loaded['data'])\n",
    "            all_labels.append(loaded['labels'])\n",
    "            all_edge_ids.append(loaded['edge_ids'])\n",
    "\n",
    "    if not all_data:\n",
    "        print(f\"No data could be successfully processed for {data_type}.\")\n",
    "        return\n",
    "\n",
    "    X = np.concatenate(all_data, axis=0)\n",
    "    y = np.concatenate(all_labels, axis=0)\n",
    "    edge_ids = np.concatenate(all_edge_ids, axis=0)\n",
    "\n",
    "    # 5. Save the final arrays\n",
    "    print(\"\\nMerge complete. Saving final arrays...\")\n",
    "    output_dir = os.path.join(base_path, 'processed')\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "    split_suffix = 'train' if is_training else 'val'\n",
    "    np.save(os.path.join(output_dir, f'X_{split_suffix}.npy'), X)\n",
    "    np.save(os.path.join(output_dir, f'y_{split_suffix}.npy'), y)\n",
    "    np.save(os.path.join(output_dir, f'edge_ids_{split_suffix}.npy'), edge_ids)\n",
    "\n",
    "    print(f\"\\nSuccessfully created final dataset for {data_type}.\")\n",
    "    print(f\"Final data shape (X): {X.shape}, Label shape (y): {y.shape}\")\n",
    "    print(f\"Saved to {output_dir}\")\n",
    "\n",
    "    # 6. The chunk directory is intentionally left on disk; it can be removed afterwards\n",
    "    #    with shutil.rmtree(temp_chunk_dir) once the final arrays have been checked.\n",
    "\n",
    "    print(\"-\" * 20)\n",
    "    return X, y, edge_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36eb1b7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Script execution ---\n",
    "if __name__ == '__main__':\n",
    "    # !!! IMPORTANT !!!\n",
    "    # Point DATASET_BASE_PATH at the parent directory that holds the dataset, e.g.\n",
    "    # C:/Users/YourName/Desktop/Partial_Discharge_Dataset\n",
    "    # The empty string makes every path relative to the current working directory.\n",
    "    DATASET_BASE_PATH = ''\n",
    "\n",
    "    # Build the Training dataset first (True), then the Validation dataset (False).\n",
    "    for build_training_split in (True, False):\n",
    "        create_dataset_parallel(DATASET_BASE_PATH, is_training=build_training_split)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DuoGAT",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
