{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "def convert_tcga_format_fixed(column_name):\n",
        "    \"\"\"\n",
        "    Convert TCGA format from 'tcga3za93z01' to 'TCGA.3Z.A93Z.01'\n",
        "    Correct pattern analysis:\n",
        "    tcga3za93z01 (12 chars) -> TCGA.3Z.A93Z.01 (15 chars)\n",
        "    \n",
        "    Breaking down the 12-character input:\n",
        "    - tcga (4 chars) -> TCGA.\n",
        "    - 3z (2 chars) -> 3Z.\n",
        "    - a (1 char) -> A\n",
        "    - 93z (3 chars) -> 93Z.\n",
        "    - 01 (2 chars) -> 01\n",
        "    \"\"\"\n",
        "    if column_name.startswith('tcga') and len(column_name) == 12:\n",
        "        # Convert to uppercase first\n",
        "        upper_col = column_name.upper()\n",
        "        \n",
        "        # Correct segmentation based on actual pattern\n",
        "        tcga_part = upper_col[0:4]      # 'TCGA'\n",
        "        project = upper_col[4:6]        # '3Z'\n",
        "        tss = upper_col[6:7]            # 'A'\n",
        "        participant = upper_col[7:10]   # '93Z' (3 chars, not 4!)\n",
        "        sample = upper_col[10:12]       # '01'\n",
        "        \n",
        "        # Reconstruct with proper dots\n",
        "        return f\"{tcga_part}.{project}.{tss}{participant}.{sample}\"\n",
        "    \n",
        "    return column_name\n",
        "\n",
        "# Test with known examples\n",
        "test_cases = [\n",
        "    ('tcga3za93z01', 'TCGA.3Z.A93Z.01'),\n",
        "    ('tcga6daa2e01', 'TCGA.6D.AA2E.01'),\n",
        "    ('tcgaa3335701', 'TCGA.A3.3357.01')\n",
        "]\n",
        "\n",
        "print(\"Testing FIXED conversion function:\")\n",
        "for input_col, expected in test_cases:\n",
        "    result = convert_tcga_format_fixed(input_col)\n",
        "    match = \"✓\" if result == expected else \"✗\"\n",
        "    print(f\"{match} {input_col} -> {result} (expected: {expected})\")\n",
        "\n",
        "# If all tests pass, re-clean the tables\n",
        "all_passed = all(convert_tcga_format_fixed(input_col) == expected for input_col, expected in test_cases)\n",
        "\n",
        "if all_passed:\n",
        "    print(\"\\n✓ All tests pass! Re-cleaning tables with correct function...\")\n",
        "    \n",
        "    # List of tables to clean\n",
        "    tables_to_clean = [\n",
        "        ('KIRC_CNV_top', '/workdir/KIRC_CNV_top.csv'),\n",
        "        ('KIRC_Methy_top', '/workdir/KIRC_Methy_top.csv'),\n",
        "        ('KIRC_mRNA_top', '/workdir/KIRC_mRNA_top.csv')\n",
        "    ]\n",
        "    \n",
        "    for table_name, file_path in tables_to_clean:\n",
        "        print(f\"\\nRe-processing {table_name}...\")\n",
        "        \n",
        "        # Load the original table\n",
        "        df = pd.read_csv(file_path)\n",
        "        print(f\"  Shape: {df.shape}\")\n",
        "        \n",
        "        # Get current column names\n",
        "        original_columns = df.columns.tolist()\n",
        "        \n",
        "        # Convert column names with fixed function\n",
        "        new_columns = [convert_tcga_format_fixed(col) for col in original_columns]\n",
        "        \n",
        "        # Count changes\n",
        "        changes = sum(1 for orig, new in zip(original_columns, new_columns) if orig != new)\n",
        "        print(f\"  Converted {changes} column names\")\n",
        "        \n",
        "        # Apply new column names\n",
        "        df.columns = new_columns\n",
        "        \n",
        "        # Save cleaned table (overwrite the previous versions)\n",
        "        output_path = f\"/workdir/execution_outputs/{table_name}_column_cleaned.csv\"\n",
        "        df.to_csv(output_path, index=False)\n",
        "        print(f\"  Updated cleaned table: {output_path}\")\n",
        "    \n",
        "    print(\"\\nFinal verification:\")\n",
        "    # Load survival data for verification\n",
        "    survival_df = pd.read_csv(\"/workdir/survival_KIRC.csv\")\n",
        "    survival_samples = set(survival_df['sample_name'].tolist())\n",
        "    \n",
        "    # Check one cleaned table\n",
        "    cnv_cleaned = pd.read_csv(\"/workdir/execution_outputs/KIRC_CNV_top_column_cleaned.csv\")\n",
        "    cnv_patient_cols = [col for col in cnv_cleaned.columns if col.startswith('TCGA')][:10]\n",
        "    \n",
        "    matches = sum(1 for col in cnv_patient_cols if col in survival_samples)\n",
        "    print(f\"Verification: {matches}/{len(cnv_patient_cols)} column names match survival sample names\")\n",
        "    print(\"Sample cleaned column names:\", cnv_patient_cols[:5])\n",
        "    \n",
        "else:\n",
        "    print(\"\\n✗ Tests failed. Need to debug further.\")\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}