{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "79a955c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import pandas as pd\n",
    "import os\n",
    "import shutil\n",
    "\n",
    "def reorganize_data(origin_path: str, sample_sheet_path: str, destination_path: str):\n",
    "    \"\"\"Sort downloaded files into per-case, per-modality folders.\n",
    "\n",
    "    Reads the tab-separated sample sheet, creates one folder per case under\n",
    "    ``destination_path`` (each holding one sub-folder per known modality plus\n",
    "    ``other``), then moves every listed file from\n",
    "    ``origin_path/<File ID>/<File Name>`` into the folder matching its case\n",
    "    and data type.\n",
    "\n",
    "    Args:\n",
    "        origin_path: Directory holding one sub-folder per ``File ID``.\n",
    "        sample_sheet_path: Tab-separated sheet with the columns ``Case ID``,\n",
    "            ``Data Type``, ``File ID`` and ``File Name``.\n",
    "        destination_path: Root of the reorganized tree; created if missing.\n",
    "\n",
    "    Failures on individual folders or files are reported on stdout and the\n",
    "    run continues; the number of files moved is printed at the end.\n",
    "    \"\"\"\n",
    "    # specify the subdirectory name for each modality of interest\n",
    "    type_extension = {\n",
    "        \"Gene Expression Quantification\": \"gene_expression\",\n",
    "        \"Gene Level Copy Number\": \"cnv\",\n",
    "        \"Slide Image\": \"images\",\n",
    "        \"Methylation Beta Value\": \"dna_methylation\"\n",
    "    }\n",
    "    ## Create a directory at the destination\n",
    "    if not os.path.isdir(destination_path):\n",
    "        print(\"Generating new data path.\")\n",
    "        # makedirs also creates missing parent directories\n",
    "        os.makedirs(destination_path)\n",
    "    else:\n",
    "        print(\"New data path already exists.\")\n",
    "    ## In the new directory, add folders for each case\n",
    "    # Read cases\n",
    "    sample_data = pd.read_csv(sample_sheet_path, sep = \"\\t\")\n",
    "    modalities = list(type_extension.values())\n",
    "    cases = get_cases(sample_data)\n",
    "    print(\"Adding case folders.\")\n",
    "    for case in cases:\n",
    "        path = os.path.join(destination_path, case)\n",
    "        print(\"Creating path\", path)\n",
    "        if os.path.isdir(path):\n",
    "            # Left over from an earlier run; any missing sub-folder is\n",
    "            # created right before a file is moved into it.\n",
    "            print(\"Directory %s already exists\" % path)\n",
    "            continue\n",
    "        try:\n",
    "            os.mkdir(path)\n",
    "            mk_modalities_folders(path, modalities)\n",
    "        except OSError:\n",
    "            print(\"Creation of the directory %s failed\" % path)\n",
    "        else:\n",
    "            print(\"Successfully created directory %s \" % path)\n",
    "    count = 0\n",
    "    for index, row in sample_data.iterrows():\n",
    "        assoc_case = row['Case ID'].split(\",\")[0].strip()\n",
    "        assoc_type = row['Data Type']\n",
    "        assoc_file = row['File ID']\n",
    "        assoc_file_name = row['File Name']\n",
    "        source = os.path.join(origin_path, assoc_file, assoc_file_name)\n",
    "        # data types without a dedicated folder go to \"other\"\n",
    "        subfolder = type_extension.get(assoc_type, \"other\")\n",
    "        dest = os.path.join(destination_path, assoc_case, subfolder)\n",
    "        print(\"Moving\", source, \"to\", dest)\n",
    "        try:\n",
    "            # shutil.move treats a non-existing `dest` as the new file name,\n",
    "            # which would rename the file to e.g. \"gene_expression\" and let\n",
    "            # later files overwrite it. Make sure `dest` is a directory first.\n",
    "            os.makedirs(dest, exist_ok=True)\n",
    "            shutil.move(source, dest)\n",
    "            count += 1\n",
    "        except OSError as error:\n",
    "            # shutil.Error is an OSError subclass; Ctrl-C is no longer swallowed\n",
    "            print(\"Error in moving data from original data to new data\")\n",
    "            print(\"Filename: \" + assoc_case + \" - \" + assoc_file_name)\n",
    "            print(\"Reason:\", error)\n",
    "    print(f\"Moved {count} files\")\n",
    "\n",
    "def get_cases(sample_data):\n",
    "    \"\"\"Return the distinct case identifiers found in the sample sheet.\n",
    "\n",
    "    Only the first comma-separated entry of each ``Case ID`` value is kept,\n",
    "    with surrounding whitespace removed. The order of the returned list is\n",
    "    not defined.\n",
    "    \"\"\"\n",
    "    unique_cases = {\n",
    "        raw_id.split(\",\")[0].strip()\n",
    "        for raw_id in sample_data['Case ID'].values.tolist()\n",
    "    }\n",
    "    return list(unique_cases)\n",
    "\n",
    "def mk_modalities_folders(path, modalities):\n",
    "    \"\"\"Create one sub-folder per modality, plus ``other``, inside ``path``.\n",
    "\n",
    "    Folders that already exist are left untouched, so the call can be\n",
    "    repeated on a partially populated case folder.\n",
    "\n",
    "    Args:\n",
    "        path: Case folder that receives the sub-folders; created if missing.\n",
    "        modalities: Iterable of sub-folder names.\n",
    "\n",
    "    Raises:\n",
    "        OSError: If a folder cannot be created, e.g. a regular file with\n",
    "            the same name is in the way.\n",
    "    \"\"\"\n",
    "    for modality in list(modalities) + [\"other\"]:\n",
    "        # exist_ok keeps one pre-existing folder from aborting the rest\n",
    "        os.makedirs(os.path.join(path, modality), exist_ok=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "571b8937",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Driver cell: set the three paths, then reorganize the download in place.\n",
    "# NOTE(review): the leading `...` in each path looks like a placeholder for the\n",
    "# local download location - replace it before running.\n",
    "# origin_path is searched for <File ID>/<File Name> entries from the sample sheet.\n",
    "origin_path = \".../TCGA/\"\n",
    "sample_sheet_path = \".../TCGA/gdc_sample_sheet.DATE.tsv\" #DATE -> the date when you download the data, so it varies per download\n",
    "# destination_path is created if missing and receives one folder per case.\n",
    "destination_path = \".../TCGA/data_by_cases\"\n",
    "# Files are moved (not copied) out of origin_path by this call.\n",
    "reorganize_data(origin_path = origin_path, sample_sheet_path = sample_sheet_path, destination_path = destination_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7206581c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
