{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the liver dataset table from the working directory.\n",
    "df = pd.read_csv(filepath_or_buffer='liver_dataset.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>train</th>\n",
       "      <th>slice_id</th>\n",
       "      <th>image</th>\n",
       "      <th>mask</th>\n",
       "      <th>liver</th>\n",
       "      <th>cancer</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>26529</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_0.npy</td>\n",
       "      <td>liver_images/liver_97_0.npy</td>\n",
       "      <td>liver_images/liver_97_0.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>26530</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_1.npy</td>\n",
       "      <td>liver_images/liver_97_1.npy</td>\n",
       "      <td>liver_images/liver_97_1.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26531</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_2.npy</td>\n",
       "      <td>liver_images/liver_97_2.npy</td>\n",
       "      <td>liver_images/liver_97_2.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>26532</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_3.npy</td>\n",
       "      <td>liver_images/liver_97_3.npy</td>\n",
       "      <td>liver_images/liver_97_3.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>26533</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_4.npy</td>\n",
       "      <td>liver_images/liver_97_4.npy</td>\n",
       "      <td>liver_images/liver_97_4.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58633</th>\n",
       "      <td>85162</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_496.npy</td>\n",
       "      <td>liver_images/liver_10_496.npy</td>\n",
       "      <td>liver_images/liver_10_496.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58634</th>\n",
       "      <td>85163</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_497.npy</td>\n",
       "      <td>liver_images/liver_10_497.npy</td>\n",
       "      <td>liver_images/liver_10_497.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58635</th>\n",
       "      <td>85164</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_498.npy</td>\n",
       "      <td>liver_images/liver_10_498.npy</td>\n",
       "      <td>liver_images/liver_10_498.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58636</th>\n",
       "      <td>85165</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_499.npy</td>\n",
       "      <td>liver_images/liver_10_499.npy</td>\n",
       "      <td>liver_images/liver_10_499.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58637</th>\n",
       "      <td>85166</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_500.npy</td>\n",
       "      <td>liver_images/liver_10_500.npy</td>\n",
       "      <td>liver_images/liver_10_500.npy</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>58638 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Unnamed: 0.1  Unnamed: 0  train          slice_id  \\\n",
       "0             26529           0      1    liver_97_0.npy   \n",
       "1             26530           0      1    liver_97_1.npy   \n",
       "2             26531           0      1    liver_97_2.npy   \n",
       "3             26532           0      1    liver_97_3.npy   \n",
       "4             26533           0      1    liver_97_4.npy   \n",
       "...             ...         ...    ...               ...   \n",
       "58633         85162           0      1  liver_10_496.npy   \n",
       "58634         85163           0      1  liver_10_497.npy   \n",
       "58635         85164           0      1  liver_10_498.npy   \n",
       "58636         85165           0      1  liver_10_499.npy   \n",
       "58637         85166           0      1  liver_10_500.npy   \n",
       "\n",
       "                               image                           mask  liver  \\\n",
       "0        liver_images/liver_97_0.npy    liver_images/liver_97_0.npy    0.0   \n",
       "1        liver_images/liver_97_1.npy    liver_images/liver_97_1.npy    0.0   \n",
       "2        liver_images/liver_97_2.npy    liver_images/liver_97_2.npy    0.0   \n",
       "3        liver_images/liver_97_3.npy    liver_images/liver_97_3.npy    0.0   \n",
       "4        liver_images/liver_97_4.npy    liver_images/liver_97_4.npy    0.0   \n",
       "...                              ...                            ...    ...   \n",
       "58633  liver_images/liver_10_496.npy  liver_images/liver_10_496.npy    0.0   \n",
       "58634  liver_images/liver_10_497.npy  liver_images/liver_10_497.npy    0.0   \n",
       "58635  liver_images/liver_10_498.npy  liver_images/liver_10_498.npy    0.0   \n",
       "58636  liver_images/liver_10_499.npy  liver_images/liver_10_499.npy    0.0   \n",
       "58637  liver_images/liver_10_500.npy  liver_images/liver_10_500.npy    0.0   \n",
       "\n",
       "       cancer  \n",
       "0         0.0  \n",
       "1         0.0  \n",
       "2         0.0  \n",
       "3         0.0  \n",
       "4         0.0  \n",
       "...       ...  \n",
       "58633     0.0  \n",
       "58634     0.0  \n",
       "58635     0.0  \n",
       "58636     0.0  \n",
       "58637     0.0  \n",
       "\n",
       "[58638 rows x 8 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>patient_id</th>\n",
       "      <th>breast_density</th>\n",
       "      <th>left or right breast</th>\n",
       "      <th>image view</th>\n",
       "      <th>abnormality id</th>\n",
       "      <th>abnormality type</th>\n",
       "      <th>mass shape</th>\n",
       "      <th>mass margins</th>\n",
       "      <th>assessment</th>\n",
       "      <th>pathology</th>\n",
       "      <th>subtlety</th>\n",
       "      <th>image file path</th>\n",
       "      <th>cropped image file path</th>\n",
       "      <th>ROI mask file path</th>\n",
       "      <th>image_exists</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>P_00016</td>\n",
       "      <td>4</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>5</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>5</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>P_00016</td>\n",
       "      <td>4</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>5</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>5</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>P_00017</td>\n",
       "      <td>2</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>ROUND</td>\n",
       "      <td>CIRCUMSCRIBED</td>\n",
       "      <td>4</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>4</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>P_00017</td>\n",
       "      <td>2</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>ROUND</td>\n",
       "      <td>ILL_DEFINED</td>\n",
       "      <td>4</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>4</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>P_00032</td>\n",
       "      <td>3</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>ROUND</td>\n",
       "      <td>OBSCURED</td>\n",
       "      <td>0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>2</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>372</th>\n",
       "      <td>373</td>\n",
       "      <td>P_01825</td>\n",
       "      <td>2</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>LOBULATED</td>\n",
       "      <td>MICROLOBULATED</td>\n",
       "      <td>3</td>\n",
       "      <td>BENIGN_WITHOUT_CALLBACK</td>\n",
       "      <td>3</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>373</th>\n",
       "      <td>374</td>\n",
       "      <td>P_01833</td>\n",
       "      <td>2</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>ILL_DEFINED</td>\n",
       "      <td>5</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>5</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>374</th>\n",
       "      <td>375</td>\n",
       "      <td>P_01865</td>\n",
       "      <td>2</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>ILL_DEFINED</td>\n",
       "      <td>4</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>2</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>375</th>\n",
       "      <td>376</td>\n",
       "      <td>P_01912</td>\n",
       "      <td>3</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>5</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>4</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>376</th>\n",
       "      <td>377</td>\n",
       "      <td>P_01912</td>\n",
       "      <td>3</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>5</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>4</td>\n",
       "      <td>processed_files/mass_test_full_mammogram_image...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>processed_files/mass_test_roi_mask_and_cropped...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>377 rows × 16 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0 patient_id  breast_density left or right breast image view  \\\n",
       "0             0    P_00016               4                 LEFT         CC   \n",
       "1             1    P_00016               4                 LEFT        MLO   \n",
       "2             2    P_00017               2                 LEFT         CC   \n",
       "3             3    P_00017               2                 LEFT        MLO   \n",
       "4             4    P_00032               3                RIGHT         CC   \n",
       "..          ...        ...             ...                  ...        ...   \n",
       "372         373    P_01825               2                RIGHT        MLO   \n",
       "373         374    P_01833               2                RIGHT        MLO   \n",
       "374         375    P_01865               2                 LEFT        MLO   \n",
       "375         376    P_01912               3                RIGHT         CC   \n",
       "376         377    P_01912               3                RIGHT        MLO   \n",
       "\n",
       "     abnormality id abnormality type mass shape    mass margins  assessment  \\\n",
       "0                 1             mass  IRREGULAR      SPICULATED           5   \n",
       "1                 1             mass  IRREGULAR      SPICULATED           5   \n",
       "2                 1             mass      ROUND   CIRCUMSCRIBED           4   \n",
       "3                 1             mass      ROUND     ILL_DEFINED           4   \n",
       "4                 1             mass      ROUND        OBSCURED           0   \n",
       "..              ...              ...        ...             ...         ...   \n",
       "372               1             mass  LOBULATED  MICROLOBULATED           3   \n",
       "373               1             mass  IRREGULAR     ILL_DEFINED           5   \n",
       "374               1             mass  IRREGULAR     ILL_DEFINED           4   \n",
       "375               1             mass  IRREGULAR      SPICULATED           5   \n",
       "376               1             mass  IRREGULAR      SPICULATED           5   \n",
       "\n",
       "                   pathology  subtlety  \\\n",
       "0                  MALIGNANT         5   \n",
       "1                  MALIGNANT         5   \n",
       "2                  MALIGNANT         4   \n",
       "3                  MALIGNANT         4   \n",
       "4                     BENIGN         2   \n",
       "..                       ...       ...   \n",
       "372  BENIGN_WITHOUT_CALLBACK         3   \n",
       "373                MALIGNANT         5   \n",
       "374                MALIGNANT         2   \n",
       "375                MALIGNANT         4   \n",
       "376                MALIGNANT         4   \n",
       "\n",
       "                                       image file path  \\\n",
       "0    processed_files/mass_test_full_mammogram_image...   \n",
       "1    processed_files/mass_test_full_mammogram_image...   \n",
       "2    processed_files/mass_test_full_mammogram_image...   \n",
       "3    processed_files/mass_test_full_mammogram_image...   \n",
       "4    processed_files/mass_test_full_mammogram_image...   \n",
       "..                                                 ...   \n",
       "372  processed_files/mass_test_full_mammogram_image...   \n",
       "373  processed_files/mass_test_full_mammogram_image...   \n",
       "374  processed_files/mass_test_full_mammogram_image...   \n",
       "375  processed_files/mass_test_full_mammogram_image...   \n",
       "376  processed_files/mass_test_full_mammogram_image...   \n",
       "\n",
       "                               cropped image file path  \\\n",
       "0    processed_files/mass_test_roi_mask_and_cropped...   \n",
       "1    processed_files/mass_test_roi_mask_and_cropped...   \n",
       "2    processed_files/mass_test_roi_mask_and_cropped...   \n",
       "3    processed_files/mass_test_roi_mask_and_cropped...   \n",
       "4    processed_files/mass_test_roi_mask_and_cropped...   \n",
       "..                                                 ...   \n",
       "372  processed_files/mass_test_roi_mask_and_cropped...   \n",
       "373  processed_files/mass_test_roi_mask_and_cropped...   \n",
       "374  processed_files/mass_test_roi_mask_and_cropped...   \n",
       "375  processed_files/mass_test_roi_mask_and_cropped...   \n",
       "376  processed_files/mass_test_roi_mask_and_cropped...   \n",
       "\n",
       "                                    ROI mask file path  image_exists  \n",
       "0    processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "1    processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "2    processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "3    processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "4    processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "..                                                 ...           ...  \n",
       "372  processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "373  processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "374  processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "375  processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "376  processed_files/mass_test_roi_mask_and_cropped...          True  \n",
       "\n",
       "[377 rows x 16 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean the mammogram test label files: drop the saved index column and the\n",
    "# cropped-image column, then strip the leading directory from each path.\n",
    "LABEL_CSVS = ['labels_mass_mammograms_test.csv', 'labels_calc_mammograms_test.csv']\n",
    "PATH_COLUMNS = ['image file path', 'ROI mask file path']\n",
    "# Off by default: overwriting is not idempotent, because a second run would\n",
    "# fail on the already-dropped columns and would strip another directory.\n",
    "WRITE_BACK = False\n",
    "\n",
    "\n",
    "def strip_first_dir(path):\n",
    "    \"\"\"Return `path` without its first '/'-separated component ('' if it has no '/').\"\"\"\n",
    "    return path.partition('/')[2]\n",
    "\n",
    "\n",
    "def clean_labels(csv_path):\n",
    "    \"\"\"Read one label CSV and return the cleaned table.\n",
    "\n",
    "    Drops 'Unnamed: 0' and 'cropped image file path', and removes the first\n",
    "    path component from every column in PATH_COLUMNS. Missing paths stay NaN.\n",
    "    \"\"\"\n",
    "    labels = pd.read_csv(csv_path).drop(columns=['Unnamed: 0', 'cropped image file path'])\n",
    "    for column in PATH_COLUMNS:\n",
    "        # na_action='ignore' skips NaN, which the previous split/join lambda crashed on.\n",
    "        labels[column] = labels[column].map(strip_first_dir, na_action='ignore')\n",
    "    return labels\n",
    "\n",
    "\n",
    "# Keep every cleaned table; the previous loop discarded all but the last one.\n",
    "cleaned_labels = {csv_path: clean_labels(csv_path) for csv_path in LABEL_CSVS}\n",
    "\n",
    "if WRITE_BACK:\n",
    "    for csv_path, labels in cleaned_labels.items():\n",
    "        labels.to_csv(csv_path, index=False)\n",
    "\n",
    "# Keep `df` bound to the last cleaned file, matching what the original loop left behind.\n",
    "df = cleaned_labels[LABEL_CSVS[-1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>patient_id</th>\n",
       "      <th>breast_density</th>\n",
       "      <th>left or right breast</th>\n",
       "      <th>image view</th>\n",
       "      <th>abnormality id</th>\n",
       "      <th>abnormality type</th>\n",
       "      <th>mass shape</th>\n",
       "      <th>mass margins</th>\n",
       "      <th>assessment</th>\n",
       "      <th>pathology</th>\n",
       "      <th>subtlety</th>\n",
       "      <th>image file path</th>\n",
       "      <th>cropped image file path</th>\n",
       "      <th>ROI mask file path</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>P_00001</td>\n",
       "      <td>3</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR-ARCHITECTURAL_DISTORTION</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>4</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>4</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>P_00001</td>\n",
       "      <td>3</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR-ARCHITECTURAL_DISTORTION</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>4</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>4</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>P_00004</td>\n",
       "      <td>3</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>ARCHITECTURAL_DISTORTION</td>\n",
       "      <td>ILL_DEFINED</td>\n",
       "      <td>4</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>3</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>P_00004</td>\n",
       "      <td>3</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>ARCHITECTURAL_DISTORTION</td>\n",
       "      <td>ILL_DEFINED</td>\n",
       "      <td>4</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>3</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>P_00004</td>\n",
       "      <td>3</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CIRCUMSCRIBED</td>\n",
       "      <td>4</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>5</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1313</th>\n",
       "      <td>1313</td>\n",
       "      <td>P_02033</td>\n",
       "      <td>2</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>ILL_DEFINED</td>\n",
       "      <td>3</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>4</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1314</th>\n",
       "      <td>1314</td>\n",
       "      <td>P_02079</td>\n",
       "      <td>2</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>ROUND</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>3</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>5</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1315</th>\n",
       "      <td>1315</td>\n",
       "      <td>P_02079</td>\n",
       "      <td>2</td>\n",
       "      <td>RIGHT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>ROUND</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>3</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>5</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1316</th>\n",
       "      <td>1316</td>\n",
       "      <td>P_02092</td>\n",
       "      <td>2</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>CC</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>3</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>2</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1317</th>\n",
       "      <td>1317</td>\n",
       "      <td>P_02092</td>\n",
       "      <td>2</td>\n",
       "      <td>LEFT</td>\n",
       "      <td>MLO</td>\n",
       "      <td>1</td>\n",
       "      <td>mass</td>\n",
       "      <td>IRREGULAR</td>\n",
       "      <td>SPICULATED</td>\n",
       "      <td>3</td>\n",
       "      <td>MALIGNANT</td>\n",
       "      <td>2</td>\n",
       "      <td>mass_mammograms/processed_files/1.3.6.1.4.1.95...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "      <td>roi_mask_and_cropped/processed_files/1.3.6.1.4...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1318 rows × 15 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      Unnamed: 0 patient_id  breast_density left or right breast image view  \\\n",
       "0              0    P_00001               3                 LEFT         CC   \n",
       "1              1    P_00001               3                 LEFT        MLO   \n",
       "2              2    P_00004               3                 LEFT         CC   \n",
       "3              3    P_00004               3                 LEFT        MLO   \n",
       "4              4    P_00004               3                RIGHT        MLO   \n",
       "...          ...        ...             ...                  ...        ...   \n",
       "1313        1313    P_02033               2                RIGHT        MLO   \n",
       "1314        1314    P_02079               2                RIGHT         CC   \n",
       "1315        1315    P_02079               2                RIGHT        MLO   \n",
       "1316        1316    P_02092               2                 LEFT         CC   \n",
       "1317        1317    P_02092               2                 LEFT        MLO   \n",
       "\n",
       "      abnormality id abnormality type                          mass shape  \\\n",
       "0                  1             mass  IRREGULAR-ARCHITECTURAL_DISTORTION   \n",
       "1                  1             mass  IRREGULAR-ARCHITECTURAL_DISTORTION   \n",
       "2                  1             mass            ARCHITECTURAL_DISTORTION   \n",
       "3                  1             mass            ARCHITECTURAL_DISTORTION   \n",
       "4                  1             mass                                OVAL   \n",
       "...              ...              ...                                 ...   \n",
       "1313               1             mass                           IRREGULAR   \n",
       "1314               1             mass                               ROUND   \n",
       "1315               1             mass                               ROUND   \n",
       "1316               1             mass                           IRREGULAR   \n",
       "1317               1             mass                           IRREGULAR   \n",
       "\n",
       "       mass margins  assessment  pathology  subtlety  \\\n",
       "0        SPICULATED           4  MALIGNANT         4   \n",
       "1        SPICULATED           4  MALIGNANT         4   \n",
       "2       ILL_DEFINED           4     BENIGN         3   \n",
       "3       ILL_DEFINED           4     BENIGN         3   \n",
       "4     CIRCUMSCRIBED           4     BENIGN         5   \n",
       "...             ...         ...        ...       ...   \n",
       "1313    ILL_DEFINED           3  MALIGNANT         4   \n",
       "1314     SPICULATED           3  MALIGNANT         5   \n",
       "1315     SPICULATED           3  MALIGNANT         5   \n",
       "1316     SPICULATED           3  MALIGNANT         2   \n",
       "1317     SPICULATED           3  MALIGNANT         2   \n",
       "\n",
       "                                        image file path  \\\n",
       "0     mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "1     mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "2     mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "3     mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "4     mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "...                                                 ...   \n",
       "1313  mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "1314  mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "1315  mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "1316  mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "1317  mass_mammograms/processed_files/1.3.6.1.4.1.95...   \n",
       "\n",
       "                                cropped image file path  \\\n",
       "0     roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "1     roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "2     roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "3     roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "4     roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "...                                                 ...   \n",
       "1313  roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "1314  roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "1315  roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "1316  roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "1317  roi_mask_and_cropped/processed_files/1.3.6.1.4...   \n",
       "\n",
       "                                     ROI mask file path  \n",
       "0     roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "1     roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "2     roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "3     roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "4     roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "...                                                 ...  \n",
       "1313  roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "1314  roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "1315  roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "1316  roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "1317  roi_mask_and_cropped/processed_files/1.3.6.1.4...  \n",
       "\n",
       "[1318 rows x 15 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('labels_mass_mammograms.csv')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'mass_mammograms/processed_files/1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515.png'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['image file path'].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1202385 4116106.8127425616 calc_train_roi-mask_and_cropped_image/1.3.6.1.4.1.9590.100.1.2.323173986211744534717152497940376023803\n",
      "Likely no mask for image: calc_train_roi-mask_and_cropped_image/1.3.6.1.4.1.9590.100.1.2.323173986211744534717152497940376023803\n",
      "ROI mask file path\n",
      "False    1545\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from PIL import Image\n",
    "import os \n",
    "\n",
    "df = pd.read_csv('labels_calc_mammograms.csv')\n",
    "df['image file path'] = df['image file path'].str.split('/').apply(lambda x: 'calc_train_full_mammograms/' + x[-1])\n",
    "df['cropped image file path'] = df['cropped image file path'].str.split('/').apply(lambda x: 'calc_train_roi-mask_and_cropped_image/' + x[-1])\n",
    "df['ROI mask file path'] = df['ROI mask file path'].str.split('/').apply(lambda x: 'calc_train_roi-mask_and_cropped_image/' + x[-1])\n",
    "\n",
    "def check_image_pixel_count(path):\n",
    "    width, height = Image.open('images/' + path).size\n",
    "    return width * height\n",
    "\n",
    "threshold = (sum(df['image file path'].apply(check_image_pixel_count))) / len(df) * 0.25\n",
    "\n",
    "\n",
    "def map_to_mask(path, is_mask, prefix):\n",
    "    \n",
    "    path = prefix + path \n",
    "    # print(path)\n",
    "    # print(check_image_pixel_count(path))\n",
    "    #print(path[-7:])\n",
    "    #print('b', path)\n",
    "    path = path[:-8] \n",
    "    # images higher then avg are masks\n",
    "    #print('a', path)\n",
    "    pixel_count = check_image_pixel_count(path + '-1-1.png')\n",
    "    if os.path.exists('images/' + path + '-1-2.png'): # we assume that if 1-2.png exists, one of them could be cropped. There should always be a mask\n",
    "        if pixel_count < threshold and is_mask:\n",
    "            return path + '-1-2.png'\n",
    "        elif pixel_count > threshold and not is_mask:\n",
    "            return path + '-1-2.png'\n",
    "        else:\n",
    "            return path + '-1-1.png'\n",
    "    else:\n",
    "        if pixel_count > threshold and is_mask:\n",
    "            return path + '-1-1.png'\n",
    "        elif not is_mask:\n",
    "            print('no cropped image for: ' + path)\n",
    "            return None\n",
    "        else:\n",
    "            print(pixel_count, threshold, path)\n",
    "            print('Likely no mask for image: ' + path)\n",
    "            return pd.NA\n",
    "\n",
    "df['ROI mask file path'] = df['ROI mask file path'].apply(map_to_mask, args=(True, ''))\n",
    "\n",
    "\n",
    "def check_if_under_threshold(path):\n",
    "    return check_image_pixel_count(path) < threshold\n",
    "\n",
    "# print(df['cropped image file path'].apply(check_if_under_threshold).value_counts()) # this should be all True\n",
    "df.drop(['cropped image file path'], axis=1, inplace=True)\n",
    "df.dropna(subset=['ROI mask file path'], inplace=True)\n",
    "print(df['ROI mask file path'].apply(check_if_under_threshold).value_counts()) # this should be all False\n",
    "df.to_csv('labels_calc_mammograms.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('liver_dataset.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>train</th>\n",
       "      <th>slice_id</th>\n",
       "      <th>image</th>\n",
       "      <th>mask</th>\n",
       "      <th>liver</th>\n",
       "      <th>cancer</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>26529</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_0.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_9...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_9...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>26530</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_1.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_9...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_9...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26531</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_2.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_9...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_9...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>26532</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_3.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_9...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_9...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>26533</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_97_4.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_9...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_9...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58633</th>\n",
       "      <td>85162</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_496.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_1...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_1...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58634</th>\n",
       "      <td>85163</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_497.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_1...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_1...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58635</th>\n",
       "      <td>85164</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_498.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_1...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_1...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58636</th>\n",
       "      <td>85165</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_499.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_1...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_1...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58637</th>\n",
       "      <td>85166</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>liver_10_500.npy</td>\n",
       "      <td>Task03_Liver/imagesTr/processed_images/liver_1...</td>\n",
       "      <td>Task03_Liver/labelsTr/processed_images/liver_1...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>58638 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Unnamed: 0.1  Unnamed: 0  train          slice_id  \\\n",
       "0             26529           0      1    liver_97_0.npy   \n",
       "1             26530           0      1    liver_97_1.npy   \n",
       "2             26531           0      1    liver_97_2.npy   \n",
       "3             26532           0      1    liver_97_3.npy   \n",
       "4             26533           0      1    liver_97_4.npy   \n",
       "...             ...         ...    ...               ...   \n",
       "58633         85162           0      1  liver_10_496.npy   \n",
       "58634         85163           0      1  liver_10_497.npy   \n",
       "58635         85164           0      1  liver_10_498.npy   \n",
       "58636         85165           0      1  liver_10_499.npy   \n",
       "58637         85166           0      1  liver_10_500.npy   \n",
       "\n",
       "                                                   image  \\\n",
       "0      Task03_Liver/imagesTr/processed_images/liver_9...   \n",
       "1      Task03_Liver/imagesTr/processed_images/liver_9...   \n",
       "2      Task03_Liver/imagesTr/processed_images/liver_9...   \n",
       "3      Task03_Liver/imagesTr/processed_images/liver_9...   \n",
       "4      Task03_Liver/imagesTr/processed_images/liver_9...   \n",
       "...                                                  ...   \n",
       "58633  Task03_Liver/imagesTr/processed_images/liver_1...   \n",
       "58634  Task03_Liver/imagesTr/processed_images/liver_1...   \n",
       "58635  Task03_Liver/imagesTr/processed_images/liver_1...   \n",
       "58636  Task03_Liver/imagesTr/processed_images/liver_1...   \n",
       "58637  Task03_Liver/imagesTr/processed_images/liver_1...   \n",
       "\n",
       "                                                    mask  liver  cancer  \n",
       "0      Task03_Liver/labelsTr/processed_images/liver_9...    0.0     0.0  \n",
       "1      Task03_Liver/labelsTr/processed_images/liver_9...    0.0     0.0  \n",
       "2      Task03_Liver/labelsTr/processed_images/liver_9...    0.0     0.0  \n",
       "3      Task03_Liver/labelsTr/processed_images/liver_9...    0.0     0.0  \n",
       "4      Task03_Liver/labelsTr/processed_images/liver_9...    0.0     0.0  \n",
       "...                                                  ...    ...     ...  \n",
       "58633  Task03_Liver/labelsTr/processed_images/liver_1...    0.0     0.0  \n",
       "58634  Task03_Liver/labelsTr/processed_images/liver_1...    0.0     0.0  \n",
       "58635  Task03_Liver/labelsTr/processed_images/liver_1...    0.0     0.0  \n",
       "58636  Task03_Liver/labelsTr/processed_images/liver_1...    0.0     0.0  \n",
       "58637  Task03_Liver/labelsTr/processed_images/liver_1...    0.0     0.0  \n",
       "\n",
       "[58638 rows x 8 columns]"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['image'] = data['image'].str.split('/').apply(lambda x: 'liver_images/' + x[-1])\n",
    "data['mask'] = data['mask'].str.split('/').apply(lambda x: 'liver_images/' + x[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv('liver_dataset.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "mask\n",
       "labelsTr    58638\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['mask'].str.split('/').apply(lambda x: x[1]).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input_csv = 'mass_case_description_test_set.csv'\n",
    "# output_csv = 'labels_mass_mammograms_test.csv'\n",
    "# prefix_full = 'processed_files/mass_test_full_mammogram_image/'\n",
    "# prefix_mask = 'processed_files/mass_test_roi_mask_and_cropped_image/'\n",
    "\n",
    "input_csv = 'calc_case_description_test_set.csv'\n",
    "output_csv = 'labels_calc_mammograms_test.csv'\n",
    "prefix_full = 'processed_files/calc_test_full_mammograms/'\n",
    "prefix_mask = 'processed_files/calc_test_roi-mask_and_cropped_image/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from PIL import Image\n",
    "import os \n",
    "\n",
    "labels = pd.read_csv(input_csv)\n",
    "\n",
    "labels['image file path'] = labels['image file path'].str.split('/').str[-2] + '/1-1.png'\n",
    "labels['image file path'] = prefix_full + labels['image file path']\n",
    "\n",
    "labels['cropped image file path'] = labels['cropped image file path'].str.split('/').str[-2]\n",
    "labels['ROI mask file path'] = labels['ROI mask file path'].str.split('/').str[-2]\n",
    "\n",
    "labels['image_exists'] = labels['image file path'].apply(os.path.exists)\n",
    "labels = labels[labels['image_exists'] == True]\n",
    "\n",
    "def check_image_pixel_count(path):\n",
    "    width, height = Image.open(path).size\n",
    "    return width * height\n",
    "\n",
    "threshold = (sum(labels['image file path'].apply(check_image_pixel_count)) / len(labels)) * 0.25\n",
    "\n",
    "def map_to_mask(path, is_mask, prefix):\n",
    "    \n",
    "    path = prefix + path \n",
    "    # print(path)\n",
    "    # print(check_image_pixel_count(path))\n",
    "    #print(path[-7:])\n",
    "    path = path\n",
    "    # images higher then avg are masks\n",
    "    pixel_count = check_image_pixel_count(path + '/1-1.png')\n",
    "    if os.path.exists(path + '/1-2.png'): # we assume that if 1-2.png exists, one of them could be cropped. There should always be a mask\n",
    "        \n",
    "        if pixel_count < threshold and is_mask:\n",
    "            return path + '/1-2.png'\n",
    "        elif pixel_count > threshold and not is_mask:\n",
    "            return path + '/1-2.png'\n",
    "        else:\n",
    "            return path + '/1-1.png'\n",
    "    else:\n",
    "        if pixel_count > threshold and is_mask:\n",
    "            return path + '/1-1.png'\n",
    "        elif not is_mask:\n",
    "            print('no cropped image for: ' + path)\n",
    "            return None\n",
    "        else:\n",
    "            print(pixel_count, threshold, path)\n",
    "            raise Exception('Likely no mask for image: ' + path)\n",
    "    \n",
    "labels['ROI mask file path'] = labels['ROI mask file path'].apply(map_to_mask, is_mask=True, prefix=prefix_mask)\n",
    "labels['cropped image file path'] = labels['cropped image file path'].apply(map_to_mask, is_mask=False, prefix=prefix_mask)\n",
    "labels.to_csv(output_csv)\n",
    "labels[labels['ROI mask file path'] == labels['cropped image file path']] # check if there are any images that are the same, should be empty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for csv in ['labels_mass_mammograms_test.csv', 'labels_calc_mammograms_test.csv']:\n",
    "    df = pd.read_csv(csv)\n",
    "    df.drop(columns=['Unnamed: 0', 'cropped image file path'], inplace=True)\n",
    "    df['image file path'] = df['image file path'].str.split('/').apply(lambda x: '/'.join(x[1:]))\n",
    "    df['ROI mask file path'] = df['ROI mask file path'].str.split('/').apply(lambda x: '/'.join(x[1:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Unnamed: 0', 'patient_id', 'breast_density', 'left or right breast',\n",
       "       'image view', 'abnormality id', 'abnormality type', 'mass shape',\n",
       "       'mass margins', 'assessment', 'pathology', 'subtlety',\n",
       "       'image file path', 'cropped image file path', 'ROI mask file path'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
