{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'uci_id': 31, 'name': 'Covertype', 'repository_url': 'https://archive.ics.uci.edu/dataset/31/covertype', 'data_url': 'https://archive.ics.uci.edu/static/public/31/data.csv', 'abstract': 'Classification of pixels into 7 forest cover types based on attributes such as elevation, aspect, slope, hillshade, soil-type, and more.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 581012, 'num_features': 54, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['Cover_Type'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Sat Mar 16 2024', 'dataset_doi': '10.24432/C50K5N', 'creators': ['Jock Blackard'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting forest cover type from cartographic variables only (no remotely sensed data).  The actual forest cover type for a given observation (30 x 30 meter cell) was determined from US Forest Service (USFS) Region 2 Resource Information System (RIS) data.  Independent variables were derived from data originally obtained from US Geological Survey (USGS) and USFS data.  Data is in raw form (not scaled) and contains binary (0 or 1) columns of data for qualitative independent variables (wilderness areas and soil types).\\r\\n\\r\\nThis study area includes four wilderness areas located in the Roosevelt National Forest of northern Colorado.  These areas represent forests with minimal human-caused disturbances, so that existing forest cover types are more a result of ecological processes rather than forest management practices.\\r\\n\\r\\nSome background information for these four wilderness areas: Neota (area 2) probably has the highest mean elevational value of the 4 wilderness areas. Rawah (area 1) and Comanche Peak (area 3) would have a lower mean elevational value, while Cache la Poudre (area 4) would have the lowest mean elevational value. \\r\\n\\r\\nAs for primary major tree species in these areas, Neota would have spruce/fir (type 1), while Rawah and Comanche Peak would probably have lodgepole pine (type 2) as their primary species, followed by spruce/fir and aspen (type 5). Cache la Poudre would tend to have Ponderosa pine (type 3), Douglas-fir (type 6), and cottonwood/willow (type 4).  \\r\\n\\r\\nThe Rawah and Comanche Peak areas would tend to be more typical of the overall dataset than either the Neota or Cache la Poudre, due to their assortment of tree species and range of predictive variable values (elevation, etc.)  Cache la Poudre would probably  be more unique than the others, due to its relatively low  elevation range and species composition. ', 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': None, 'preprocessing_description': None, 'variable_info': 'Given is the attribute name, attribute type, the measurement unit and a brief description.  The forest cover type is the classification  problem.  The order of this listing corresponds to the order of numerals along the rows of the database.\\r\\n\\r\\nName / Data Type / Measurement / Description\\r\\n\\r\\nElevation / quantitative /meters / Elevation in meters\\r\\nAspect / quantitative / azimuth / Aspect in degrees azimuth\\r\\nSlope / quantitative / degrees / Slope in degrees\\r\\nHorizontal_Distance_To_Hydrology / quantitative / meters / Horz Dist to nearest surface water features\\r\\nVertical_Distance_To_Hydrology / quantitative / meters / Vert Dist to nearest surface water features\\r\\nHorizontal_Distance_To_Roadways / quantitative / meters / Horz Dist to nearest roadway\\r\\nHillshade_9am / quantitative / 0 to 255 index / Hillshade index at 9am, summer solstice\\r\\nHillshade_Noon / quantitative / 0 to 255 index / Hillshade index at noon, summer soltice\\r\\nHillshade_3pm / quantitative / 0 to 255 index / Hillshade index at 3pm, summer solstice\\r\\nHorizontal_Distance_To_Fire_Points / quantitative / meters / Horz Dist to nearest wildfire ignition points\\r\\nWilderness_Area (4 binary columns) / qualitative / 0 (absence) or 1 (presence) / Wilderness area designation\\r\\nSoil_Type (40 binary columns) / qualitative / 0 (absence) or 1 (presence) / Soil Type designation\\r\\nCover_Type (7 types) / integer / 1 to 7 / Forest Cover Type designation', 'citation': None}}\n",
      "                                  name     role     type demographic  \\\n",
      "0                            Elevation  Feature  Integer        None   \n",
      "1                               Aspect  Feature  Integer        None   \n",
      "2                                Slope  Feature  Integer        None   \n",
      "3     Horizontal_Distance_To_Hydrology  Feature  Integer        None   \n",
      "4       Vertical_Distance_To_Hydrology  Feature  Integer        None   \n",
      "5      Horizontal_Distance_To_Roadways  Feature  Integer        None   \n",
      "6                        Hillshade_9am  Feature  Integer        None   \n",
      "7                       Hillshade_Noon  Feature  Integer        None   \n",
      "8                        Hillshade_3pm  Feature  Integer        None   \n",
      "9   Horizontal_Distance_To_Fire_Points  Feature  Integer        None   \n",
      "10                    Wilderness_Area1  Feature  Integer        None   \n",
      "11                          Soil_Type1  Feature  Integer        None   \n",
      "12                          Soil_Type2  Feature  Integer        None   \n",
      "13                          Soil_Type3  Feature  Integer        None   \n",
      "14                          Soil_Type4  Feature  Integer        None   \n",
      "15                          Soil_Type5  Feature  Integer        None   \n",
      "16                          Soil_Type6  Feature  Integer        None   \n",
      "17                          Soil_Type7  Feature  Integer        None   \n",
      "18                          Soil_Type8  Feature  Integer        None   \n",
      "19                          Soil_Type9  Feature  Integer        None   \n",
      "20                         Soil_Type10  Feature  Integer        None   \n",
      "21                         Soil_Type11  Feature  Integer        None   \n",
      "22                         Soil_Type12  Feature  Integer        None   \n",
      "23                         Soil_Type13  Feature  Integer        None   \n",
      "24                         Soil_Type14  Feature  Integer        None   \n",
      "25                         Soil_Type15  Feature  Integer        None   \n",
      "26                         Soil_Type16  Feature  Integer        None   \n",
      "27                         Soil_Type17  Feature  Integer        None   \n",
      "28                         Soil_Type18  Feature  Integer        None   \n",
      "29                         Soil_Type19  Feature  Integer        None   \n",
      "30                         Soil_Type20  Feature  Integer        None   \n",
      "31                         Soil_Type21  Feature  Integer        None   \n",
      "32                         Soil_Type22  Feature  Integer        None   \n",
      "33                         Soil_Type23  Feature  Integer        None   \n",
      "34                         Soil_Type24  Feature  Integer        None   \n",
      "35                         Soil_Type25  Feature  Integer        None   \n",
      "36                         Soil_Type26  Feature  Integer        None   \n",
      "37                         Soil_Type27  Feature  Integer        None   \n",
      "38                         Soil_Type28  Feature  Integer        None   \n",
      "39                         Soil_Type29  Feature  Integer        None   \n",
      "40                         Soil_Type30  Feature  Integer        None   \n",
      "41                         Soil_Type31  Feature  Integer        None   \n",
      "42                         Soil_Type32  Feature  Integer        None   \n",
      "43                         Soil_Type33  Feature  Integer        None   \n",
      "44                         Soil_Type34  Feature  Integer        None   \n",
      "45                         Soil_Type35  Feature  Integer        None   \n",
      "46                         Soil_Type36  Feature  Integer        None   \n",
      "47                         Soil_Type37  Feature  Integer        None   \n",
      "48                         Soil_Type38  Feature  Integer        None   \n",
      "49                         Soil_Type39  Feature  Integer        None   \n",
      "50                         Soil_Type40  Feature  Integer        None   \n",
      "51                          Cover_Type   Target  Integer        None   \n",
      "52                    Wilderness_Area2  Feature  Integer        None   \n",
      "53                    Wilderness_Area3  Feature  Integer        None   \n",
      "54                    Wilderness_Area4  Feature  Integer        None   \n",
      "\n",
      "   description units missing_values  \n",
      "0         None  None             no  \n",
      "1         None  None             no  \n",
      "2         None  None             no  \n",
      "3         None  None             no  \n",
      "4         None  None             no  \n",
      "5         None  None             no  \n",
      "6         None  None             no  \n",
      "7         None  None             no  \n",
      "8         None  None             no  \n",
      "9         None  None             no  \n",
      "10        None  None             no  \n",
      "11        None  None             no  \n",
      "12        None  None             no  \n",
      "13        None  None             no  \n",
      "14        None  None             no  \n",
      "15        None  None             no  \n",
      "16        None  None             no  \n",
      "17        None  None             no  \n",
      "18        None  None             no  \n",
      "19        None  None             no  \n",
      "20        None  None             no  \n",
      "21        None  None             no  \n",
      "22        None  None             no  \n",
      "23        None  None             no  \n",
      "24        None  None             no  \n",
      "25        None  None             no  \n",
      "26        None  None             no  \n",
      "27        None  None             no  \n",
      "28        None  None             no  \n",
      "29        None  None             no  \n",
      "30        None  None             no  \n",
      "31        None  None             no  \n",
      "32        None  None             no  \n",
      "33        None  None             no  \n",
      "34        None  None             no  \n",
      "35        None  None             no  \n",
      "36        None  None             no  \n",
      "37        None  None             no  \n",
      "38        None  None             no  \n",
      "39        None  None             no  \n",
      "40        None  None             no  \n",
      "41        None  None             no  \n",
      "42        None  None             no  \n",
      "43        None  None             no  \n",
      "44        None  None             no  \n",
      "45        None  None             no  \n",
      "46        None  None             no  \n",
      "47        None  None             no  \n",
      "48        None  None             no  \n",
      "49        None  None             no  \n",
      "50        None  None             no  \n",
      "51        None  None             no  \n",
      "52        None  None             no  \n",
      "53        None  None             no  \n",
      "54        None  None             no  \n"
     ]
    }
   ],
   "source": [
    "from ucimlrepo import fetch_ucirepo \n",
    "  \n",
    "# fetch dataset \n",
    "covertype = fetch_ucirepo(id=31) \n",
    "  \n",
    "# data (as pandas dataframes) \n",
    "X = covertype.data.features \n",
    "y = covertype.data.targets \n",
    "  \n",
    "# metadata \n",
    "print(covertype.metadata) \n",
    "  \n",
    "# variable information \n",
    "print(covertype.variables) \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = pd.concat((X,y), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',\n",
       "       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',\n",
       "       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',\n",
       "       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Soil_Type1',\n",
       "       'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6',\n",
       "       'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11',\n",
       "       'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15',\n",
       "       'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19',\n",
       "       'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23',\n",
       "       'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27',\n",
       "       'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31',\n",
       "       'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35',\n",
       "       'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39',\n",
       "       'Soil_Type40', 'Wilderness_Area2', 'Wilderness_Area3',\n",
       "       'Wilderness_Area4', 'Cover_Type'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Area\n",
       "0.0    260796\n",
       "2.0    253364\n",
       "3.0     36968\n",
       "1.0     29884\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.loc[X.Wilderness_Area1 == 1, 'Area'] = 0\n",
    "X.loc[X.Wilderness_Area2 == 1, 'Area'] = 1\n",
    "X.loc[X.Wilderness_Area3 == 1, 'Area'] = 2\n",
    "X.loc[X.Wilderness_Area4 == 1, 'Area'] = 3\n",
    "X.Area.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Aspect</th>\n",
       "      <th>Slope</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology</th>\n",
       "      <th>Vertical_Distance_To_Hydrology</th>\n",
       "      <th>Horizontal_Distance_To_Roadways</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points</th>\n",
       "      <th>...</th>\n",
       "      <th>Soil_Type33</th>\n",
       "      <th>Soil_Type34</th>\n",
       "      <th>Soil_Type35</th>\n",
       "      <th>Soil_Type36</th>\n",
       "      <th>Soil_Type37</th>\n",
       "      <th>Soil_Type38</th>\n",
       "      <th>Soil_Type39</th>\n",
       "      <th>Soil_Type40</th>\n",
       "      <th>Cover_Type</th>\n",
       "      <th>Area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2596</td>\n",
       "      <td>51</td>\n",
       "      <td>3</td>\n",
       "      <td>258</td>\n",
       "      <td>0</td>\n",
       "      <td>510</td>\n",
       "      <td>221</td>\n",
       "      <td>232</td>\n",
       "      <td>148</td>\n",
       "      <td>6279</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2590</td>\n",
       "      <td>56</td>\n",
       "      <td>2</td>\n",
       "      <td>212</td>\n",
       "      <td>-6</td>\n",
       "      <td>390</td>\n",
       "      <td>220</td>\n",
       "      <td>235</td>\n",
       "      <td>151</td>\n",
       "      <td>6225</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2804</td>\n",
       "      <td>139</td>\n",
       "      <td>9</td>\n",
       "      <td>268</td>\n",
       "      <td>65</td>\n",
       "      <td>3180</td>\n",
       "      <td>234</td>\n",
       "      <td>238</td>\n",
       "      <td>135</td>\n",
       "      <td>6121</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2785</td>\n",
       "      <td>155</td>\n",
       "      <td>18</td>\n",
       "      <td>242</td>\n",
       "      <td>118</td>\n",
       "      <td>3090</td>\n",
       "      <td>238</td>\n",
       "      <td>238</td>\n",
       "      <td>122</td>\n",
       "      <td>6211</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2595</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>153</td>\n",
       "      <td>-1</td>\n",
       "      <td>391</td>\n",
       "      <td>220</td>\n",
       "      <td>234</td>\n",
       "      <td>150</td>\n",
       "      <td>6172</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581007</th>\n",
       "      <td>2396</td>\n",
       "      <td>153</td>\n",
       "      <td>20</td>\n",
       "      <td>85</td>\n",
       "      <td>17</td>\n",
       "      <td>108</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>118</td>\n",
       "      <td>837</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581008</th>\n",
       "      <td>2391</td>\n",
       "      <td>152</td>\n",
       "      <td>19</td>\n",
       "      <td>67</td>\n",
       "      <td>12</td>\n",
       "      <td>95</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>119</td>\n",
       "      <td>845</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581009</th>\n",
       "      <td>2386</td>\n",
       "      <td>159</td>\n",
       "      <td>17</td>\n",
       "      <td>60</td>\n",
       "      <td>7</td>\n",
       "      <td>90</td>\n",
       "      <td>236</td>\n",
       "      <td>241</td>\n",
       "      <td>130</td>\n",
       "      <td>854</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581010</th>\n",
       "      <td>2384</td>\n",
       "      <td>170</td>\n",
       "      <td>15</td>\n",
       "      <td>60</td>\n",
       "      <td>5</td>\n",
       "      <td>90</td>\n",
       "      <td>230</td>\n",
       "      <td>245</td>\n",
       "      <td>143</td>\n",
       "      <td>864</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581011</th>\n",
       "      <td>2383</td>\n",
       "      <td>165</td>\n",
       "      <td>13</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "      <td>67</td>\n",
       "      <td>231</td>\n",
       "      <td>244</td>\n",
       "      <td>141</td>\n",
       "      <td>875</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>581012 rows × 52 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \\\n",
       "0            2596      51      3                               258   \n",
       "1            2590      56      2                               212   \n",
       "2            2804     139      9                               268   \n",
       "3            2785     155     18                               242   \n",
       "4            2595      45      2                               153   \n",
       "...           ...     ...    ...                               ...   \n",
       "581007       2396     153     20                                85   \n",
       "581008       2391     152     19                                67   \n",
       "581009       2386     159     17                                60   \n",
       "581010       2384     170     15                                60   \n",
       "581011       2383     165     13                                60   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \\\n",
       "0                                    0                              510   \n",
       "1                                   -6                              390   \n",
       "2                                   65                             3180   \n",
       "3                                  118                             3090   \n",
       "4                                   -1                              391   \n",
       "...                                ...                              ...   \n",
       "581007                              17                              108   \n",
       "581008                              12                               95   \n",
       "581009                               7                               90   \n",
       "581010                               5                               90   \n",
       "581011                               4                               67   \n",
       "\n",
       "        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \\\n",
       "0                 221             232            148   \n",
       "1                 220             235            151   \n",
       "2                 234             238            135   \n",
       "3                 238             238            122   \n",
       "4                 220             234            150   \n",
       "...               ...             ...            ...   \n",
       "581007            240             237            118   \n",
       "581008            240             237            119   \n",
       "581009            236             241            130   \n",
       "581010            230             245            143   \n",
       "581011            231             244            141   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points  ...  Soil_Type33  Soil_Type34  \\\n",
       "0                                     6279  ...            0            0   \n",
       "1                                     6225  ...            0            0   \n",
       "2                                     6121  ...            0            0   \n",
       "3                                     6211  ...            0            0   \n",
       "4                                     6172  ...            0            0   \n",
       "...                                    ...  ...          ...          ...   \n",
       "581007                                 837  ...            0            0   \n",
       "581008                                 845  ...            0            0   \n",
       "581009                                 854  ...            0            0   \n",
       "581010                                 864  ...            0            0   \n",
       "581011                                 875  ...            0            0   \n",
       "\n",
       "        Soil_Type35  Soil_Type36  Soil_Type37  Soil_Type38  Soil_Type39  \\\n",
       "0                 0            0            0            0            0   \n",
       "1                 0            0            0            0            0   \n",
       "2                 0            0            0            0            0   \n",
       "3                 0            0            0            0            0   \n",
       "4                 0            0            0            0            0   \n",
       "...             ...          ...          ...          ...          ...   \n",
       "581007            0            0            0            0            0   \n",
       "581008            0            0            0            0            0   \n",
       "581009            0            0            0            0            0   \n",
       "581010            0            0            0            0            0   \n",
       "581011            0            0            0            0            0   \n",
       "\n",
       "        Soil_Type40  Cover_Type  Area  \n",
       "0                 0           5   0.0  \n",
       "1                 0           5   0.0  \n",
       "2                 0           2   0.0  \n",
       "3                 0           2   0.0  \n",
       "4                 0           5   0.0  \n",
       "...             ...         ...   ...  \n",
       "581007            0           3   2.0  \n",
       "581008            0           3   2.0  \n",
       "581009            0           3   2.0  \n",
       "581010            0           3   2.0  \n",
       "581011            0           3   2.0  \n",
       "\n",
       "[581012 rows x 52 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = X.drop(columns=['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4'])\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Area\n",
       "0.0    260796\n",
       "2.0    253364\n",
       "3.0     36968\n",
       "1.0     29884\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.Area.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = np.array(X)\n",
    "all_data = (all_data - all_data.min(axis=0)) / (all_data.max(axis=0) - all_data.min(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_index_1 = np.where(all_data[:, -1] == 0)[0]\n",
    "train_index_1 = np.random.choice(train_index_1, size=50000, replace=False)\n",
    "\n",
    "train_index_2 = np.where(all_data[:, -1] == 2/3)[0]\n",
    "train_index_2 = np.random.choice(train_index_2, size=50000, replace=False)\n",
    "\n",
    "train_index_3 = np.where(all_data[:, -1] == 1)[0]\n",
    "train_index_3 = np.random.choice(train_index_3, size=30000, replace=False)\n",
    "\n",
    "test_index = np.where(all_data[:, -1] == 1/3)[0]\n",
    "\n",
    "train_index = np.concatenate((train_index_1, train_index_2, train_index_3))\n",
    "\n",
    "train_data = all_data[train_index]\n",
    "test_data = all_data[test_index]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('data/cover_train31.npy', train_data)\n",
    "np.save('data/cover_test31.npy', test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50000"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(train_data[:, -1] == 0/3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(37430, 55221, 20339, 2186, 1867, 9431, 3526)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(train_data[:, -2] == 0/6), \\\n",
    "np.sum(train_data[:, -2] == 1/6), \\\n",
    "np.sum(train_data[:, -2] == 2/6), \\\n",
    "np.sum(train_data[:, -2] == 3/6), \\\n",
    "np.sum(train_data[:, -2] == 4/6), \\\n",
    "np.sum(train_data[:, -2] == 5/6), \\\n",
    "np.sum(train_data[:, -2] == 6/6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Cannot take a larger sample than population when 'replace=False'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[13], line 11\u001b[0m\n\u001b[1;32m      8\u001b[0m train_index_3 \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mchoice(train_index_3, size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m7500\u001b[39m, replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m     10\u001b[0m train_index_5 \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mwhere(train_data[:, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m4\u001b[39m\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m6\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m---> 11\u001b[0m train_index_5 \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrandom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchoice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_index_5\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m7500\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m     13\u001b[0m train_index_6 \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mwhere(train_data[:, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m5\u001b[39m\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m6\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m     14\u001b[0m train_index_6 \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mchoice(train_index_6, size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m7500\u001b[39m, replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
      "File \u001b[0;32mmtrand.pyx:984\u001b[0m, in \u001b[0;36mnumpy.random.mtrand.RandomState.choice\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Cannot take a larger sample than population when 'replace=False'"
     ]
    }
   ],
   "source": [
    "train_index_1 = np.where(train_data[:, -2] == 0)[0]\n",
    "train_index_1 = np.random.choice(train_index_1, size=7500, replace=False)\n",
    "\n",
    "train_index_2 = np.where(train_data[:, -2] == 1/6)[0]\n",
    "train_index_2 = np.random.choice(train_index_2, size=7500, replace=False)\n",
    "\n",
    "train_index_3 = np.where(train_data[:, -2] == 2/6)[0]\n",
    "train_index_3 = np.random.choice(train_index_3, size=7500, replace=False)\n",
    "\n",
    "train_index_5 = np.where(train_data[:, -2] == 4/6)[0]\n",
    "train_index_5 = np.random.choice(train_index_5, size=7500, replace=False)\n",
    "\n",
    "train_index_6 = np.where(train_data[:, -2] == 5/6)[0]\n",
    "train_index_6 = np.random.choice(train_index_6, size=7500, replace=False)\n",
    "\n",
    "train_index_7 = np.where(train_data[:, -2] == 6/6)[0]\n",
    "train_index_7 = np.random.choice(train_index_7, size=7500, replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(45000, 52)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "balanced_index = np.concatenate((train_index_1, train_index_2, train_index_3, \n",
    "                                    train_index_5, train_index_6, train_index_7))\n",
    "\n",
    "train_balanced_data = train_data[balanced_index]\n",
    "np.save('data/cover_balanced_train31.npy', train_balanced_data)\n",
    "train_balanced_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18595, 12011, 21454, 2747, 0, 9741, 2304)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(test_data[:, -2] == 0/6), \\\n",
    "np.sum(test_data[:, -2] == 1/6), \\\n",
    "np.sum(test_data[:, -2] == 2/6), \\\n",
    "np.sum(test_data[:, -2] == 3/6), \\\n",
    "np.sum(test_data[:, -2] == 4/6), \\\n",
    "np.sum(test_data[:, -2] == 5/6), \\\n",
    "np.sum(test_data[:, -2] == 6/6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_index_1 = np.where(test_data[:, -2] == 0)[0]\n",
    "test_index_1 = np.random.choice(test_index_1, size=2000, replace=False)\n",
    "\n",
    "test_index_2 = np.where(test_data[:, -2] == 1/6)[0]\n",
    "test_index_2 = np.random.choice(test_index_2, size=2000, replace=False)\n",
    "\n",
    "test_index_3 = np.where(test_data[:, -2] == 2/6)[0]\n",
    "test_index_3 = np.random.choice(test_index_3, size=2000, replace=False)\n",
    "\n",
    "test_index_4 = np.where(test_data[:, -2] == 3/6)[0]\n",
    "test_index_4 = np.random.choice(test_index_4, size=2000, replace=False)\n",
    "\n",
    "test_index_6 = np.where(test_data[:, -2] == 5/6)[0]\n",
    "test_index_6 = np.random.choice(test_index_6, size=2000, replace=False)\n",
    "\n",
    "test_index_7 = np.where(test_data[:, -2] == 6/6)[0]\n",
    "test_index_7 = np.random.choice(test_index_7, size=2000, replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12000, 52)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "balanced_index = np.concatenate((test_index_1, test_index_2, test_index_3, \n",
    "                                    test_index_4, test_index_6, test_index_7))\n",
    "\n",
    "test_balanced_data = test_data[balanced_index]\n",
    "np.save('data/cover_balanced_test31.npy', test_balanced_data)\n",
    "test_balanced_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
