{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fcbf4b6f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T18:16:54.737592Z",
     "iopub.status.busy": "2024-10-18T18:16:54.737071Z",
     "iopub.status.idle": "2024-10-18T18:16:55.283491Z",
     "shell.execute_reply": "2024-10-18T18:16:55.282718Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Basic Information about the Train Dataset:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5923 entries, 0 to 5922\n",
      "Data columns (total 33 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   X1      5923 non-null   float64\n",
      " 1   X2      5923 non-null   float64\n",
      " 2   X3      5923 non-null   float64\n",
      " 3   X4      5923 non-null   float64\n",
      " 4   X5      5923 non-null   float64\n",
      " 5   X6      5923 non-null   float64\n",
      " 6   X7      5923 non-null   float64\n",
      " 7   X8      5923 non-null   float64\n",
      " 8   X9      5923 non-null   float64\n",
      " 9   X10     5923 non-null   float64\n",
      " 10  X11     5923 non-null   float64\n",
      " 11  X12     5923 non-null   float64\n",
      " 12  X13     5923 non-null   float64\n",
      " 13  X14     5923 non-null   float64\n",
      " 14  X15     5923 non-null   float64\n",
      " 15  X16     5923 non-null   float64\n",
      " 16  X17     5923 non-null   float64\n",
      " 17  X18     5923 non-null   float64\n",
      " 18  X19     5923 non-null   float64\n",
      " 19  X20     5923 non-null   float64\n",
      " 20  X21     5923 non-null   float64\n",
      " 21  X22     5923 non-null   float64\n",
      " 22  X23     5923 non-null   float64\n",
      " 23  X24     5923 non-null   float64\n",
      " 24  X25     5923 non-null   float64\n",
      " 25  X26     5923 non-null   float64\n",
      " 26  X27     5923 non-null   float64\n",
      " 27  X28     5923 non-null   float64\n",
      " 28  X29     5923 non-null   float64\n",
      " 29  X30     5923 non-null   float64\n",
      " 30  X31     5923 non-null   float64\n",
      " 31  X32     5923 non-null   float64\n",
      " 32  Phase   5923 non-null   object \n",
      "dtypes: float64(32), object(1)\n",
      "memory usage: 1.5+ MB\n",
      "None\n",
      "\n",
      "First few rows of the Train Dataset:\n",
      "         X1        X2        X3        X4        X5        X6        X7  \\\n",
      "0 -0.001323  0.000027  0.000231 -0.000244 -0.000293 -0.000021 -0.000770   \n",
      "1  0.010867 -0.019969  0.002434  0.003710 -0.002172 -0.000314  0.006353   \n",
      "2  0.000007 -0.006270 -0.001026 -0.002284 -0.004159  0.000191  0.005918   \n",
      "3 -0.001393  0.001318  0.000247 -0.004119 -0.014195  0.001671 -0.000740   \n",
      "4 -0.000019  0.000301 -0.002830  0.011756 -0.017068 -0.032019 -0.000101   \n",
      "\n",
      "         X8        X9       X10  ...       X24       X25       X26       X27  \\\n",
      "0  0.000009  0.000137 -0.000770  ... -0.000002  0.001343  0.000382  0.000782   \n",
      "1 -0.012381  0.001833  0.002605  ... -0.000069  0.022864  0.004311  0.014036   \n",
      "2 -0.001690 -0.000793 -0.002800  ...  0.000899  0.006354  0.004749  0.006206   \n",
      "3  0.005243  0.000078 -0.001018  ...  0.000079  0.001934  0.014875  0.005295   \n",
      "4  0.000243 -0.002163  0.008662  ...  0.000491  0.002846  0.038141  0.002179   \n",
      "\n",
      "        X28       X29       X30       X31       X32  Phase  \n",
      "0  0.001146  0.000036  0.000006  0.000025  0.000025      S  \n",
      "1  0.002612  0.002608  0.000578  0.002664  0.000379      S  \n",
      "2  0.003078  0.001245  0.003445  0.002328  0.007148      H  \n",
      "3  0.012747  0.001016  0.002180  0.001045  0.001474      P  \n",
      "4  0.028912  0.000147  0.003550  0.000385  0.002483      H  \n",
      "\n",
      "[5 rows x 33 columns]\n",
      "\n",
      "Summary Statistics for Numerical Columns:\n",
      "                X1           X2           X3           X4           X5  \\\n",
      "count  5923.000000  5923.000000  5923.000000  5923.000000  5923.000000   \n",
      "mean     -0.000057    -0.000019    -0.000021     0.000124    -0.000183   \n",
      "std       0.007072     0.010037     0.007119     0.008777     0.012965   \n",
      "min      -0.053743    -0.079209    -0.084849    -0.082809    -0.093253   \n",
      "25%      -0.001754    -0.001634    -0.000422    -0.002099    -0.002933   \n",
      "50%       0.000008     0.000017    -0.000006     0.000014    -0.000045   \n",
      "75%       0.001622     0.002012     0.000353     0.002568     0.002837   \n",
      "max       0.064649     0.071600     0.084053     0.046461     0.103561   \n",
      "\n",
      "                X6           X7           X8           X9          X10  ...  \\\n",
      "count  5923.000000  5923.000000  5923.000000  5923.000000  5923.000000  ...   \n",
      "mean     -0.000092    -0.000035     0.000008     0.000018     0.000112  ...   \n",
      "std       0.007889     0.005971     0.008352     0.006109     0.007053  ...   \n",
      "min      -0.099784    -0.042624    -0.067566    -0.072807    -0.063497  ...   \n",
      "25%      -0.000631    -0.001416    -0.001335    -0.000368    -0.001770  ...   \n",
      "50%      -0.000015     0.000005     0.000010    -0.000005     0.000012  ...   \n",
      "75%       0.000510     0.001444     0.001599     0.000327     0.002124  ...   \n",
      "max       0.072152     0.048213     0.055297     0.064984     0.046357  ...   \n",
      "\n",
      "               X23           X24          X25          X26          X27  \\\n",
      "count  5923.000000  5.923000e+03  5923.000000  5923.000000  5923.000000   \n",
      "mean      0.000019 -3.635982e-06     0.008701     0.011299     0.007212   \n",
      "std       0.001781  9.790609e-04     0.011212     0.013406     0.009523   \n",
      "min      -0.026311 -1.498854e-02     0.000019     0.000011     0.000012   \n",
      "25%      -0.000281 -7.700000e-05     0.001143     0.001701     0.000940   \n",
      "50%       0.000002  7.700000e-07     0.004136     0.006316     0.003398   \n",
      "75%       0.000318  8.306000e-05     0.012113     0.016066     0.009922   \n",
      "max       0.023754  1.053080e-02     0.091338     0.123642     0.077625   \n",
      "\n",
      "               X28           X29           X30           X31           X32  \n",
      "count  5923.000000  5.923000e+03  5.923000e+03  5.923000e+03  5.923000e+03  \n",
      "mean      0.009272  1.489909e-03  1.753659e-03  1.286966e-03  1.481536e-03  \n",
      "std       0.011233  2.356870e-03  2.602380e-03  2.046205e-03  2.137756e-03  \n",
      "min       0.000005  4.500000e-07  2.900000e-07  5.000000e-07  2.500000e-07  \n",
      "25%       0.001330  1.387100e-04  2.082200e-04  1.129300e-04  1.738900e-04  \n",
      "50%       0.005100  5.659100e-04  8.636900e-04  4.780400e-04  6.753400e-04  \n",
      "75%       0.012887  1.924285e-03  2.320180e-03  1.678970e-03  2.002410e-03  \n",
      "max       0.101889  4.262203e-02  4.292456e-02  3.933604e-02  2.974150e-02  \n",
      "\n",
      "[8 rows x 32 columns]\n",
      "\n",
      "Missing Values in the Train Dataset:\n",
      "X1       0\n",
      "X2       0\n",
      "X3       0\n",
      "X4       0\n",
      "X5       0\n",
      "X6       0\n",
      "X7       0\n",
      "X8       0\n",
      "X9       0\n",
      "X10      0\n",
      "X11      0\n",
      "X12      0\n",
      "X13      0\n",
      "X14      0\n",
      "X15      0\n",
      "X16      0\n",
      "X17      0\n",
      "X18      0\n",
      "X19      0\n",
      "X20      0\n",
      "X21      0\n",
      "X22      0\n",
      "X23      0\n",
      "X24      0\n",
      "X25      0\n",
      "X26      0\n",
      "X27      0\n",
      "X28      0\n",
      "X29      0\n",
      "X30      0\n",
      "X31      0\n",
      "X32      0\n",
      "Phase    0\n",
      "dtype: int64\n",
      "\n",
      "Distribution of the Target Column 'Phase':\n",
      "Phase\n",
      "S    1802\n",
      "D    1601\n",
      "P    1245\n",
      "R     656\n",
      "H     619\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Data Types of the Columns:\n",
      "X1       float64\n",
      "X2       float64\n",
      "X3       float64\n",
      "X4       float64\n",
      "X5       float64\n",
      "X6       float64\n",
      "X7       float64\n",
      "X8       float64\n",
      "X9       float64\n",
      "X10      float64\n",
      "X11      float64\n",
      "X12      float64\n",
      "X13      float64\n",
      "X14      float64\n",
      "X15      float64\n",
      "X16      float64\n",
      "X17      float64\n",
      "X18      float64\n",
      "X19      float64\n",
      "X20      float64\n",
      "X21      float64\n",
      "X22      float64\n",
      "X23      float64\n",
      "X24      float64\n",
      "X25      float64\n",
      "X26      float64\n",
      "X27      float64\n",
      "X28      float64\n",
      "X29      float64\n",
      "X30      float64\n",
      "X31      float64\n",
      "X32      float64\n",
      "Phase     object\n",
      "dtype: object\n",
      "\n",
      "Correlation Matrix for Numerical Features:\n",
      "           X1            X2        X3        X4        X5        X6        X7  \\\n",
      "X1   1.000000  1.418533e-02 -0.110387 -0.218683  0.099040 -0.042091  0.852814   \n",
      "X2   0.014185  1.000000e+00  0.078765  0.011315  0.090671  0.037626 -0.007912   \n",
      "X3  -0.110387  7.876490e-02  1.000000  0.052435  0.045104  0.132134 -0.099877   \n",
      "X4  -0.218683  1.131499e-02  0.052435  1.000000 -0.133025  0.010849 -0.233084   \n",
      "X5   0.099040  9.067085e-02  0.045104 -0.133025  1.000000  0.084589  0.112548   \n",
      "X6  -0.042091  3.762595e-02  0.132134  0.010849  0.084589  1.000000 -0.040766   \n",
      "X7   0.852814 -7.912290e-03 -0.099877 -0.233084  0.112548 -0.040766  1.000000   \n",
      "X8   0.018124  9.339035e-01  0.055102  0.008252  0.089860  0.034121 -0.020086   \n",
      "X9  -0.114986  9.703218e-02  0.894442  0.063490  0.046859  0.115759 -0.118955   \n",
      "X10 -0.249226 -5.647097e-03  0.043525  0.858593 -0.124035  0.010274 -0.257837   \n",
      "X11  0.095449  8.389713e-02  0.037709 -0.121087  0.942876  0.086054  0.116532   \n",
      "X12 -0.050428  4.051536e-02  0.105441 -0.005639  0.106898  0.888265 -0.044247   \n",
      "X13  0.334482  5.964243e-02 -0.039454  0.014794 -0.022327 -0.017794  0.198770   \n",
      "X14 -0.039378  2.874981e-01 -0.042892 -0.005997 -0.029872 -0.011667 -0.020441   \n",
      "X15 -0.042819  6.927083e-02  0.256315 -0.015704 -0.005758  0.040177 -0.046262   \n",
      "X16  0.018966  6.972795e-02  0.009793  0.333466 -0.057364  0.011201  0.001288   \n",
      "X17  0.041537 -1.331140e-03  0.001530  0.004765  0.234012 -0.065224  0.053041   \n",
      "X18  0.007845  7.015204e-03  0.030793 -0.007561  0.094038  0.235676  0.000415   \n",
      "X19  0.248137  3.121184e-02 -0.028704  0.000104 -0.012329 -0.009506  0.346321   \n",
      "X20 -0.024626  2.303093e-01 -0.045778 -0.012979 -0.020646 -0.017112 -0.031429   \n",
      "X21 -0.027196  5.441592e-02  0.176932 -0.003247 -0.016350  0.024756 -0.050426   \n",
      "X22 -0.013614  7.713452e-02  0.016928  0.249149 -0.037547  0.009640 -0.030290   \n",
      "X23  0.035978  2.101055e-03  0.001336  0.002798  0.174871 -0.059971  0.051684   \n",
      "X24  0.001346  4.147977e-03  0.017842 -0.009897  0.083042  0.154395 -0.008406   \n",
      "X25  0.019793 -7.412326e-02 -0.022828  0.001814 -0.003066  0.019475  0.024002   \n",
      "X26  0.028113  9.906352e-04  0.017928 -0.062283 -0.033680 -0.044831  0.028703   \n",
      "X27  0.016578 -7.771499e-02 -0.012089 -0.003905 -0.003309  0.001212  0.023594   \n",
      "X28  0.006939  1.854078e-03  0.016973 -0.044170 -0.040866 -0.041687  0.013272   \n",
      "X29  0.016437 -7.443590e-02 -0.031562  0.004786  0.000383 -0.018108  0.013581   \n",
      "X30  0.021711 -2.985190e-03 -0.004918 -0.059736  0.004512 -0.009427  0.007502   \n",
      "X31  0.025300 -7.789494e-02 -0.019057 -0.008543  0.015478  0.000016  0.027662   \n",
      "X32  0.011421  2.539922e-07  0.005450 -0.038455 -0.022720 -0.005403  0.004004   \n",
      "\n",
      "           X8        X9       X10  ...       X23       X24       X25  \\\n",
      "X1   0.018124 -0.114986 -0.249226  ...  0.035978  0.001346  0.019793   \n",
      "X2   0.933903  0.097032 -0.005647  ...  0.002101  0.004148 -0.074123   \n",
      "X3   0.055102  0.894442  0.043525  ...  0.001336  0.017842 -0.022828   \n",
      "X4   0.008252  0.063490  0.858593  ...  0.002798 -0.009897  0.001814   \n",
      "X5   0.089860  0.046859 -0.124035  ...  0.174871  0.083042 -0.003066   \n",
      "X6   0.034121  0.115759  0.010274  ... -0.059971  0.154395  0.019475   \n",
      "X7  -0.020086 -0.118955 -0.257837  ...  0.051684 -0.008406  0.024002   \n",
      "X8   1.000000  0.085182 -0.007311  ...  0.009949  0.008432 -0.078255   \n",
      "X9   0.085182  1.000000  0.046354  ...  0.005055  0.016338 -0.002167   \n",
      "X10 -0.007311  0.046354  1.000000  ...  0.023676  0.005629  0.010354   \n",
      "X11  0.085817  0.040374 -0.104225  ...  0.245286  0.101295 -0.009316   \n",
      "X12  0.044820  0.104845  0.009365  ... -0.062064  0.256070  0.041076   \n",
      "X13  0.057190 -0.039991 -0.000734  ...  0.020963 -0.018029 -0.008723   \n",
      "X14  0.217970 -0.049349 -0.028622  ... -0.054705 -0.020109 -0.029211   \n",
      "X15  0.053778  0.220324 -0.021119  ...  0.024061  0.058251 -0.049832   \n",
      "X16  0.064655  0.009951  0.216389  ... -0.144440 -0.023726 -0.015995   \n",
      "X17  0.007723  0.005759  0.009677  ...  0.745730 -0.004337  0.001636   \n",
      "X18  0.008197  0.024789 -0.002213  ... -0.022019  0.596879 -0.024181   \n",
      "X19  0.024720 -0.042778 -0.018897  ...  0.044834 -0.020299  0.005234   \n",
      "X20  0.293969 -0.046036 -0.031164  ... -0.047698 -0.015479 -0.025401   \n",
      "X21  0.043149  0.261878 -0.013653  ...  0.016649  0.057068 -0.048441   \n",
      "X22  0.068873  0.010982  0.351510  ... -0.077713  0.034545 -0.016186   \n",
      "X23  0.009949  0.005055  0.023676  ...  1.000000  0.039540 -0.004261   \n",
      "X24  0.008432  0.016338  0.005629  ...  0.039540  1.000000 -0.011601   \n",
      "X25 -0.078255 -0.002167  0.010354  ... -0.004261 -0.011601  1.000000   \n",
      "X26  0.002923  0.015615 -0.061181  ...  0.017553 -0.073143  0.439170   \n",
      "X27 -0.078583  0.020004  0.003780  ... -0.019309 -0.017949  0.917523   \n",
      "X28  0.005174  0.021807 -0.044479  ...  0.000266 -0.068998  0.454305   \n",
      "X29 -0.059555 -0.000149  0.014756  ... -0.005332 -0.010195  0.541359   \n",
      "X30  0.000869 -0.005414 -0.059487  ... -0.010860 -0.034998  0.264307   \n",
      "X31 -0.073032  0.007927 -0.005598  ... -0.029753 -0.007285  0.483211   \n",
      "X32 -0.000221  0.009960 -0.037337  ... -0.016690 -0.063019  0.305047   \n",
      "\n",
      "          X26       X27       X28       X29       X30       X31           X32  \n",
      "X1   0.028113  0.016578  0.006939  0.016437  0.021711  0.025300  1.142134e-02  \n",
      "X2   0.000991 -0.077715  0.001854 -0.074436 -0.002985 -0.077895  2.539922e-07  \n",
      "X3   0.017928 -0.012089  0.016973 -0.031562 -0.004918 -0.019057  5.450180e-03  \n",
      "X4  -0.062283 -0.003905 -0.044170  0.004786 -0.059736 -0.008543 -3.845455e-02  \n",
      "X5  -0.033680 -0.003309 -0.040866  0.000383  0.004512  0.015478 -2.272009e-02  \n",
      "X6  -0.044831  0.001212 -0.041687 -0.018108 -0.009427  0.000016 -5.402501e-03  \n",
      "X7   0.028703  0.023594  0.013272  0.013581  0.007502  0.027662  4.003684e-03  \n",
      "X8   0.002923 -0.078583  0.005174 -0.059555  0.000869 -0.073032 -2.210877e-04  \n",
      "X9   0.015615  0.020004  0.021807 -0.000149 -0.005414  0.007927  9.960431e-03  \n",
      "X10 -0.061181  0.003780 -0.044479  0.014756 -0.059487 -0.005598 -3.733664e-02  \n",
      "X11 -0.034049 -0.008719 -0.044240 -0.002342  0.004892  0.010681 -1.866988e-02  \n",
      "X12 -0.042322  0.031798 -0.031495 -0.000882  0.004728  0.008719  2.413615e-03  \n",
      "X13  0.000538  0.010019 -0.005908 -0.012221 -0.004193 -0.012232 -4.917143e-04  \n",
      "X14 -0.009568 -0.036395 -0.013294 -0.021337  0.056959 -0.006580  4.754051e-02  \n",
      "X15 -0.013004 -0.044811 -0.002884 -0.052490 -0.065533 -0.037580 -4.271067e-02  \n",
      "X16 -0.013260 -0.017555 -0.014007  0.034286 -0.005988  0.034090 -5.889682e-03  \n",
      "X17  0.022829 -0.013368 -0.001988  0.002686  0.009230 -0.022252 -1.920744e-02  \n",
      "X18 -0.109285 -0.033295 -0.098663  0.004590 -0.045002  0.031876 -3.111636e-02  \n",
      "X19  0.010928  0.019540  0.012377  0.010264 -0.007827  0.025182  8.385711e-03  \n",
      "X20 -0.007212 -0.033970 -0.010823 -0.006154  0.062251 -0.003090  4.563187e-02  \n",
      "X21 -0.007433 -0.054686 -0.002905 -0.054056 -0.071296 -0.089797 -4.120516e-02  \n",
      "X22 -0.013610 -0.014137 -0.011728  0.016158 -0.005818  0.006115 -1.705398e-02  \n",
      "X23  0.017553 -0.019309  0.000266 -0.005332 -0.010860 -0.029753 -1.669041e-02  \n",
      "X24 -0.073143 -0.017949 -0.068998 -0.010195 -0.034998 -0.007285 -6.301920e-02  \n",
      "X25  0.439170  0.917523  0.454305  0.541359  0.264307  0.483211  3.050470e-01  \n",
      "X26  1.000000  0.434090  0.923940  0.272172  0.481249  0.263257  4.883993e-01  \n",
      "X27  0.434090  1.000000  0.461776  0.504465  0.267097  0.536411  3.077021e-01  \n",
      "X28  0.923940  0.461776  1.000000  0.276355  0.443227  0.276350  5.321633e-01  \n",
      "X29  0.272172  0.504465  0.276355  1.000000  0.341332  0.803261  3.510806e-01  \n",
      "X30  0.481249  0.267097  0.443227  0.341332  1.000000  0.331696  7.711593e-01  \n",
      "X31  0.263257  0.536411  0.276350  0.803261  0.331696  1.000000  3.486635e-01  \n",
      "X32  0.488399  0.307702  0.532163  0.351081  0.771159  0.348664  1.000000e+00  \n",
      "\n",
      "[32 rows x 32 columns]\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the train dataset\n",
    "train_df = pd.read_csv('/data/datasets/GesturePhaseSegmentationProcessed/split_train.csv')\n",
    "\n",
    "# Display basic information about the dataset.\n",
    "# DataFrame.info() prints its report itself and returns None, so it is called\n",
    "# directly: wrapping it in print() appends a stray \"None\" line to the output.\n",
    "print(\"Basic Information about the Train Dataset:\")\n",
    "train_df.info()\n",
    "\n",
    "# Display the first few rows of the dataset\n",
    "print(\"\\nFirst few rows of the Train Dataset:\")\n",
    "print(train_df.head())\n",
    "\n",
    "# Summary statistics for numerical columns\n",
    "print(\"\\nSummary Statistics for Numerical Columns:\")\n",
    "print(train_df.describe())\n",
    "\n",
    "# Check for missing values (per-column count of nulls)\n",
    "print(\"\\nMissing Values in the Train Dataset:\")\n",
    "print(train_df.isnull().sum())\n",
    "\n",
    "# Check the distribution of the target column 'Phase' to see the class balance\n",
    "print(\"\\nDistribution of the Target Column 'Phase':\")\n",
    "print(train_df['Phase'].value_counts())\n",
    "\n",
    "# Check the data types of the columns\n",
    "print(\"\\nData Types of the Columns:\")\n",
    "print(train_df.dtypes)\n",
    "\n",
    "# Analyze the correlation between numerical features.\n",
    "# Only numeric columns are selected, so the object-typed 'Phase' target is left out.\n",
    "numeric_columns = train_df.select_dtypes(include=[np.number])\n",
    "print(\"\\nCorrelation Matrix for Numerical Features:\")\n",
    "print(numeric_columns.corr())\n",
    "\n",
    "# Additional EDA steps can be added here as needed\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4f002593",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T18:16:55.310697Z",
     "iopub.status.busy": "2024-10-18T18:16:55.310378Z",
     "iopub.status.idle": "2024-10-18T18:16:55.817857Z",
     "shell.execute_reply": "2024-10-18T18:16:55.817008Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Function to preprocess data\n",
    "def preprocess_data(train_df, dev_df, test_df):\n",
    "    \"\"\"Impute, scale and label-encode the train/dev/test splits.\n",
    "\n",
    "    The numeric pipeline (mean imputation, then standard scaling) and the\n",
    "    label encoder are fitted on the training split only and then applied\n",
    "    to the remaining splits.\n",
    "\n",
    "    Args:\n",
    "        train_df: Training frame holding the features and the 'Phase' target.\n",
    "        dev_df: Development frame; must also contain 'Phase'.\n",
    "        test_df: Test frame; it is transformed as-is, no target is dropped.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (X_train, y_train, X_dev, y_dev, X_test): the transformed\n",
    "        feature matrices plus the encoded train/dev labels.\n",
    "    \"\"\"\n",
    "    # Work on private copies so the caller's frames are never modified\n",
    "    train_part, dev_part, test_part = (\n",
    "        frame.copy() for frame in (train_df, dev_df, test_df)\n",
    "    )\n",
    "\n",
    "    # Split features from the target; the test split is used unchanged\n",
    "    target_col = 'Phase'\n",
    "    X_train, y_train = train_part.drop(columns=[target_col]), train_part[target_col]\n",
    "    X_dev, y_dev = dev_part.drop(columns=[target_col]), dev_part[target_col]\n",
    "    X_test = test_part\n",
    "\n",
    "    # Columns of the training features with a numeric dtype\n",
    "    numeric_features = X_train.select_dtypes(include=[np.number]).columns\n",
    "\n",
    "    # Mean-impute, then standardise, the numeric columns\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            (\n",
    "                'num',\n",
    "                Pipeline(steps=[\n",
    "                    ('imputer', SimpleImputer(strategy='mean')),\n",
    "                    ('scaler', StandardScaler()),\n",
    "                ]),\n",
    "                numeric_features,\n",
    "            ),\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    # Fit on train only; dev and test reuse the fitted statistics\n",
    "    X_train_preprocessed = preprocessor.fit_transform(X_train)\n",
    "    X_dev_preprocessed, X_test_preprocessed = (\n",
    "        preprocessor.transform(split) for split in (X_dev, X_test)\n",
    "    )\n",
    "\n",
    "    # Encode the string labels; the mapping is learned from the training labels\n",
    "    label_encoder = LabelEncoder()\n",
    "    y_train_encoded = label_encoder.fit_transform(y_train)\n",
    "    y_dev_encoded = label_encoder.transform(y_dev)\n",
    "\n",
    "    return (\n",
    "        X_train_preprocessed,\n",
    "        y_train_encoded,\n",
    "        X_dev_preprocessed,\n",
    "        y_dev_encoded,\n",
    "        X_test_preprocessed,\n",
    "    )\n",
    "\n",
    "# Read the dev and test splits (the test file name suggests it carries no target column)\n",
    "dev_df = pd.read_csv('/data/datasets/GesturePhaseSegmentationProcessed/split_dev.csv')\n",
    "test_df = pd.read_csv('/data/datasets/GesturePhaseSegmentationProcessed/split_test_wo_target.csv')\n",
    "\n",
    "# Run the shared preprocessing and unpack its five results\n",
    "(\n",
    "    X_train_preprocessed,\n",
    "    y_train_encoded,\n",
    "    X_dev_preprocessed,\n",
    "    y_dev_encoded,\n",
    "    X_test_preprocessed,\n",
    ") = preprocess_data(train_df, dev_df, test_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "977c0439",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T18:16:55.834541Z",
     "iopub.status.busy": "2024-10-18T18:16:55.834318Z",
     "iopub.status.idle": "2024-10-18T18:16:55.871266Z",
     "shell.execute_reply": "2024-10-18T18:16:55.870501Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of highly correlated features removed: 4\n",
      "Features removed: {'X11', 'X8', 'X28', 'X27'}\n"
     ]
    }
   ],
   "source": [
    "# Correlation analysis over the numeric feature columns only; the object-typed\n",
    "# target 'Phase' is not picked up by the numeric dtype filter.\n",
    "numeric_columns = train_df.select_dtypes(include=[np.number]).columns\n",
    "correlation_matrix = train_df.loc[:, numeric_columns].corr()\n",
    "\n",
    "# Absolute correlation above which a feature is flagged for removal\n",
    "correlation_threshold = 0.9\n",
    "\n",
    "# Flag every column whose absolute correlation with any column that precedes it\n",
    "# (strict lower triangle of the matrix) exceeds the threshold, so the earlier\n",
    "# column of each correlated pair is the one that is kept.\n",
    "abs_corr_values = correlation_matrix.abs().to_numpy()\n",
    "highly_correlated_features = {\n",
    "    column\n",
    "    for position, column in enumerate(correlation_matrix.columns)\n",
    "    if (abs_corr_values[position, :position] > correlation_threshold).any()\n",
    "}\n",
    "\n",
    "# Remove highly correlated features from the datasets\n",
    "def remove_highly_correlated_features(df, highly_correlated_features):\n",
    "    \"\"\"Return a new frame without the columns in ``highly_correlated_features``.\n",
    "\n",
    "    The input ``df`` is copied first, so it is left unmodified.\n",
    "    \"\"\"\n",
    "    return df.copy().drop(columns=highly_correlated_features)\n",
    "\n",
    "# Drop the flagged columns from every split so they keep matching feature sets\n",
    "train_df_cleaned = remove_highly_correlated_features(train_df, highly_correlated_features)\n",
    "dev_df_cleaned = remove_highly_correlated_features(dev_df, highly_correlated_features)\n",
    "test_df_cleaned = remove_highly_correlated_features(test_df, highly_correlated_features)\n",
    "\n",
    "# Report what was removed. The names are sorted before printing because the\n",
    "# iteration order of a set of strings can differ between interpreter runs,\n",
    "# which would make this cell's output change after a kernel restart.\n",
    "removed_features = sorted(highly_correlated_features)\n",
    "print(f\"Number of highly correlated features removed: {len(removed_features)}\")\n",
    "print(f\"Features removed: {removed_features}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7784c1f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T18:17:05.716415Z",
     "iopub.status.busy": "2024-10-18T18:17:05.715562Z",
     "iopub.status.idle": "2024-10-18T18:17:06.759767Z",
     "shell.execute_reply": "2024-10-18T18:17:06.758936Z"
    }
   },
   "outputs": [],
   "source": [
    "from metagpt.tools.libs.data_preprocess import get_column_info\n",
    "\n",
    "# Print get_column_info() for each cleaned split, one heading per split\n",
    "cleaned_splits = [\n",
    "    (\"Column Information for Cleaned Train Dataset:\", train_df_cleaned),\n",
    "    (\"\\nColumn Information for Cleaned Dev Dataset:\", dev_df_cleaned),\n",
    "    (\"\\nColumn Information for Cleaned Test Dataset:\", test_df_cleaned),\n",
    "]\n",
    "for heading, cleaned_split in cleaned_splits:\n",
    "    column_info = get_column_info(cleaned_split)\n",
    "    print(heading)\n",
    "    print(column_info)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "373160eb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T18:19:00.189355Z",
     "iopub.status.busy": "2024-10-18T18:19:00.188700Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "# Base learners for the stack; the tuple names ('rf', 'xgb') are the prefixes\n",
    "# used by the hyperparameter keys defined below.\n",
    "rf_estimator = RandomForestClassifier(random_state=42)\n",
    "xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)\n",
    "base_models = [('rf', rf_estimator), ('xgb', xgb_estimator)]\n",
    "\n",
    "# Stacking ensemble with a logistic-regression final estimator\n",
    "meta_learner = LogisticRegression(random_state=42)\n",
    "stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_learner)\n",
    "\n",
    "# Candidate values for the RandomForest base learner\n",
    "rf_param_grid = dict(\n",
    "    rf__n_estimators=[100, 200],\n",
    "    rf__max_depth=[None, 10, 20],\n",
    "    rf__min_samples_split=[2, 5],\n",
    "    rf__min_samples_leaf=[1, 2],\n",
    ")\n",
    "\n",
    "# Candidate values for the XGBoost base learner\n",
    "xgb_param_grid = dict(\n",
    "    xgb__n_estimators=[100, 200],\n",
    "    xgb__max_depth=[3, 5, 7],\n",
    "    xgb__learning_rate=[0.01, 0.1],\n",
    "    xgb__subsample=[0.8, 1.0],\n",
    "    xgb__colsample_bytree=[0.8, 1.0],\n",
    ")\n",
    "\n",
    "# One search space covering both base learners\n",
    "param_grid = dict(rf_param_grid)\n",
    "param_grid.update(xgb_param_grid)\n",
    "\n",
    "# GridSearchCV for hyperparameter tuning.\n",
    "# NOTE: the grid holds 24 RandomForest x 48 XGBoost = 1152 combinations, i.e.\n",
    "# 3456 fits of the whole stacking ensemble at cv=3, so this search is very slow.\n",
    "# NOTE(review): X_*_preprocessed were built from the full feature set; the\n",
    "# correlation-pruned *_cleaned frames are not used for training - confirm intended.\n",
    "grid_search = GridSearchCV(estimator=stacking_model, param_grid=param_grid, cv=3, scoring='f1_weighted', verbose=2, n_jobs=-1)\n",
    "grid_search.fit(X_train_preprocessed, y_train_encoded)\n",
    "\n",
    "# Best model from GridSearchCV\n",
    "best_model = grid_search.best_estimator_\n",
    "\n",
    "# Predictions on train and dev sets\n",
    "y_train_pred = best_model.predict(X_train_preprocessed)\n",
    "y_dev_pred = best_model.predict(X_dev_preprocessed)\n",
    "\n",
    "# preprocess_data() above returns only the encoded arrays and keeps its\n",
    "# LabelEncoder local, so no `label_encoder` exists at notebook scope and the\n",
    "# inverse_transform calls below would raise NameError. LabelEncoder orders its\n",
    "# classes by sorting the labels, so refitting on the training labels yields\n",
    "# the same label <-> code mapping that produced y_train_encoded.\n",
    "label_encoder = LabelEncoder().fit(train_df['Phase'])\n",
    "\n",
    "# Convert predictions back to original labels\n",
    "y_train_pred_labels = label_encoder.inverse_transform(y_train_pred)\n",
    "y_dev_pred_labels = label_encoder.inverse_transform(y_dev_pred)\n",
    "\n",
    "# F1 weighted score on train and dev sets, measured against the original labels.\n",
    "# The raw frames are used directly: the correlation-cleaned frames carry the same\n",
    "# 'Phase' column, so there is no need to depend on the cleaning cell here.\n",
    "train_f1_weighted = f1_score(train_df['Phase'], y_train_pred_labels, average='weighted')\n",
    "dev_f1_weighted = f1_score(dev_df['Phase'], y_dev_pred_labels, average='weighted')\n",
    "\n",
    "print(f\"Train F1 Weighted Score: {train_f1_weighted}\")\n",
    "print(f\"Dev F1 Weighted Score: {dev_f1_weighted}\")\n",
    "\n",
    "# Predictions on test set\n",
    "y_test_pred = best_model.predict(X_test_preprocessed)\n",
    "y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)\n",
    "\n",
    "# Save predictions to CSV files (one 'target' column, no index)\n",
    "import os\n",
    "output_dir = '../workspace/GesturePhaseSegmentationProcessed'\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "pd.DataFrame({'target': y_dev_pred_labels}).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n",
    "pd.DataFrame({'target': y_test_pred_labels}).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f915b9d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-10-18T18:54:05.701122Z",
     "iopub.status.busy": "2024-10-18T18:54:05.700488Z",
     "iopub.status.idle": "2024-10-18T18:59:39.369787Z",
     "shell.execute_reply": "2024-10-18T18:59:39.368965Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from sklearn.ensemble import StackingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.compose import ColumnTransformer\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "# Load the train, dev and test splits in one pass\n",
    "split_paths = (\n",
    "    '/data/datasets/GesturePhaseSegmentationProcessed/split_train.csv',\n",
    "    '/data/datasets/GesturePhaseSegmentationProcessed/split_dev.csv',\n",
    "    '/data/datasets/GesturePhaseSegmentationProcessed/split_test_wo_target.csv',\n",
    ")\n",
    "train_df, dev_df, test_df = (pd.read_csv(split_path) for split_path in split_paths)\n",
    "\n",
    "# Define the preprocess_data function\n",
    "def preprocess_data(train_df, dev_df, test_df):\n",
    "    \"\"\"Impute, scale and label-encode the splits, returning the encoder too.\n",
    "\n",
    "    The numeric pipeline (mean imputation, then standard scaling) and the\n",
    "    label encoder are fitted on the training split only and then applied\n",
    "    to the remaining splits.\n",
    "\n",
    "    Args:\n",
    "        train_df: Training frame holding the features and the 'Phase' target.\n",
    "        dev_df: Development frame; must also contain 'Phase'.\n",
    "        test_df: Test frame; it is transformed as-is, no target is dropped.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (X_train, y_train, X_dev, y_dev, X_test, label_encoder): the\n",
    "        transformed feature matrices, the encoded train/dev labels and the\n",
    "        fitted LabelEncoder for mapping predictions back to label names.\n",
    "    \"\"\"\n",
    "    # Work on private copies so the caller's frames are never modified\n",
    "    train_part, dev_part, test_part = (\n",
    "        frame.copy() for frame in (train_df, dev_df, test_df)\n",
    "    )\n",
    "\n",
    "    # Split features from the target; the test split is used unchanged\n",
    "    target_col = 'Phase'\n",
    "    X_train, y_train = train_part.drop(columns=[target_col]), train_part[target_col]\n",
    "    X_dev, y_dev = dev_part.drop(columns=[target_col]), dev_part[target_col]\n",
    "    X_test = test_part\n",
    "\n",
    "    # Mean-impute, then standardise, the numeric columns of the training features\n",
    "    numeric_features = X_train.select_dtypes(include=[np.number]).columns\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            (\n",
    "                'num',\n",
    "                Pipeline(steps=[\n",
    "                    ('imputer', SimpleImputer(strategy='mean')),\n",
    "                    ('scaler', StandardScaler()),\n",
    "                ]),\n",
    "                numeric_features,\n",
    "            ),\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    # Fit on train only; dev and test reuse the fitted statistics\n",
    "    X_train_preprocessed = preprocessor.fit_transform(X_train)\n",
    "    X_dev_preprocessed, X_test_preprocessed = (\n",
    "        preprocessor.transform(split) for split in (X_dev, X_test)\n",
    "    )\n",
    "\n",
    "    # Encode the string labels; the mapping is learned from the training labels\n",
    "    label_encoder = LabelEncoder()\n",
    "    y_train_encoded = label_encoder.fit_transform(y_train)\n",
    "    y_dev_encoded = label_encoder.transform(y_dev)\n",
    "\n",
    "    return (\n",
    "        X_train_preprocessed,\n",
    "        y_train_encoded,\n",
    "        X_dev_preprocessed,\n",
    "        y_dev_encoded,\n",
    "        X_test_preprocessed,\n",
    "        label_encoder,\n",
    "    )\n",
    "\n",
    "# Base learners whose predictions are fed to the final estimator.\n",
    "# `use_label_encoder` is deliberately not passed to XGBClassifier: the targets are\n",
    "# already integer-encoded with LabelEncoder before fitting, and recent XGBoost\n",
    "# releases no longer recognise the flag (passing it only produces an\n",
    "# unused-parameter warning on every fit of the search).\n",
    "base_models = [\n",
    "    ('rf', RandomForestClassifier(random_state=42)),\n",
    "    ('xgb', XGBClassifier(eval_metric='mlogloss', random_state=42))\n",
    "]\n",
    "\n",
    "# Stack the base learners under a logistic-regression final estimator.\n",
    "stacking_model = StackingClassifier(\n",
    "    estimators=base_models,\n",
    "    final_estimator=LogisticRegression(random_state=42)\n",
    ")\n",
    "\n",
    "# Search space for the random forest; the `rf__` prefix addresses the stacked\n",
    "# estimator registered under the name 'rf'.\n",
    "rf_param_grid = {\n",
    "    'rf__n_estimators': [100, 200],\n",
    "    'rf__max_depth': [None, 10, 20],\n",
    "    'rf__min_samples_split': [2, 5],\n",
    "    'rf__min_samples_leaf': [1, 2]\n",
    "}\n",
    "\n",
    "# Search space for XGBoost, addressed through the `xgb__` prefix.\n",
    "xgb_param_grid = {\n",
    "    'xgb__n_estimators': [100, 200],\n",
    "    'xgb__max_depth': [3, 5, 7],\n",
    "    'xgb__learning_rate': [0.01, 0.1],\n",
    "    'xgb__subsample': [0.8, 1.0],\n",
    "    'xgb__colsample_bytree': [0.8, 1.0]\n",
    "}\n",
    "\n",
    "# Merge both spaces into the single dict handed to the search.\n",
    "param_grid = {**rf_param_grid, **xgb_param_grid}\n",
    "\n",
    "# Randomized search over the stacked model: 50 sampled settings, 3-fold CV,\n",
    "# weighted-F1 scoring, and a fixed seed so the sampled settings are repeatable.\n",
    "random_search = RandomizedSearchCV(\n",
    "    estimator=stacking_model,\n",
    "    param_distributions=param_grid,\n",
    "    n_iter=50,\n",
    "    cv=3,\n",
    "    scoring='f1_weighted',\n",
    "    verbose=2,\n",
    "    n_jobs=-1,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "# Preprocess first; every later step consumes these outputs.\n",
    "try:\n",
    "    (\n",
    "        X_train_preprocessed,\n",
    "        y_train_encoded,\n",
    "        X_dev_preprocessed,\n",
    "        y_dev_encoded,\n",
    "        X_test_preprocessed,\n",
    "        label_encoder,\n",
    "    ) = preprocess_data(train_df, dev_df, test_df)\n",
    "except Exception as e:\n",
    "    print(f\"Error during preprocessing: {e}\")\n",
    "    raise\n",
    "\n",
    "# Run the hyperparameter search on the training split.\n",
    "try:\n",
    "    random_search.fit(X_train_preprocessed, y_train_encoded)\n",
    "except Exception as e:\n",
    "    print(f\"Error during model fitting: {e}\")\n",
    "    raise\n",
    "\n",
    "# Keep the best estimator found by the search.\n",
    "best_model = random_search.best_estimator_\n",
    "\n",
    "# Predict on the train and dev splits.\n",
    "try:\n",
    "    y_train_pred, y_dev_pred = (\n",
    "        best_model.predict(features)\n",
    "        for features in (X_train_preprocessed, X_dev_preprocessed)\n",
    "    )\n",
    "except Exception as e:\n",
    "    print(f\"Error during prediction: {e}\")\n",
    "    raise\n",
    "\n",
    "# Map the integer predictions back to the original `Phase` labels.\n",
    "y_train_pred_labels, y_dev_pred_labels = (\n",
    "    label_encoder.inverse_transform(encoded)\n",
    "    for encoded in (y_train_pred, y_dev_pred)\n",
    ")\n",
    "\n",
    "# Weighted F1 against the untouched `Phase` columns of each split.\n",
    "train_f1_weighted = f1_score(\n",
    "    train_df['Phase'], y_train_pred_labels, average='weighted'\n",
    ")\n",
    "dev_f1_weighted = f1_score(\n",
    "    dev_df['Phase'], y_dev_pred_labels, average='weighted'\n",
    ")\n",
    "\n",
    "print(f\"Train F1 Weighted Score: {train_f1_weighted}\")\n",
    "print(f\"Dev F1 Weighted Score: {dev_f1_weighted}\")\n",
    "\n",
    "# Predict on the test split.\n",
    "try:\n",
    "    y_test_pred = best_model.predict(X_test_preprocessed)\n",
    "except Exception as e:\n",
    "    print(f\"Error during test prediction: {e}\")\n",
    "    raise\n",
    "\n",
    "y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)\n",
    "\n",
    "# Write the dev and test predictions as single-column `target` CSVs.\n",
    "output_dir = '../workspace/GesturePhaseSegmentationProcessed'\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "try:\n",
    "    dev_csv_path = os.path.join(output_dir, 'dev_predictions.csv')\n",
    "    pd.DataFrame({'target': y_dev_pred_labels}).to_csv(dev_csv_path, index=False)\n",
    "    test_csv_path = os.path.join(output_dir, 'test_predictions.csv')\n",
    "    pd.DataFrame({'target': y_test_pred_labels}).to_csv(test_csv_path, index=False)\n",
    "except Exception as e:\n",
    "    print(f\"Error saving predictions: {e}\")\n",
    "    raise\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
