{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from matplotlib import pyplot as plt\n",
    "import glob\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from datetime import datetime, timezone, timedelta\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def haversine(lat1, lon1, lat2, lon2):\n",
    "    R = 6371.0  # Radius of the Earth in kilometers\n",
    "\n",
    "    dlat = math.radians(lat2 - lat1)\n",
    "    dlon = math.radians(lon2 - lon1)\n",
    "\n",
    "    a = (math.sin(dlat / 2) ** 2 +\n",
    "         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *\n",
    "         math.sin(dlon / 2) ** 2)\n",
    "    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))\n",
    "\n",
    "    distance = R * c\n",
    "    return distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 536/536 [05:18<00:00,  1.68it/s]\n"
     ]
    }
   ],
   "source": [
    "data_path = './../data_cabspotting/'\n",
    "\n",
    "file_paths = glob.glob(data_path + './data_raw/new_*.txt')\n",
    "\n",
    "dfs = []\n",
    "for file_path in tqdm(file_paths):\n",
    "    if os.path.exists(file_path):\n",
    "        df_temp = pd.read_table(file_path, sep=\" \", header=None)\n",
    "        \n",
    "        df_temp[3] = df_temp[3].apply(lambda x: datetime.fromtimestamp(x, tz=timezone.utc).astimezone(timezone(timedelta(hours=-8))))\n",
    "\n",
    "        df_temp = df_temp.sort_values(by=3)\n",
    "        df_temp['prev_latitude'] = df_temp[0].shift(1)\n",
    "        df_temp['prev_longitude'] = df_temp[1].shift(1)\n",
    "\n",
    "        df_temp['distance_km'] = df_temp.apply(lambda row: haversine(row[0], row[1], \n",
    "                                                        row['prev_latitude'], row['prev_longitude']), axis=1)\n",
    "        df_temp['prev_time'] = df_temp[3].shift(1)\n",
    "        df_temp['time_diff'] = (df_temp[3] - df_temp['prev_time']).dt.seconds\n",
    "\n",
    "        df_temp['flag_transition_time'] = np.where((df_temp['time_diff']>100) | (df_temp['distance_km']>2.5), 1, 0)\n",
    "        df_temp['flag_transition_bool'] = df_temp[2].diff().abs()\n",
    "\n",
    "        df_temp['flag_transition'] = np.where(df_temp['flag_transition_time'] + df_temp['flag_transition_bool'] > 0.5, 1, 0)\n",
    "        df_temp['trip_id'] = (df_temp['flag_transition'] == 1).cumsum()\n",
    "\n",
    "        df_temp = df_temp.drop(\n",
    "            columns=['prev_latitude', 'prev_longitude', \n",
    "                     'prev_time', 'flag_transition_time', \n",
    "                     'flag_transition_bool', 'flag_transition'])\n",
    "\n",
    "        df_temp.fillna(0, inplace=True)\n",
    "        \n",
    "        file_name = os.path.basename(file_path).replace('new_', '').replace('.txt', '')\n",
    "        df_temp['driver'] = file_name\n",
    "        \n",
    "        output_path = os.path.join(data_path, f\"processed_{file_name}.csv\")\n",
    "        df_temp.to_csv(output_path, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
