from constants import *
import pandas as pd
import numpy as np
from pathlib import Path

def get_df(year, month, data_dir="../../Downloads/Citibike_dataset"):
    """Load one month of trip data and keep rides starting inside the bounding box.

    A month is stored as numbered chunk files
    ``{data_dir}/{year}{month}-citibike-tripdata/{year}{month}-citibike-tripdata_{k}.csv``
    with ``k`` starting at 1. Chunks are read in order until the first
    missing number, so any number of chunks is supported.

    The bounding box is read from the module-level names ``minLat``,
    ``maxLat``, ``minLng`` and ``maxLng`` (presumably supplied by the
    ``constants`` star-import).

    Args:
        year: Year as it appears in the file names, e.g. ``"2023"``.
        month: Month as it appears in the file names, e.g. ``"01"``.
        data_dir: Directory that contains the per-month folders.

    Returns:
        A DataFrame indexed by ``started_at`` (parsed as datetimes) holding
        the rides that passed both filters below.

    Raises:
        FileNotFoundError: If not even the first chunk of the month exists.
    """
    stem = f"{year}{month}-citibike-tripdata"
    month_dir = Path(data_dir) / stem

    # Collect consecutive chunk files; stop at the first gap in the numbering.
    chunk_paths = []
    chunk_no = 1
    while True:
        chunk_path = month_dir / f"{stem}_{chunk_no}.csv"
        if not chunk_path.exists():
            break
        chunk_paths.append(chunk_path)
        chunk_no += 1

    if not chunk_paths:
        raise FileNotFoundError(f"No trip data chunks found in {month_dir}")

    df = pd.concat([pd.read_csv(path, low_memory=False) for path in chunk_paths])

    # index by the starting time
    df = df.set_index("started_at")
    df.index = pd.to_datetime(df.index)

    # keep rides for which at least one of the start latitude, station name
    # or station id is present
    station_cols = ['start_lat', 'start_station_name', 'start_station_id']
    df = df[df[station_cols].notna().any(axis=1)]

    # filter out rides starting outside the location of interest
    # NOTE(review): comparisons against NaN are False, so a ride with a
    # missing coordinate is not flagged here and stays in the result.
    not_in_ny = (
        (df['start_lat'] < minLat)
        | (df['start_lat'] > maxLat)
        | (df['start_lng'] < minLng)
        | (df['start_lng'] > maxLng)
    )

    return df[~not_in_ny]


def get_requests(df):
    """Assign every ride to one of ``n`` equal-width bands along one axis.

    The axis is chosen by the module-level flag ``USE_LONG``: longitude
    (``start_lng`` between ``minLng`` and ``maxLng``) when it is truthy,
    latitude (``start_lat`` between ``minLat`` and ``maxLat``) otherwise.

    Args:
        df: Rides with ``start_lng`` / ``start_lat`` and ``member_casual``
            columns. It is not modified.

    Returns:
        A DataFrame with the same index as ``df`` and two columns:
        ``requests`` (integer band number) and ``casual`` (True where
        ``member_casual`` equals ``'casual'``).
    """
    # whether to partition based on longitude or latitude
    if USE_LONG:
        column, low, high = 'start_lng', minLng, maxLng
    else:
        column, low, high = 'start_lat', minLat, maxLat

    # The small epsilon widens the span slightly so that a coordinate equal
    # to the upper bound still scales to a value below n.
    fraction = (df[column] - low) / (high - low + 0.000000001)

    # The integer cast truncates toward zero, like int(). The band number is
    # used directly: dividing by n and multiplying back is a float round trip
    # that can land just below the integer and change the band.
    band = (fraction * n).astype("int64")

    # Build the result from raw arrays so only the two needed columns are
    # materialised and no index alignment is involved.
    return pd.DataFrame(
        {
            'requests': band.to_numpy(),
            'casual': (df['member_casual'] == 'casual').to_numpy(),
        },
        index=df.index,
    )

def prepare_dataset(years, output_path="Requests2023-2025_LatMan.csv"):
    """Build the request table for all twelve months of each year and save it as CSV.

    Every month ``"01"`` .. ``"12"`` of every entry in ``years`` is loaded
    with ``get_df``, the rides are converted with ``get_requests``, the
    result is sorted by its index and written to ``output_path``. Progress
    messages are printed along the way.

    Args:
        years: Iterable of years in the form ``get_df`` expects, e.g. ``["2023"]``.
        output_path: Destination of the CSV file.
    """
    months = [f"{month_no:02d}" for month_no in range(1, 13)]
    df_raw = pd.concat([get_df(year, month) for year in years for month in months])

    print("Data imported!")
    print("Number of rides: ", len(df_raw))
    df = get_requests(df_raw)
    print("Got requests!")
    df.sort_index().to_csv(output_path)

# Runs at module top level: builds the request table for 2023 through 2025.
prepare_dataset([str(year) for year in range(2023, 2026)])