import gc
import os
import psutil

import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import warnings  # To suppress warnings
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from autogluon.timeseries.utils.forecast import get_forecast_horizon_index_ts_dataframe
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

import random  # For generating random numbers

# Seed helper used for reproducibility
def seed_everything(seed):
    """Seed NumPy's global RNG and the built-in ``random`` module.

    Args:
        seed: Value passed unchanged to ``np.random.seed`` and
            ``random.seed``.
    """
    for seeder in (np.random.seed, random.seed):
        seeder(seed)


# Fix both generators once, at import time, with the seed 2024.
seed_everything(seed=2024)


# Directory holding the three M5 CSV files. Set the M5_DATA_DIR environment
# variable to point somewhere else; when it is unset, the original hard-coded
# location is used.
M5_DATA_DIR = os.environ.get(
    "M5_DATA_DIR",
    "/home/magics/hdd/sky_ws/hopformer_ws/datasets/kaggle/m5",
)

# Load each raw table and print its row count.
calendar = pd.read_csv(os.path.join(M5_DATA_DIR, "calendar.csv"))
print(f"len(calendar):{len(calendar)}")

sales_train_evaluation = pd.read_csv(os.path.join(M5_DATA_DIR, "sales_train_evaluation.csv"))
print(f"len(sales_train_evaluation): {len(sales_train_evaluation)}")

sell_prices = pd.read_csv(os.path.join(M5_DATA_DIR, "sell_prices.csv"))
print(f"len(sell_prices):{len(sell_prices)}")

# Enhanced memory optimization function with object datatype handling
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Shrink the columns of ``df`` in place by switching to smaller dtypes.

    Integer columns (int16/int32/int64) are cast to the narrowest of
    int8/int16/int32 whose range contains the column's min and max. Float
    columns are cast to float16 or float32 by the same range test. A column
    is never cast to a wider dtype than it already has. Object columns become
    ``category``, except a column named ``'date'``, which is parsed as
    ``%Y-%m-%d`` datetimes. Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final size and the percentage saved.
        use_float16: When true (the default, matching earlier behaviour),
            floats whose range fits are stored as float16. float16 keeps only
            about three significant decimal digits, so pass ``False`` to stop
            at float32 when that rounding is not acceptable.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    # memory_usage() is called with its defaults, before and after.
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min, c_max = df[col].min(), df[col].max()
            if col_type.kind == 'i':
                candidates = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32)]
            else:
                candidates = [(t, np.finfo(t)) for t in float_targets]
            # Candidates are ordered narrowest first; take the first that fits.
            for target, limits in candidates:
                # Inclusive bounds: a value equal to the dtype's own min/max
                # still fits (e.g. 127 in int8). Comparisons with NaN are
                # False, so empty or all-NaN columns keep their dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    # Only ever narrow; casting to the same or a wider dtype
                    # would save nothing.
                    if np.dtype(target).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(target)
                    break
        elif col_type == 'object':
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Memory usage reduced to {end_mem:5.2f} Mb ({reduction:.1f}% reduction)')
    return df

# Shrink the three raw tables, in the order calendar, sell_prices,
# sales_train_evaluation; each call prints its own memory report.
calendar, sell_prices, sales_train_evaluation = (
    reduce_mem_usage(frame)
    for frame in (calendar, sell_prices, sales_train_evaluation)
)

# Reshape the wide sales table into long format: the day columns d_1 through
# d_1941 are unpivoted into a 'd' label column and a 'sales' value column,
# while the identifier columns are repeated on every row.
d_cols_eval = ["d_" + str(day) for day in range(1, 1942)]
series_id_vars = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
sales_train_evaluation_long = pd.melt(
    sales_train_evaluation,
    id_vars=series_id_vars,
    value_vars=d_cols_eval,
    var_name="d",
    value_name="sales",
)
print(f"len(sales_train_evaluation_long): {len(sales_train_evaluation_long)}")

# Replace each calendar event column with integer codes. Values are cast to
# str before encoding. The single encoder is re-fitted on every column, so
# once the loop ends `le` holds only the classes of the last column.
event_columns = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
le = LabelEncoder()
for col in event_columns:
    event_text = calendar[col].astype(str)
    calendar[col] = le.fit_transform(event_text)

# Left-join the calendar columns onto every sales row, matching on the day
# label 'd'.
sales_train_evaluation_long = pd.merge(
    sales_train_evaluation_long, calendar, how="left", on="d"
)

# Left-join the price table, matching on store, item and 'wm_yr_wk'. Kept as a
# separate statement so the pre-merge frame is released before this join runs.
price_keys = ["store_id", "item_id", "wm_yr_wk"]
sales_train_evaluation_long = pd.merge(
    sales_train_evaluation_long, sell_prices, how="left", on=price_keys
)

# Training window: rows whose day label is d_1500 through d_1913.
train_days = [f'd_{i}' for i in range(1500, 1914)]
train_set = sales_train_evaluation_long.loc[sales_train_evaluation_long['d'].isin(train_days)]
print("Train set sample:")
print(f"len(Train set): {len(train_set)}")

# Validation window: the following 28 day labels, d_1914 through d_1941.
validation_days = [f'd_{i}' for i in range(1914, 1942)]
validation_set = sales_train_evaluation_long.loc[sales_train_evaluation_long['d'].isin(validation_days)]
print("Validation set sample:")
print(f"len(Validation set): {len(validation_set)}")

# Day labels of the forecast period, d_1942 through d_1969, as a one-column frame.
forecast_days = [f'd_{i}' for i in range(1942, 1970)]
forecast_df = pd.DataFrame({'d': forecast_days})

# Cross-join the distinct identifier rows with the forecast days, giving one
# row per identifier combination and forecast day.
series_keys = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
prediction_set = pd.merge(
    sales_train_evaluation[series_keys].drop_duplicates(),
    forecast_df,
    how='cross',
)

# Left-join the calendar columns on the day label.
prediction_set = pd.merge(prediction_set, calendar, how='left', on='d')

# Left-join the prices on store, item and 'wm_yr_wk'.
prediction_set = pd.merge(
    prediction_set,
    sell_prices,
    how='left',
    on=['store_id', 'item_id', 'wm_yr_wk'],
)

print("Prediction set sample:")
print(f"len(Prediction set): {len(prediction_set)}")

# Shrink the three derived frames, in the order train, validation,
# prediction; each call prints its own memory report.
train_set, validation_set, prediction_set = (
    reduce_mem_usage(frame)
    for frame in (train_set, validation_set, prediction_set)
)


# Step 1: make sure the training slice's 'date' column holds datetimes.
train_set['date'] = pd.to_datetime(train_set['date'])

# Step 2: columns handed to AutoGluon are the series id, the timestamp and
# everything in `covariate_columns`. Despite its name, that list also contains
# 'sales', which the predictor below uses as its target.
covariate_columns = [
    'dept_id', 'store_id', 'sales', 'wday', 'snap_CA', 'snap_TX', 'snap_WI',
    'sell_price', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]
autogluon_columns = ['id', 'date', *covariate_columns]

# Step 3: copy the selected columns and order the rows by series, then by date.
train_set_for_autogluon = (
    train_set[autogluon_columns]
    .copy()
    .sort_values(by=['id', 'date'])
)

# Step 4: wrap the frame as a TimeSeriesDataFrame, naming the id and timestamp columns.
train_set_for_autogluon = TimeSeriesDataFrame.from_data_frame(
    df=train_set_for_autogluon,
    id_column='id',
    timestamp_column='date',
)

# Downcast once more, then write the result (index included) to CSV.
train_set_for_autogluon = reduce_mem_usage(train_set_for_autogluon)
train_set_for_autogluon.to_csv("train_set_for_autogluon.csv", index=True)

# Configure the forecaster: a 28-step horizon, 'sales' as the target and RMSSE
# as the evaluation metric. Every column in `covariate_columns` other than the
# target is declared as a known covariate, in the same order as the selection.
known_covariate_columns = [name for name in covariate_columns if name != "sales"]
predictor = TimeSeriesPredictor(
    target="sales",
    prediction_length=28,
    eval_metric="RMSSE",
    known_covariates_names=known_covariate_columns,
)

# Lags passed to the tabular models: each of the last 7 days, then 14, 21,
# 28, 56 and 84 days back.
# (Earlier alternative, kept for reference: list(range(28, 43)), i.e. lags 28-42.)
custom_lags = [*range(1, 8), 14, 21, 28, 56, 84]


# LightGBM settings shared by the tabular models.
lgbm_params = dict(
    boosting_type='gbdt',
    # NOTE(review): no 'objective' key is set in this dict, and LightGBM
    # documents tweedie_variance_power as used only by the tweedie
    # objective - confirm whether this value has any effect as configured.
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.015,
    num_leaves=2**11 - 1,         # 2047
    min_data_in_leaf=2**12 - 1,   # 4095
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=3000,
    boost_from_average=False,
    verbose=-1,
)


# Step 2: train three explicitly chosen models under a 300-second time limit.
# DirectTabular and RecursiveTabular both receive the custom lags and the
# LightGBM ('GBM') settings; Theta is given an empty configuration.
tabular_model_names = ('DirectTabular', 'RecursiveTabular')
predictor.fit(
    train_data=train_set_for_autogluon,
    hyperparameters={
        # A fresh settings dict is built for each tabular model.
        **{
            model_name: {
                'lags': custom_lags,
                'tabular_hyperparameters': {'GBM': lgbm_params},
            }
            for model_name in tabular_model_names
        },
        'Theta': {},
    },
    time_limit=300,
    verbosity=4,
)

# Step 1: Ensure 'date' is in datetime format
validation_set['date'] = pd.to_datetime(validation_set['date'])

# Step 2: Select the same columns as for the training frame. 'sales' is kept
# with its observed values for the validation window (it is NOT blanked out),
# so the CSV written below contains the actuals.
covariate_columns = ['dept_id', 'store_id','sales', 'wday', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 
                     'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
validation_set_for_autogluon = validation_set[['id', 'date'] + covariate_columns].copy()

# Step 3: Sort the DataFrame by 'id' and 'date' to ensure correct order
validation_set_for_autogluon = validation_set_for_autogluon.sort_values(by=['id', 'date'])

# Step 4: Convert to AutoGluon TimeSeriesDataFrame, specifying `id_column` and
# `timestamp_column`. The same `from_data_frame` constructor is used as for
# the training frame so both frames are built the same way.
validation_set_for_autogluon = TimeSeriesDataFrame.from_data_frame(
    df=validation_set_for_autogluon,
    id_column='id',
    timestamp_column='date'
)
validation_set_for_autogluon.to_csv("validation_set_for_autogluon.csv", index=True)


# Forecast the 28 days after the training window. The validation frame only
# supplies the future covariate values, so the observed target column is
# dropped first: the actual 'sales' of the forecast window are never handed to
# the predictor.
validation_preds = predictor.predict(
    train_set_for_autogluon,
    known_covariates=validation_set_for_autogluon.drop(columns=['sales']),
)