import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from utils import check_data


################################################################################
# (1) DATA LOADING
################################################################################
# Stage banner: a single print call emits the same three lines.
print("=" * 50, "STEP 1: DATA LOADING", "=" * 50, sep="\n")

# Output files are written next to this script
base_dir = os.path.dirname(__file__)
RAW_PATH = os.path.join(base_dir, 'raw.csv')
OUT_PATH = os.path.join(base_dir, 'backdoor.csv')

# Input file names (joined with base_dir when they are read)
train_path = 'UNSW_NB15_training-set.csv'
test_path = 'UNSW_NB15_testing-set.csv'

# Read the two input CSV files, announcing each before it is loaded
print("Loading training data...")
train_csv = os.path.join(base_dir, train_path)
train_df = pd.read_csv(train_csv)
print("Loading test data...")
test_csv = os.path.join(base_dir, test_path)
test_df = pd.read_csv(test_csv)

# Report the (rows, columns) of each input
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Stack both frames row-wise and renumber the index from zero
print("Combining training and test datasets...")
raw_df = pd.concat((train_df, test_df), ignore_index=True)

# Write the combined frame to RAW_PATH, then read that file back in as the
# working frame `df` used by the following steps
raw_df.to_csv(RAW_PATH, index=False)
df = pd.read_csv(RAW_PATH)

print("STEP 1 COMPLETED: Data loaded and raw file saved", "=" * 50, sep="\n")


################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT", "=" * 50, sep="\n")

# Remove the incoming 'label' and 'id' columns; a new 'label' is built below
print("Dropping unnecessary columns...")
df = df.drop(columns=['label', 'id'])


# Recode every column whose name starts with 'is_' from 0/1 to 'No'/'Yes'.
# Values other than 0 or 1 are not in the mapping and therefore become NaN.
print("\nConverting 'is_' columns...")
is_columns = [name for name in df.columns if name.startswith('is_')]
print(f"Found 'is_' columns: {is_columns}")

flag_labels = {0: 'No', 1: 'Yes'}
for col in is_columns:
    df[col] = df[col].map(flag_labels)
    counts = df[col].value_counts().to_dict()
    print(f"Converted {col}: {counts}")

# Keep only rows whose attack category is 'Normal' or 'Backdoor'
print("Filtering data to Normal and Backdoor categories...")
wanted_rows = df['attack_cat'].isin(['Normal', 'Backdoor'])
df = df.loc[wanted_rows].copy()

# Build the binary target: take 'attack_cat' out of the frame and store its
# 0/1 encoding as the new last column 'label'
print("Converting target column to 'label'...")
df['label'] = df.pop('attack_cat').map({'Normal': 0, 'Backdoor': 1})

# Disabled: treat placeholder strings as missing values
# print("Handling unknown strings...")
# df.replace(" -", pd.NA, inplace=True)
# df.replace("-", pd.NA, inplace=True)

# Disabled: discard rows that contain missing values
# print("Dropping rows with NaN values...")
# df = df.dropna()

print("STEP 2 COMPLETED: Data formatted and cleaned", "=" * 50, sep="\n")


################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# Run the project's check_data helper on the formatted frame and continue
# with whatever it returns; a None result is treated as a validation failure.
df = check_data(df)

if df is None:
    print("ERROR: Data validation failed!")
    # Terminate with a non-zero status so shells, schedulers and pipelines
    # see the failure. The bare `exit()` used previously is a helper injected
    # by the `site` module for interactive use: it is not guaranteed to exist
    # (e.g. under `python -S`) and, called without an argument, reports
    # success (status 0).
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)


################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
print("STEP 4: POSTPROCESSING & SAVE", "=" * 50, sep="\n")

# Write the validated frame to OUT_PATH, omitting the index column
df.to_csv(OUT_PATH, index=False)

# Closing banners: stage summary followed by the overall completion notice
print("STEP 4 COMPLETED: Final data saved", "=" * 50, sep="\n")
print("ALL PREPROCESSING STEPS COMPLETED!", "=" * 50, sep="\n")
