import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from utils import check_data

# Soil descriptions in column order: the first entry belongs to "Soil_Type1",
# the last one to "Soil_Type40".  The texts are kept verbatim because they are
# written into the output file as-is.
_SOIL_TYPE_DESCRIPTIONS = (
    "Cathedral family – Rock outcrop complex, extremely stony",
    "Vanet - Ratake families complex, very stony",
    "Haploborolis - Rock outcrop complex, rubbly",
    "Ratake family - Rock outcrop complex, rubbly",
    "Vanet family - Rock outcrop complex complex, rubbly",
    "Vanet - Wetmore families - Rock outcrop complex, stony",
    "Gothic family",
    "Supervisor - Limber families complex",
    "Troutville family, very stony",
    "Bullwark - Catamount families - Rock outcrop complex, rubbly",
    "Bullwark - Catamount families - Rock land complex, rubbly",
    "Legault family - Rock land complex, stony",
    "Catamount family - Rock land - Bullwark family complex, rubbly",
    "Pachic Argiborolis - Aquolis complex",
    "unspecified in the USFS Soil and ELU Survey",
    "Cryaquolis - Cryoborolis complex",
    "Gateview family - Cryaquolis complex",
    "Rogert family, very stony",
    "Typic Cryaquolis - Borohemists complex",
    "Typic Cryaquepts - Typic Cryaquolls complex",
    "Typic Cryaquolls - Leighcan family, till substratum complex",
    "Leighcan family, till substratum, extremely bouldery",
    "Leighcan family, till substratum - Typic Cryaquolls complex",
    "Leighcan family, extremely stony",
    "Leighcan family, warm, extremely stony",
    "Granile - Catamount families complex, very stony",
    "Leighcan family, warm - Rock outcrop complex, extremely stony",
    "Leighcan family - Rock outcrop complex, extremely stony",
    "Como - Legault families complex, extremely stony",
    "Como family - Rock land - Legault family complex, extremely stony",
    "Leighcan - Catamount families complex, extremely stony",
    "Catamount family - Rock outcrop - Leighcan family complex, extremely stony",
    "Leighcan - Catamount families - Rock outcrop complex, extremely stony",
    "Cryorthents - Rock land complex, extremely stony",
    "Cryumbrepts - Rock outcrop - Cryaquepts complex",
    "Bross family - Rock land - Cryumbrepts complex, extremely stony",
    "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony",
    "Leighcan - Moran families - Cryaquolls complex, extremely stony",
    "Moran family - Cryorthents - Leighcan family complex, extremely stony",
    "Moran family - Cryorthents - Rock land complex, extremely stony",
)

# Maps each soil indicator column name ("Soil_Type<N>", N counted from 1) to
# its human-readable description.
SOIL_TYPE_MAPPING = dict(
    zip(
        (
            f"Soil_Type{number}"
            for number in range(1, len(_SOIL_TYPE_DESCRIPTIONS) + 1)
        ),
        _SOIL_TYPE_DESCRIPTIONS,
    )
)

# Maps each wilderness indicator column name ("Wilderness_Area<N>", N counted
# from 1) to the full name of the area, in the order listed here.
WILDERNESS_MAPPING = {
    f"Wilderness_Area{number}": f"{area} Wilderness Area"
    for number, area in enumerate(
        ("Rawah", "Neota", "Comanche Peak", "Cache la Poudre"),
        start=1,
    )
}

################################################################################
# (1) DATA LOADING
################################################################################
print("=" * 50, "STEP 1: DATA LOADING", "=" * 50, sep="\n")

# Both the raw dump and the final file are written next to this script.
base_dir = os.path.dirname(__file__)
RAW_PATH, OUT_PATH = (
    os.path.join(base_dir, file_name)
    for file_name in ('raw.csv', 'covertype.csv')
)

# Request dataset id 31 through the ucimlrepo helper.
print("Fetching data...")
data_instance = fetch_ucirepo(id=31)
print("Data fetched")

# Feature and target tables of the fetched dataset
# (pandas dataframes, per the original note in this script).
X = data_instance.data.features
y = data_instance.data.targets

# Put features and targets side by side and keep an untouched copy on disk
# before any formatting is applied.
df = pd.concat((X, y), axis=1)
df.to_csv(RAW_PATH, index=False)

print("STEP 1 COMPLETED: Data loaded and raw file saved", "=" * 50, sep="\n")


################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT")
print("=" * 50)

# Rename target column to 'label'
if 'Cover_Type' in df.columns:
    df = df.rename(columns={'Cover_Type': 'label'})
    print("Renamed 'Cover_Type' column to 'label'.")

# Everything below needs a 'label' column; fail with an explicit message
# instead of a bare KeyError: 'label' from the filter expression.
if 'label' not in df.columns:
    raise KeyError(
        "Expected a 'Cover_Type' (or 'label') column in the loaded data"
    )

# Keep only rows whose label is 2 or 4 and recode them as a binary target
# (2 -> 0, 4 -> 1).  The explicit .copy() makes the filtered frame an
# independent object, so the assignment on the next line does not write into
# a selection of the original frame (which raises SettingWithCopyWarning on
# classic pandas).
df = df[df['label'].isin([2, 4])].copy()
df['label'] = df['label'].map({2: 0, 4: 1})

print("STEP 2 COMPLETED: Data formatted and cleaned")
print("=" * 50)


################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# Run the shared validation helper imported from utils; the frame it returns
# replaces df for the rest of the script.
df = check_data(df)

# A None result is treated as a validation failure.
if df is None:
    print("ERROR: Data validation failed!")
    # Exit with a non-zero status so callers (shell scripts, CI, make) can
    # detect the failure.  sys.exit is used instead of the interactive
    # exit() helper, which is injected by the site module and would report
    # success (status 0) when called without an argument.
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)


################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
# Section banner: title line followed by a 50-character rule.
print("STEP 4: POSTPROCESSING & SAVE", "=" * 50, sep="\n")

# Recover functions with error on failure
def recover_wilderness_area(row):
    """Return the wilderness-area name encoded by the indicator columns of *row*.

    The columns listed in ``WILDERNESS_MAPPING`` are checked in mapping order
    and the name belonging to the first one whose value equals 1 is returned.
    A column that is absent from the row counts as 0.

    Args:
        row: One data row offering ``.get(column, default)`` and ``.name``;
            the script passes the rows produced by ``df.apply(..., axis=1)``.

    Returns:
        The area name from ``WILDERNESS_MAPPING``.

    Raises:
        ValueError: If none of the mapped columns equals 1 for this row.
    """
    area_name = next(
        (
            label
            for column, label in WILDERNESS_MAPPING.items()
            if row.get(column, 0) == 1
        ),
        None,
    )
    if area_name is None:
        raise ValueError(f"Wilderness_Area not found in row index {row.name}")
    return area_name

def recover_soil_type(row):
    """Return the soil description encoded by the indicator columns of *row*.

    The columns listed in ``SOIL_TYPE_MAPPING`` are checked in mapping order
    and the description belonging to the first one whose value equals 1 is
    returned.  A column that is absent from the row counts as 0.

    Args:
        row: One data row offering ``.get(column, default)`` and ``.name``;
            the script passes the rows produced by ``df.apply(..., axis=1)``.

    Returns:
        The description from ``SOIL_TYPE_MAPPING``.

    Raises:
        ValueError: If none of the mapped columns equals 1 for this row.
    """
    for column in SOIL_TYPE_MAPPING:
        # Skip every indicator that is not switched on for this row.
        if row.get(column, 0) != 1:
            continue
        return SOIL_TYPE_MAPPING[column]
    raise ValueError(f"Soil_Type not found in row index {row.name}")

# Apply recovery
# Each helper is run once per row and yields one text column.  The wilderness
# column is added first, so it is already present when the soil pass runs.
print("Applying recovery functions...")
df['Wilderness_Area'] = df.apply(recover_wilderness_area, axis=1)
df['Soil_type'] = df.apply(recover_soil_type, axis=1)

print("Dropping original one-hot columns...")
# Drop original one-hot columns: every column carrying one of the indicator
# prefixes, except the two recovered text columns created just above.
cols_to_drop = [
    col
    for col in df.columns
    if col.startswith(('Wilderness_Area', 'Soil_Type'))
    and col not in ('Wilderness_Area', 'Soil_type')
]
df.drop(columns=cols_to_drop, inplace=True)

print("Reordering columns...")
# Reorder columns: all remaining features keep their relative order, followed
# by the two recovered columns, with the target last.
trailing_columns = ['Wilderness_Area', 'Soil_type', 'label']
new_order = [
    col for col in df.columns if col not in trailing_columns
] + trailing_columns
df = df[new_order]

print("final shape: ", df.shape)

# Save processed file
df.to_csv(OUT_PATH, index=False)
print(f"Saved cleaned dataset to {OUT_PATH}")

print(
    "STEP 4 COMPLETED: Final data saved",
    "=" * 50,
    "ALL PREPROCESSING STEPS COMPLETED!",
    "=" * 50,
    sep="\n",
)
