import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import os
import numpy as np
from utils import check_data
import pandas as pd
from scipy.io import arff


################################################################################
# (1) DATA LOADING
################################################################################
print("=" * 50)
print("STEP 1: DATA LOADING")
print("=" * 50)

# Every input and output file is resolved relative to this script's directory.
base_dir = os.path.dirname(__file__)
RAW_PATH = os.path.join(base_dir, 'raw.csv')
OUT_PATH = os.path.join(base_dir, 'seismic.csv')

# Parse the ARFF source; loadarff returns the records together with metadata.
print("Loading ARFF file...")
arff_path = os.path.join(base_dir, "seismic-bumps.arff")
arff_data, meta = arff.loadarff(arff_path)

# Wrap the parsed records in a DataFrame for the remaining steps.
print("Converting ARFF data to DataFrame...")
df = pd.DataFrame(arff_data)

# Object-dtype columns are decoded from UTF-8 bytes into regular strings.
print("Converting byte strings to regular strings...")
object_columns = df.select_dtypes(include=[object]).columns
for column in object_columns:
    df[column] = df[column].str.decode('utf-8')

# Keep an unmodified CSV snapshot of the decoded data.
df.to_csv(RAW_PATH, index=False)
print("Data processing complete. Saved to raw.csv")

print("STEP 1 COMPLETED: Data loaded and raw file saved")
print("=" * 50)


################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT")
print("=" * 50)

# The target column is exposed as 'label'; rename it when the source calls it
# 'class'. Frames without a 'class' column pass through unchanged.
column_renames = {'class': 'label'}
if 'class' in df.columns:
    print("Converting 'class' column to 'label'...")
    df = df.rename(columns=column_renames)
    print("Renamed 'class' column to 'label'.")

print("STEP 2 COMPLETED: Data formatted and cleaned")
print("=" * 50)


################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# Run the shared validation helper; its return value replaces the working frame.
df = check_data(df)

# A None result is treated as a validation failure. Abort with a non-zero
# status so shells and pipelines see the failure: the bare `exit()` builtin
# comes from the `site` module (not guaranteed to exist) and, called without
# an argument, would report success.
if df is None:
    print("ERROR: Data validation failed!")
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)


################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
print("STEP 4: POSTPROCESSING & SAVE")
print("=" * 50)

# Translate the short codes stored in the categorical columns into
# descriptive labels.
print("Applying value mappings for categorical columns...")
value_maps = {
    'seismic': {'a': 'lack_of_hazard', 'b': 'low_hazard', 'c': 'high_hazard', 'd': 'danger_state'},
    'seismoacoustic': {'a': 'lack_of_hazard', 'b': 'low_hazard', 'c': 'high_hazard', 'd': 'danger_state'},
    'shift': {'W': 'coal_getting', 'N': 'preparation_shift'},
    'ghazard': {'a': 'lack_of_hazard', 'b': 'low_hazard', 'c': 'high_hazard', 'd': 'danger_state'},
}

for col, mapping in value_maps.items():
    # Columns absent from the frame are skipped rather than treated as errors.
    if col not in df.columns:
        continue

    # Series.map with a dict turns every value missing from the dict into NaN,
    # which would silently discard data after validation has already passed.
    # Report such values and keep them unchanged instead.
    unmapped = set(df[col].dropna().unique()) - set(mapping)
    if unmapped:
        print(f"WARNING: column '{col}' has unmapped values kept as-is: "
              f"{sorted(unmapped, key=str)}")

    # `mapping=mapping` binds the current dict at definition time.
    df[col] = df[col].map(lambda value, mapping=mapping: mapping.get(value, value))
    print(f"Mapped values for column '{col}'")

# Write the final processed data set.
df.to_csv(OUT_PATH, index=False)

print("STEP 4 COMPLETED: Final data saved")
print("=" * 50)
print("ALL PREPROCESSING STEPS COMPLETED!")
print("=" * 50)