import os
import zipfile
import shutil
import pandas as pd
import glob
import csv


def prepare_aerial_cactus(directory):
    """Unpack the image archives and derive ``test.csv`` from the sample submission.

    ``train.zip`` and ``test.zip`` are extracted into ``directory`` and then
    deleted; an archive that is absent is skipped silently. When
    ``sample_submission.csv`` exists, a copy without its ``has_cactus`` column
    is written to ``test.csv``.

    Args:
        directory: Folder holding the downloaded competition files.
    """
    for archive_name in ('train.zip', 'test.zip'):
        archive_path = os.path.join(directory, archive_name)
        if not os.path.exists(archive_path):
            continue
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(directory)
        # The archive is no longer needed once its contents are on disk.
        os.remove(archive_path)

    submission_path = os.path.join(directory, 'sample_submission.csv')
    if not os.path.exists(submission_path):
        return

    # The test listing is the sample submission minus its target column.
    frame = pd.read_csv(submission_path)
    if 'has_cactus' in frame.columns:
        frame = frame.drop(columns=['has_cactus'])
    frame.to_csv(os.path.join(directory, 'test.csv'), index=False)


def prepare_cmi_detect(directory):
    """Write a placeholder ``sample_submission.csv`` built from ``test.csv``.

    The output has one row per distinct ``sequence_id`` value read from
    ``test.csv`` and a ``gesture`` column filled with a fixed stub message.

    Args:
        directory: Folder containing ``test.csv``; the submission file is
            written into the same folder.
    """
    stub_message = "This is a stub. Please evaluate it using kaggle_evaluation."

    test_frame = pd.read_csv(os.path.join(directory, 'test.csv'))
    distinct_ids = test_frame["sequence_id"].unique()

    # The scalar message is broadcast to every sequence_id row.
    stub_frame = pd.DataFrame({
        "sequence_id": distinct_ids,
        "gesture": stub_message,
    })
    stub_frame.to_csv(
        os.path.join(directory, 'sample_submission.csv'), index=False
    )

def prepare_denoising_dirty(directory):
    """Unpack the denoising archives and build ``test.csv`` / ``train.csv``.

    Steps, each skipped when its input is missing:

    1. Extract and delete ``train.zip``, ``test.zip``,
       ``sampleSubmission.csv.zip`` and ``train_cleaned.zip``.
    2. Rename ``sampleSubmission.csv`` to ``sample_submission.csv``.
    3. Write ``test.csv`` as ``sample_submission.csv`` without its ``value``
       column.
    4. Write ``train.csv`` pairing every entry of ``train/`` with the entry of
       the same name in ``train_cleaned/``. Paths are stored relative to
       ``directory``; entries without a counterpart are left out.

    Args:
        directory: Folder holding the downloaded competition files.
    """
    # 1. Unzip files
    for archive_name in (
        'train.zip',
        'test.zip',
        'sampleSubmission.csv.zip',
        'train_cleaned.zip',
    ):
        archive_path = os.path.join(directory, archive_name)
        if os.path.exists(archive_path):
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(directory)
            os.remove(archive_path)

    # 2. Rename sampleSubmission.csv -> sample_submission.csv
    old_sample = os.path.join(directory, 'sampleSubmission.csv')
    new_sample = os.path.join(directory, 'sample_submission.csv')
    if os.path.exists(old_sample):
        # os.replace overwrites an existing target on every platform, so a
        # rerun over a partially prepared folder does not fail on Windows.
        os.replace(old_sample, new_sample)

    # 3. Create test.csv by removing 'value' column from sample_submission.csv
    if os.path.exists(new_sample):
        df = pd.read_csv(new_sample)
        if 'value' in df.columns:
            df = df.drop(columns=['value'])
        df.to_csv(os.path.join(directory, 'test.csv'), index=False)

    # 4. Generate train.csv with paths to files in train and train_cleaned
    train_dir = os.path.join(directory, 'train')
    train_cleaned_dir = os.path.join(directory, 'train_cleaned')
    if os.path.isdir(train_dir) and os.path.isdir(train_cleaned_dir):
        rows = [
            {
                'train': os.path.join('train', fname),
                'train_cleaned': os.path.join('train_cleaned', fname),
            }
            for fname in sorted(os.listdir(train_dir))
            if os.path.exists(os.path.join(train_cleaned_dir, fname))
        ]
        # Explicit columns keep the header even when no pair was found;
        # otherwise the CSV would be header-less and unreadable.
        pairs = pd.DataFrame(rows, columns=['train', 'train_cleaned'])
        pairs.to_csv(os.path.join(directory, 'train.csv'), index=False)


def prepare_dog_breed_identification(directory):
    """Derive ``test.csv`` and ``train.csv`` for the dog-breed dataset.

    ``test.csv`` receives only the ``id`` column of ``sample_submission.csv``;
    ``train.csv`` is a copy of ``labels.csv`` (the original is kept). A
    missing input is reported on stdout and its step is skipped.

    Args:
        directory: Folder holding the competition CSV files.
    """
    # Build test.csv from the id column of the sample submission.
    submission_src = os.path.join(directory, 'sample_submission.csv')
    if not os.path.exists(submission_src):
        print(f"File sample_submission.csv not found in {directory}")
    else:
        ids_only = pd.read_csv(submission_src)[['id']]
        ids_only.to_csv(os.path.join(directory, 'test.csv'), index=False)

    # Duplicate labels.csv under the train.csv name.
    labels_src = os.path.join(directory, 'labels.csv')
    if not os.path.exists(labels_src):
        print(f"File labels.csv not found in {directory}")
    else:
        shutil.copy(labels_src, os.path.join(directory, 'train.csv'))


def prepare_dogs_vs_cats_kernels(directory):
    """Unpack the dogs-vs-cats archives and build ``test.csv`` / ``train.csv``.

    1. ``train.zip`` and ``test.zip`` are extracted into ``directory`` and
       deleted; absent archives are skipped.
    2. If ``sample_submission.csv`` exists, ``test.csv`` is written as a copy
       without the ``label`` column.
    3. ``train.csv`` is always written with columns ``path`` and ``label``.
       Every entry of ``<directory>/train`` whose name contains ``dog`` or
       ``cat`` (case-insensitive, ``dog`` checked first) yields one row;
       other entries are ignored. ``path`` is the path as returned by
       ``glob``, i.e. it includes ``directory``.

    Args:
        directory: Folder holding the downloaded competition files.
    """
    # 1. Unzip train.zip and test.zip
    for zip_name in ('train.zip', 'test.zip'):
        zip_path = os.path.join(directory, zip_name)
        if os.path.exists(zip_path):
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(directory)
            os.remove(zip_path)

    # 2. Create test.csv from sample_submission.csv (remove the 'label' column)
    sample_csv = os.path.join(directory, 'sample_submission.csv')
    if os.path.exists(sample_csv):
        df = pd.read_csv(sample_csv)
        if 'label' in df.columns:
            df = df.drop(columns=['label'])
        df.to_csv(os.path.join(directory, 'test.csv'), index=False)

    # 3. Create train.csv with image paths and labels
    train_dir = os.path.join(directory, 'train')
    rows = []
    # glob order depends on the filesystem; sorting makes train.csv
    # reproducible across runs and machines.
    for file_path in sorted(glob.glob(os.path.join(train_dir, '*'))):
        lowered_name = os.path.basename(file_path).lower()
        if 'dog' in lowered_name:
            label = 'dog'
        elif 'cat' in lowered_name:
            label = 'cat'
        else:
            continue  # Skip files without dog/cat in the name
        rows.append({'path': file_path, 'label': label})

    # Explicit columns keep the header when no labelled file was found;
    # otherwise the CSV would be header-less and unreadable.
    train_df = pd.DataFrame(rows, columns=['path', 'label'])
    train_df.to_csv(os.path.join(directory, 'train.csv'), index=False)


def prepare_jigsaw_toxic(directory):
    """Extract and delete the zipped CSV files of the jigsaw-toxic dataset.

    Handles ``train.csv.zip``, ``test.csv.zip`` and
    ``sample_submission.csv.zip``. Each archive found is unpacked into
    ``directory`` and removed; each one missing is reported on stdout.

    Args:
        directory: Folder holding the downloaded archives.
    """
    for archive_name in (
        'train.csv.zip',
        'test.csv.zip',
        'sample_submission.csv.zip',
    ):
        archive_path = os.path.join(directory, archive_name)
        if not os.path.exists(archive_path):
            print(f"File not found: {archive_name}")
            continue

        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(directory)
        os.remove(archive_path)
        print(f"Unzipped and removed: {archive_name}")


def prepare_leaf_classification(directory):
    """Extract the leaf-classification archives into ``directory``.

    Each of ``images.zip``, ``train.csv.zip``, ``test.csv.zip`` and
    ``sample_submission.csv.zip`` is unpacked in place and then removed. A
    missing archive is reported on stdout and skipped.

    Args:
        directory: Folder holding the downloaded archives.
    """
    expected_archives = (
        'images.zip',
        'train.csv.zip',
        'test.csv.zip',
        'sample_submission.csv.zip',
    )

    for archive_name in expected_archives:
        archive_path = os.path.join(directory, archive_name)
        if not os.path.exists(archive_path):
            print(f"File {archive_name} not found in {directory}")
            continue

        # Unpack first, then drop the archive.
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(directory)
        os.remove(archive_path)


def prepare_nomad(directory):
    """Extract and delete the archives of the nomad dataset.

    Archives are processed in this order: ``train.csv.zip``,
    ``test.csv.zip``, ``sample_submission.csv.zip``, ``test.zip``,
    ``train.zip``. A missing archive or one that is not a valid zip file is
    reported on stdout; an invalid archive is left on disk.

    Args:
        directory: Folder holding the downloaded archives.
    """
    archive_names = (
        "train.csv.zip",
        "test.csv.zip",
        "sample_submission.csv.zip",
        "test.zip",
        "train.zip",
    )

    for archive_name in archive_names:
        archive_path = os.path.join(directory, archive_name)
        if not os.path.exists(archive_path):
            print(f"File not found: {archive_name}")
            continue

        try:
            with zipfile.ZipFile(archive_path, 'r') as archive:
                archive.extractall(directory)
        except zipfile.BadZipFile:
            print(f"Bad zip file: {archive_name}")
        else:
            # Only a successfully extracted archive is removed.
            os.remove(archive_path)
            print(f"Unzipped and removed: {archive_name}")


def process_pizza(directory_path):
    """Remove zip files and convert ``test.json`` / ``train.json`` to CSV.

    Every entry of ``directory_path`` whose name ends in ``.zip`` is deleted
    without being extracted. Then ``test.json`` and ``train.json``, when
    present, are loaded with pandas and written next to the source as
    ``test.csv`` / ``train.csv``. A failure while converting one file is
    printed and does not stop the other conversion.

    Args:
        directory_path: Folder holding the downloaded competition files.
    """
    # Delete all zip files
    zip_names = [
        name for name in os.listdir(directory_path) if name.endswith('.zip')
    ]
    for name in zip_names:
        zip_path = os.path.join(directory_path, name)
        os.remove(zip_path)
        print(f"Deleted file: {zip_path}")

    # Process test.json and train.json
    for stem in ('test', 'train'):
        json_path = os.path.join(directory_path, stem + '.json')
        if not os.path.exists(json_path):
            continue
        csv_path = os.path.join(directory_path, stem + '.csv')
        try:
            pd.read_json(json_path).to_csv(csv_path, index=False)
            print(f"Created file: {csv_path}")
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(f"Error processing {json_path}: {e}")


def prepare_spooky(directory):
    """Extract and delete the archives of the spooky dataset.

    Handles ``train.zip``, ``test.zip`` and ``sample_submission.zip`` in that
    order. A missing archive or one that is not a valid zip file is reported
    on stdout; an invalid archive is left on disk.

    Args:
        directory: Folder holding the downloaded archives.
    """

    def unpack(zip_name):
        # Extract one archive into `directory` and remove it afterwards.
        zip_path = os.path.join(directory, zip_name)
        if not os.path.exists(zip_path):
            print(f"File not found: {zip_name}")
            return
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(directory)
            os.remove(zip_path)
            print(f"Unzipped and removed: {zip_name}")
        except zipfile.BadZipFile:
            print(f"Bad zip file: {zip_name}")

    for zip_name in ("train.zip", "test.zip", "sample_submission.zip"):
        unpack(zip_name)


def prepare_text_normalization_en(directory):
    """Unpack the English text-normalization files and give them standard names.

    1. Extract ``en_train.csv.zip``, ``en_test_2.csv.zip`` and
       ``en_sample_submission_2.csv.zip`` when present (a missing archive is
       reported on stdout).
    2. Delete every entry of ``directory`` whose name ends in ``.zip``.
    3. Rename ``en_train.csv`` -> ``train.csv``, ``en_test_2.csv`` ->
       ``test.csv`` and ``en_sample_submission_2.csv`` ->
       ``sample_submission.csv``.

    The function can be run again on an already-prepared folder: a rename
    whose source is gone but whose target exists is treated as done.

    Args:
        directory: Folder holding the downloaded competition files.

    Raises:
        FileNotFoundError: If, for one of the three files, neither the
            original nor the renamed file exists.
    """
    zip_files = [
        "en_train.csv.zip",
        "en_test_2.csv.zip",
        "en_sample_submission_2.csv.zip",
    ]

    for zip_name in zip_files:
        zip_path = os.path.join(directory, zip_name)
        if os.path.exists(zip_path):
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(directory)
            print(f"Unzipped: {zip_name}")
        else:
            print(f"File not found: {zip_name}")

    for filename in os.listdir(directory):
        if filename.endswith('.zip'):
            os.remove(os.path.join(directory, filename))
            print(f"Deleted: {filename}")

    renames = (
        ('en_train.csv', 'train.csv'),
        ('en_test_2.csv', 'test.csv'),
        ('en_sample_submission_2.csv', 'sample_submission.csv'),
    )
    for old_name, new_name in renames:
        source = os.path.join(directory, old_name)
        target = os.path.join(directory, new_name)
        if os.path.exists(source):
            # os.replace overwrites a stale target on every platform.
            os.replace(source, target)
        elif not os.path.exists(target):
            # Nothing to rename and nothing already renamed: a real error.
            raise FileNotFoundError(
                f"Neither {old_name} nor {new_name} found in {directory}"
            )
        # Otherwise the rename happened on an earlier run; nothing to do.


def prepare_text_normalization_ru(directory):
    """Unpack the Russian text-normalization files and give them standard names.

    1. Extract ``ru_train.csv.zip``, ``ru_test_2.csv.zip`` and
       ``ru_sample_submission_2.csv.zip`` when present (a missing archive is
       reported on stdout).
    2. Delete every entry of ``directory`` whose name ends in ``.zip``.
    3. Rename ``ru_train.csv`` -> ``train.csv``, ``ru_test_2.csv`` ->
       ``test.csv`` and ``ru_sample_submission_2.csv`` ->
       ``sample_submission.csv``.

    The function can be run again on an already-prepared folder: a rename
    whose source is gone but whose target exists is treated as done.

    Args:
        directory: Folder holding the downloaded competition files.

    Raises:
        FileNotFoundError: If, for one of the three files, neither the
            original nor the renamed file exists.
    """
    zip_files = [
        "ru_train.csv.zip",
        "ru_test_2.csv.zip",
        "ru_sample_submission_2.csv.zip",
    ]

    for zip_name in zip_files:
        zip_path = os.path.join(directory, zip_name)
        if os.path.exists(zip_path):
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(directory)
            print(f"Unzipped: {zip_name}")
        else:
            print(f"File not found: {zip_name}")

    for filename in os.listdir(directory):
        if filename.endswith('.zip'):
            os.remove(os.path.join(directory, filename))
            print(f"Deleted: {filename}")

    renames = (
        ('ru_train.csv', 'train.csv'),
        ('ru_test_2.csv', 'test.csv'),
        ('ru_sample_submission_2.csv', 'sample_submission.csv'),
    )
    for old_name, new_name in renames:
        source = os.path.join(directory, old_name)
        target = os.path.join(directory, new_name)
        if os.path.exists(source):
            # os.replace overwrites a stale target on every platform.
            os.replace(source, target)
        elif not os.path.exists(target):
            # Nothing to rename and nothing already renamed: a real error.
            raise FileNotFoundError(
                f"Neither {old_name} nor {new_name} found in {directory}"
            )
        # Otherwise the rename happened on an earlier run; nothing to do.


def prepare_make_data_count(dir_path):
    """Derive ``test.csv`` and ``train.csv`` for the make-data-count dataset.

    ``test.csv`` is ``sample_submission.csv`` without its ``type`` column;
    ``train_labels.csv`` is renamed to ``train.csv``. Each step prints what
    it did, or that its input file was not found.

    Args:
        dir_path: Folder holding the competition CSV files.
    """
    # 1. Create test.csv from sample_submission.csv by removing the 'type' column
    submission_src = os.path.join(dir_path, 'sample_submission.csv')
    if not os.path.exists(submission_src):
        print(f"File sample_submission.csv not found in {dir_path}")
    else:
        test_csv_path = os.path.join(dir_path, 'test.csv')
        frame = pd.read_csv(submission_src)
        if 'type' in frame.columns:
            frame = frame.drop(columns=['type'])
        frame.to_csv(test_csv_path, index=False)
        print(f"File test.csv created in {test_csv_path}")

    # 2. Rename train_labels.csv to train.csv
    labels_src = os.path.join(dir_path, 'train_labels.csv')
    if not os.path.exists(labels_src):
        print(f"File train_labels.csv not found in {dir_path}")
    else:
        os.rename(labels_src, os.path.join(dir_path, 'train.csv'))
        print("File train_labels.csv renamed to train.csv")


def prepare_wsdm_cup(directory_path):
    """Convert ``train.parquet`` and ``test.parquet`` to fully quoted CSV files.

    Both parquet files must exist; if either is missing, a message is printed
    and nothing is converted. Otherwise ``train.csv`` and then ``test.csv``
    are written with every field quoted and a backslash as escape character.

    Args:
        directory_path: Folder holding the parquet files; the CSV files are
            written into the same folder.
    """
    conversions = [
        (
            os.path.join(directory_path, split + '.parquet'),
            os.path.join(directory_path, split + '.csv'),
        )
        for split in ('train', 'test')
    ]

    # Check for file existence before converting anything.
    for parquet_path, _ in conversions:
        if not os.path.exists(parquet_path):
            print(f"File {parquet_path} not found.")
            return

    # Read and save train, then test.
    for parquet_path, csv_path in conversions:
        frame = pd.read_parquet(parquet_path)
        frame.to_csv(
            csv_path, index=False, quoting=csv.QUOTE_ALL, escapechar='\\'
        )
        print(f"Saved: {csv_path}")


def prepare_pii(folder_path):
    """Convert every ``*.json`` file in ``folder_path`` to a CSV file.

    Each file whose name ends in ``.json`` is loaded with pandas and written
    next to it under the same name with the final extension replaced by
    ``.csv``. The JSON files are kept. A failure on one file is printed and
    does not stop the remaining conversions.

    Args:
        folder_path: Folder to scan (not recursive).
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith('.json'):
            continue

        json_path = os.path.join(folder_path, filename)
        # Swap only the trailing extension; str.replace would also rewrite
        # any '.json' occurring earlier in the name.
        csv_name = os.path.splitext(filename)[0] + '.csv'
        csv_path = os.path.join(folder_path, csv_name)

        try:
            df = pd.read_json(json_path)
            df.to_csv(csv_path, index=False)
            print(f"Converted: {filename} -> {os.path.basename(csv_path)}")
        except Exception as e:
            # Best effort: report the problem and continue with other files.
            print(f"Error processing {filename}: {e}")


def prepare_mlsp(directory):
    """Extract and delete ``mlsp_contest_dataset.zip``.

    The archive is unpacked into ``directory`` and then removed. If it is
    missing or is not a valid zip file, a message is printed instead; an
    invalid archive is left on disk.

    Args:
        directory: Folder holding the downloaded archive.
    """
    zip_name = "mlsp_contest_dataset.zip"
    zip_path = os.path.join(directory, zip_name)

    if not os.path.exists(zip_path):
        print(f"File not found: {zip_name}")
        return

    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(directory)
    except zipfile.BadZipFile:
        print(f"Bad zip file: {zip_name}")
    else:
        # Only a successfully extracted archive is removed.
        os.remove(zip_path)
        print(f"Unzipped and removed: {zip_name}")