#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Guide users to download the Implicit Hate Speech dataset and extract it.
This dataset requires completing a survey before downloading.
"""

import os
import sys
import logging
import zipfile
from pathlib import Path
from tqdm import tqdm

# Configure logging at import time: records at INFO level and above are
# emitted through a StreamHandler (which writes to stderr by default), each
# prefixed with a timestamp, the logger name, and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
# Named logger used throughout this script.
logger = logging.getLogger("Implicit-Hate-Downloader")

# GitHub repository page hosting the README (which links to the survey)
REPO_URL = "https://github.com/SALT-NLP/implicit-hate?tab=readme-ov-file"

# Directories
# resolve() makes ROOT_DIR absolute before walking up two levels. Without it,
# a relative __file__ such as "script.py" (a script launched from its own
# directory on Python < 3.9) collapses to "." after .parent.parent, which
# would place the data directory inside the script's folder instead of the
# project root.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Directory that holds the downloaded archive and its extracted contents.
RAW_DATA_DIR = ROOT_DIR / "data" / "raw" / "implicit_hate"
# Location where the user is asked to save the manually downloaded archive.
ZIP_FILE_PATH = RAW_DATA_DIR / "implicit-hate-corpus-nov-2021.zip"

def create_directories():
    """Ensure the raw-data directory for the dataset exists.

    Creates ``RAW_DATA_DIR`` together with any missing parent directories.
    An already-existing directory is left untouched, and the log message
    states which of the two cases occurred instead of always claiming that
    the directory was created.
    """
    already_present = RAW_DATA_DIR.is_dir()
    # exist_ok=True keeps the call idempotent across repeated runs.
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
    if already_present:
        logger.info(f"Directory already exists: {RAW_DATA_DIR}")
    else:
        logger.info(f"Created directory: {RAW_DATA_DIR}")

def display_instructions():
    """Log the manual download steps, dataset summary, and citation.

    Every entry below is emitted as its own INFO record, in order, so the
    output is identical to issuing one ``logger.info`` call per line.
    """
    banner = "=" * 80

    header = (
        "\n" + banner,
        "IMPLICIT HATE SPEECH DATASET DOWNLOAD INSTRUCTIONS",
        banner,
    )

    download_steps = (
        f"This dataset is hosted on GitHub: {REPO_URL}",
        "\nTo access the dataset, please follow these steps:",
        "1. Visit the repository URL: https://github.com/SALT-NLP/implicit-hate",
        "2. Complete the required survey (linked in the README)",
        "3. After completing the survey, you will receive a link to download the dataset",
        f"4. Download the dataset and save it as: {ZIP_FILE_PATH}",
        "   (it's approximately 2MB, expands to 6MB)",
    )

    dataset_summary = (
        "\nDATASET INFORMATION:",
        "- Contains 22,056 tweets with 6,346 implicit hate speech examples",
        "- Each implicit hate tweet has annotations for target demographic group",
        "- Contains implied statements to explain the underlying message",
        "- Categorized into: Grievance, Incitement, Inferiority, Irony,",
        "  Stereotypes, Threats, and Other",
    )

    citation = (
        "\nCITATION:",
        "ElSherief, M., Ziems, C., Muchlinski, D., Anupindi, V., Seybolt, J.,",
        "De Choudhury, M., & Yang, D. (2021). Latent Hatred: A Benchmark for",
        "Understanding Implicit Hate Speech. In Proceedings of the 2021 Conference",
        "on Empirical Methods in Natural Language Processing (EMNLP).",
        banner,
    )

    for section in (header, download_steps, dataset_summary, citation):
        for message in section:
            logger.info(message)

def verify_zip_file():
    """Report whether the dataset archive is present at ``ZIP_FILE_PATH``.

    Returns:
        True when the path exists (logged at INFO level), False otherwise
        (logged at ERROR level).
    """
    archive_present = ZIP_FILE_PATH.exists()
    if not archive_present:
        logger.error(f"ZIP file not found: {ZIP_FILE_PATH}")
        return False

    logger.info(f"Found ZIP file: {ZIP_FILE_PATH}")
    return True

def extract_zip():
    """Extract the dataset archive into the raw data directory.

    The archive at ``ZIP_FILE_PATH`` is integrity-checked first and only then
    unpacked, member by member, into ``RAW_DATA_DIR`` with a progress bar.

    Returns:
        True if every member was extracted; False if the archive is corrupt
        or any error occurred (the error is logged rather than raised).
    """
    try:
        logger.info(f"Extracting {ZIP_FILE_PATH} to {RAW_DATA_DIR}")

        with zipfile.ZipFile(ZIP_FILE_PATH, 'r') as zip_ref:
            # Verify CRCs and headers before writing anything, so that a
            # truncated or corrupted download is rejected up front instead
            # of leaving a partially unpacked dataset on disk.
            corrupt_member = zip_ref.testzip()
            if corrupt_member is not None:
                logger.error(
                    f"Error extracting ZIP file: corrupt member {corrupt_member!r} "
                    "(the download may be incomplete; please download it again)"
                )
                return False

            # Get list of member names in the zip
            file_list = zip_ref.namelist()
            logger.info(f"Found {len(file_list)} files in ZIP archive")

            # Extract members one at a time so the progress bar can advance
            for member in tqdm(file_list, desc="Extracting files"):
                zip_ref.extract(member, RAW_DATA_DIR)
    except Exception as e:
        # Deliberately broad: this is the script's extraction boundary, and
        # any failure (bad archive, I/O error, unsupported compression) is
        # reported to the caller through the boolean return value.
        logger.error(f"Error extracting ZIP file: {e}")
        return False
    else:
        logger.info("Extraction completed successfully")
        return True

def _extract_dataset():
    """Extract the located archive, log the outcome, and return an exit code."""
    logger.info("ZIP file found. Will proceed with extraction.")
    if extract_zip():
        logger.info("Dataset extraction completed successfully!")
        logger.info("You can now proceed with processing the data using the process_implicit_hate.py script")
        return 0

    logger.error("Failed to extract the dataset")
    return 1


def main():
    """Guide the user through downloading and extracting the dataset.

    Flow:
        1. Ensure the raw-data directory exists.
        2. If the extracted dataset folder is already present, stop.
        3. If the archive is present, extract it.
        4. Otherwise show the download instructions, ask the user whether the
           archive has been downloaded, and extract it if it can be found.

    Returns:
        0 when the dataset is (or already was) extracted, 1 otherwise. The
        value is intended to be used as the process exit code.
    """
    logger.info("Preparing for Implicit Hate Speech dataset download and extraction")

    # Create necessary directories
    create_directories()

    # Check if files are already extracted
    if (RAW_DATA_DIR / "implicit-hate-corpus-nov-2021").exists():
        logger.info("Dataset files appear to be already extracted")
        logger.info("You can proceed with processing the data using the process_implicit_hate.py script")
        return 0

    # Archive already downloaded: extract right away
    if verify_zip_file():
        return _extract_dataset()

    # Archive missing: explain how to obtain it
    display_instructions()

    # Passing the prompt to input() (rather than printing it separately) lets
    # line editing redraw it correctly. A closed or non-interactive stdin
    # raises EOFError, which is treated like a "no" answer instead of
    # crashing with a traceback.
    try:
        response = input(
            "\nHave you downloaded the dataset and placed it at the specified location? (yes/no): "
        )
    except EOFError:
        response = ""
    response = response.strip().lower()

    if response not in ('y', 'yes'):
        logger.info("Please download the dataset when you're ready and run this script again")
        return 1

    if not verify_zip_file():
        logger.error("Could not find the ZIP file at the specified location")
        logger.info("Please download the dataset and run this script again")
        return 1

    return _extract_dataset()

if __name__ == "__main__":
    # Run the guided workflow and hand its return value to the shell as the
    # process exit status.
    exit_code = main()
    sys.exit(exit_code)