#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Download and extract the Social Bias Frames dataset.
The dataset contains annotations of social bias in text across various categories.
"""

import os
import sys
import logging
import requests
import tarfile
import shutil
from pathlib import Path
from tqdm import tqdm

# Logging: INFO and above go to a stream handler, each record prefixed with
# its timestamp, logger name and level.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("SBIC-Downloader")

# Remote location of the Social Bias Frames (SBIC v2) archive
DATASET_URL = "https://maartensap.com/social-bias-frames/SBIC.v2.tgz"

# Local layout: everything lives under <two levels above this file>/data/raw/sbic
ROOT_DIR = Path(__file__).parent.parent
RAW_DATA_DIR = ROOT_DIR.joinpath("data", "raw", "sbic")
DOWNLOAD_FILE = RAW_DATA_DIR.joinpath("SBIC.v2.tgz")

def download_file(url, output_path):
    """
    Download a file from a URL with progress tracking.

    The payload is streamed into a temporary ``<name>.part`` file next to
    ``output_path`` and renamed to its final name only after the transfer has
    finished. An interrupted or failed download therefore never leaves a
    truncated file at ``output_path`` that a later run would accept as
    "already exists".

    Args:
        url: URL to download from
        output_path: Path (``str`` or ``Path``) to save the downloaded file

    Returns:
        True if the file was downloaded or already existed, False otherwise
    """
    partial_path = None
    try:
        output_path = Path(output_path)

        # Create directory if it doesn't exist
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Check if file already exists
        if output_path.exists():
            logger.info(f"File already exists: {output_path}")
            return True

        logger.info(f"Downloading from {url} to {output_path}")

        partial_path = output_path.with_name(output_path.name + ".part")

        # stream=True avoids loading the whole archive into memory; the
        # (connect, read) timeout keeps a stalled server from hanging the
        # script forever. The context manager releases the connection.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()

            # Get file size for progress bar; None lets tqdm show a plain
            # counter when the server does not send Content-Length.
            total_size = int(response.headers.get('content-length', 0))
            block_size = 8192

            # Download with progress bar
            with open(partial_path, 'wb') as f:
                with tqdm(total=total_size or None, unit='B', unit_scale=True,
                          desc="Downloading") as pbar:
                    for chunk in response.iter_content(chunk_size=block_size):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))

        # Publish the finished download under its final name in one step.
        partial_path.replace(output_path)

        logger.info(f"Download completed: {output_path}")
        return True

    except Exception as e:
        logger.error(f"Error downloading file: {e}")
        # Best-effort removal of the incomplete temporary file.
        if partial_path is not None:
            try:
                partial_path.unlink()
            except OSError:
                pass
        return False

def _is_within_directory(root, candidate):
    """Return True if *candidate* resolves to *root* or to a path beneath it."""
    resolved = candidate.resolve()
    return resolved == root or root in resolved.parents


def _check_member_is_safe(member, root):
    """Raise tarfile.TarError if *member* would write or link outside *root*."""
    destination = root / member.name
    if not _is_within_directory(root, destination):
        raise tarfile.TarError(f"Unsafe path in archive: {member.name}")

    if member.issym():
        # Symlink targets are interpreted relative to the link's own directory.
        link_target = destination.parent / member.linkname
    elif member.islnk():
        # Hard-link targets are interpreted relative to the archive root.
        link_target = root / member.linkname
    else:
        return

    if not _is_within_directory(root, link_target):
        raise tarfile.TarError(
            f"Unsafe link in archive: {member.name} -> {member.linkname}"
        )


def extract_tgz(tgz_path, extract_dir):
    """
    Extract a .tgz file to the specified directory.

    The archive comes from the network, so every member is validated before
    anything is written: entries whose path (or link target) would resolve
    outside ``extract_dir`` cause the extraction to be refused. Where the
    running Python supports tarfile extraction filters, the ``"data"`` filter
    is applied as well.

    Args:
        tgz_path: Path to the .tgz file
        extract_dir: Directory (``str`` or ``Path``) to extract to

    Returns:
        True if successful, False otherwise
    """
    try:
        extract_dir = Path(extract_dir)
        logger.info(f"Extracting {tgz_path} to {extract_dir}")

        # Create extract directory if it doesn't exist
        extract_dir.mkdir(parents=True, exist_ok=True)
        root = extract_dir.resolve()

        # Interpreters without extraction-filter support reject the keyword,
        # so only pass it when the feature is present.
        extract_kwargs = (
            {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        )

        # Extract the tar.gz file
        with tarfile.open(tgz_path, 'r:gz') as tar:
            # Get list of all members for validation and progress tracking
            members = tar.getmembers()

            # Validate the whole archive first so a malicious entry cannot
            # leave a half-extracted tree behind.
            for member in members:
                _check_member_is_safe(member, root)

            # Extract with progress bar
            for member in tqdm(members, desc="Extracting"):
                tar.extract(member, path=extract_dir, **extract_kwargs)

        logger.info(f"Extraction completed to {extract_dir}")
        return True

    except Exception as e:
        logger.error(f"Error extracting file: {e}")
        return False

def verify_dataset():
    """
    Check that every expected dataset file is present in RAW_DATA_DIR.

    Logs the names of any missing files as a single error message.

    Returns:
        True if all expected files exist, False otherwise
    """
    # File names that must exist directly inside RAW_DATA_DIR
    required_names = (
        "LICENSE",
        "README.md",
        "SBIC.v2.agg.dev.csv",
        "SBIC.v2.agg.trn.csv",
        "SBIC.v2.agg.tst.csv",
        "SBIC.v2.dev.csv",
        "SBIC.v2.trn.csv",
        "SBIC.v2.tst.csv",
    )

    # Keep the names (in listing order) whose path does not exist
    missing_files = [
        name for name in required_names
        if not (RAW_DATA_DIR / name).exists()
    ]

    if not missing_files:
        logger.info("All expected files are present")
        return True

    logger.error(f"Missing files: {', '.join(missing_files)}")
    return False

def main():
    """
    Run the download, extraction and verification steps in order.

    Returns:
        0 when every step succeeds, 1 as soon as one step reports failure
    """
    logger.info("Starting download of Social Bias Frames dataset")

    # Each entry pairs a step with the error logged when that step fails.
    # Steps run lazily and in order; the first failure aborts the pipeline.
    pipeline = (
        (lambda: download_file(DATASET_URL, DOWNLOAD_FILE),
         "Failed to download the dataset"),
        (lambda: extract_tgz(DOWNLOAD_FILE, RAW_DATA_DIR),
         "Failed to extract the dataset"),
        (verify_dataset,
         "Dataset verification failed"),
    )

    for run_step, failure_message in pipeline:
        succeeded = run_step()
        if not succeeded:
            logger.error(failure_message)
            return 1

    logger.info("Social Bias Frames dataset download and extraction completed successfully")
    return 0


# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main())