#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Download the P-Stance dataset from Google Drive.
P-Stance is a large dataset for stance detection in the political domain.
"""

import os
import sys
import logging
import shutil
import zipfile
import requests
import tempfile
from pathlib import Path
from tqdm import tqdm
import pandas as pd

# Logging setup: INFO and above go to a stream handler, each message prefixed
# with timestamp, logger name and level.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("P-Stance-Downloader")

# Filesystem layout, anchored at the directory two levels above this file
ROOT_DIR = Path(__file__).parent.parent
RAW_DATA_DIR = ROOT_DIR.joinpath("data", "raw", "pstance")
ZIP_FILE_PATH = RAW_DATA_DIR.joinpath("p-stance.zip")

# CSV files expected after extraction: raw_<split>_<target>.csv, i.e. a
# train/test/val triple for each of the three targets, grouped by target.
EXPECTED_FILES = [
    f"raw_{split}_{target}.csv"
    for target in ("trump", "biden", "bernie")
    for split in ("train", "test", "val")
]

def extract_zip(zip_path, extract_dir):
    """Unpack every member of a zip archive into a directory.

    Args:
        zip_path: Location of the zip archive to read.
        extract_dir: Directory that receives the extracted members.

    Returns:
        True when all members were extracted. False if any exception was
        raised along the way; the error is logged rather than propagated.
    """
    try:
        logger.info(f"Extracting {zip_path} to {extract_dir}")
        with zipfile.ZipFile(zip_path, mode='r') as archive:
            members = archive.namelist()
            logger.info(f"Found {len(members)} files in zip archive")

            # Extract member by member so tqdm can display progress
            progress = tqdm(members, desc="Extracting")
            for member in progress:
                archive.extract(member, extract_dir)

        logger.info("Extraction completed successfully")
    except Exception as e:
        logger.error(f"Error extracting zip file: {e}")
        return False
    return True

def verify_csv_files():
    """Check that each expected CSV file is present and parseable.

    Every name in EXPECTED_FILES is looked up in RAW_DATA_DIR. A file that
    does not exist is recorded as missing; a file that pandas fails to read
    is recorded as invalid. Both lists are logged as warnings when non-empty.

    Returns:
        True only if no file is missing and no file is invalid.
    """
    absent = []
    unreadable = []

    for csv_name in EXPECTED_FILES:
        csv_path = RAW_DATA_DIR / csv_name

        if csv_path.exists():
            # Parsing the whole file is the validity check
            try:
                frame = pd.read_csv(csv_path)
                logger.info(f"Verified CSV file {csv_name}: {len(frame)} rows")
            except Exception as e:
                logger.error(f"Error verifying CSV file {csv_name}: {e}")
                unreadable.append(csv_name)
        else:
            absent.append(csv_name)

    if absent:
        logger.warning(f"Missing files: {', '.join(absent)}")
    if unreadable:
        logger.warning(f"Invalid files: {', '.join(unreadable)}")

    return not absent and not unreadable

def provide_manual_instructions():
    """Log step-by-step instructions for obtaining the dataset by hand.

    The instructions name the Google Drive folder, list every file in
    EXPECTED_FILES, and state where the resulting zip must be placed
    (ZIP_FILE_PATH). Each line is emitted as a separate INFO record.
    """
    rule = "=" * 50
    lines = [
        "\n" + rule,
        "MANUAL DOWNLOAD INSTRUCTIONS",
        rule,
        "To manually download the P-Stance dataset:",
        "1. Navigate to: https://drive.google.com/drive/folders/1so8lY1XKpnhUtTvb15edEz6aeHt7CSuh",
        "2. Download all CSV files and create a zip file named 'p-stance.zip'",
        "   containing the following files:",
        *(f"   - {name}" for name in EXPECTED_FILES),
        f"3. Place the zip file at: {ZIP_FILE_PATH}",
        rule,
    ]
    for line in lines:
        logger.info(line)

def process_zip_file():
    """Extract the dataset archive and verify the resulting CSV files.

    An archive created by zipping a whole folder keeps the CSV files inside a
    sub-directory, so they would land below RAW_DATA_DIR instead of directly
    in it and verification would report them as missing. To accept such
    archives, any expected file that is absent from RAW_DATA_DIR after
    extraction but found somewhere beneath it is moved up before verifying.

    Returns:
        True if the archive exists, extracts without error and every expected
        CSV file passes verification; False otherwise.
    """
    if not ZIP_FILE_PATH.exists():
        logger.error(f"Zip file not found: {ZIP_FILE_PATH}")
        return False

    # Extract the zip file
    if not extract_zip(ZIP_FILE_PATH, RAW_DATA_DIR):
        return False

    # Promote expected files that were extracted into a sub-directory
    for file_name in EXPECTED_FILES:
        target = RAW_DATA_DIR / file_name
        if target.exists():
            continue

        # Sorted so the choice is deterministic if several copies exist
        nested = sorted(p for p in RAW_DATA_DIR.rglob(file_name) if p.is_file())
        if not nested:
            continue  # genuinely missing; verify_csv_files() reports it

        try:
            shutil.move(str(nested[0]), str(target))
            logger.info(f"Moved {nested[0]} to {target}")
        except OSError as e:
            # Leave the file where it is; verification below will flag it
            logger.error(f"Could not move {nested[0]} to {target}: {e}")

    # Verify the extracted files
    return verify_csv_files()

def download_dataset():
    """Make the P-Stance CSV files available in RAW_DATA_DIR.

    Despite its name, this function does not fetch anything over the network.
    It tries, in order:

    1. Reuse the expected CSV files if they all exist and pass verification.
    2. Extract and verify an already present ``p-stance.zip``.
    3. Log manual download instructions and prompt on stdin until the user
       either supplies a usable archive or answers "no".

    Returns:
        True once all expected files are present and valid. False if the user
        declines, or if no answer can be read because stdin is closed or
        non-interactive.
    """
    # Create raw data directory if it doesn't exist
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Check if files are already downloaded
    if all((RAW_DATA_DIR / file).exists() for file in EXPECTED_FILES):
        logger.info("All expected files already exist. Verifying...")
        if verify_csv_files():
            logger.info("All files are valid. No need to download again.")
            return True
        logger.warning("Some files are invalid. Will attempt to download again.")

    # Check if zip file exists
    if ZIP_FILE_PATH.exists():
        logger.info(f"Found zip file: {ZIP_FILE_PATH}")
        logger.info("Extracting files from the zip archive...")
        if process_zip_file():
            logger.info("Successfully extracted all files from the zip archive.")
            return True
        logger.warning("Failed to process the zip file.")

    # Provide manual download instructions
    logger.info("NOTE: Please download the P-Stance dataset manually as a zip file.")
    provide_manual_instructions()

    # Ask user if they want to proceed with manual download
    while True:
        try:
            response = input("\nHave you downloaded the p-stance.zip file? (yes/no): ").strip().lower()
        except EOFError:
            # stdin is closed or not interactive (e.g. CI, piped input): no
            # answer will ever arrive, so fail cleanly instead of crashing.
            logger.error("No input available on stdin; cannot wait for a manual download.")
            logger.info("Please download the zip file and run this script again.")
            return False

        if response in ('y', 'yes'):
            logger.info("Processing the zip file...")
            if process_zip_file():
                logger.info("All files extracted and verified successfully!")
                return True
            # Stay in the loop so the user can fix the archive and retry
            logger.warning("Failed to process the zip file.")
            provide_manual_instructions()
        elif response in ('n', 'no'):
            logger.info("Please download the zip file and run this script again.")
            return False
        else:
            logger.info("Please enter 'yes' or 'no'.")

def main():
    """Run the dataset preparation and map its outcome to an exit code.

    Returns:
        0 when download_dataset() reports success, 1 otherwise.
    """
    logger.info("Starting P-Stance dataset download process")

    # Guard clause for the failure path; success falls through
    if not download_dataset():
        logger.error("P-Stance dataset download process failed.")
        return 1

    logger.info("P-Stance dataset download process completed successfully!")
    return 0

# Script entry point: use main()'s return value as the process exit status
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)