#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Download the Article-Bias-Prediction dataset from GitHub.
This script clones the repository and prepares the data for processing.
"""

import os
import sys
import logging
import subprocess
import shutil
from pathlib import Path
from tqdm import tqdm

# Logging setup: INFO-level records carrying timestamp, logger name and
# level, emitted through a single stream handler
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("Article-Bias-Downloader")

# Git URL of the upstream dataset repository
REPO_URL = "https://github.com/ramybaly/Article-Bias-Prediction.git"

# Filesystem layout, anchored at the parent of the directory containing this file
ROOT_DIR = Path(__file__).parent.parent
RAW_DATA_DIR = ROOT_DIR.joinpath("data", "raw", "article_bias")
REPO_DIR = RAW_DATA_DIR.joinpath("Article-Bias-Prediction")

def create_directories():
    """Ensure the raw data directory exists, creating missing parents as needed."""
    target_dir = RAW_DATA_DIR
    # exist_ok makes this a no-op when the directory is already present
    target_dir.mkdir(exist_ok=True, parents=True)
    logger.info(f"Created directory: {target_dir}")

def clone_repository():
    """Clone the Article-Bias-Prediction repository, or update an existing clone.

    If REPO_DIR already exists, ``git pull`` is run inside it; otherwise
    REPO_URL is cloned into REPO_DIR. Git's output is captured rather than
    echoed, and its stderr is logged when the command fails.

    Returns:
        bool: True if the git command succeeded, False if it exited with a
        non-zero status or could not be started at all (for example when
        git is not installed).
    """
    if REPO_DIR.exists():
        logger.info(f"Repository already exists at {REPO_DIR}")
        logger.info("Checking for updates...")

        # Pull the latest changes if the repo already exists
        command = ["git", "-C", str(REPO_DIR), "pull"]
        action = "updating"
        success_message = "Repository updated successfully"
    else:
        logger.info(f"Cloning repository from {REPO_URL} to {REPO_DIR}")

        command = ["git", "clone", REPO_URL, str(REPO_DIR)]
        action = "cloning"
        success_message = "Repository cloned successfully"

    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # git ran but reported a failure; its stderr explains why
        logger.error(f"Error {action} repository: {e.stderr}")
        return False
    except OSError as e:
        # git could not be launched (missing from PATH, not executable, ...);
        # report it like any other failure instead of crashing with a traceback
        logger.error(f"Error {action} repository: could not run git: {e}")
        return False

    logger.info(success_message)
    return True

def verify_data_structure():
    """Check that the cloned repository has the layout this script relies on.

    The media split directory and its three TSV files are mandatory. The
    jsons directory is optional: its absence only produces warnings.

    Returns:
        bool: False if the split directory or any split file is missing,
        True otherwise.
    """
    data_root = REPO_DIR / "data"
    media_splits = data_root / "splits" / "media"

    # Mandatory: the directory holding the media-based splits
    if not media_splits.exists():
        logger.error(f"Expected directory not found: {media_splits}")
        return False

    # Mandatory: every one of the three split files
    absent = []
    for name in ("train.tsv", "valid.tsv", "test.tsv"):
        if not (media_splits / name).exists():
            absent.append(name)
    if absent:
        logger.error(f"Missing required split files: {absent}")
        return False

    # Optional: the jsons directory
    articles_dir = data_root / "jsons"
    if not articles_dir.exists():
        logger.warning(f"jsons directory not found: {articles_dir}")
        logger.warning("Dataset may not contain the full article content")

    logger.info("Data structure verification completed")
    return True

def copy_data_to_raw_dir():
    """Copy the relevant data to our raw data directory for processing.

    The three split files are copied from the repository's
    ``data/splits/media`` directory into ``RAW_DATA_DIR / "splits"``. If the
    repository has a ``data/jsons`` directory, a symbolic link to it is
    created at ``RAW_DATA_DIR / "jsons"`` unless something already exists
    there.

    Returns:
        bool: True on success, False if any filesystem operation (directory
        creation, copy, or symlink creation) raised OSError.
    """
    splits_dir = RAW_DATA_DIR / "splits"
    source_splits_dir = REPO_DIR / "data" / "splits" / "media"
    source_jsons_dir = REPO_DIR / "data" / "jsons"
    jsons_link = RAW_DATA_DIR / "jsons"

    try:
        # parents=True so the function also works when RAW_DATA_DIR has not
        # been created beforehand
        splits_dir.mkdir(parents=True, exist_ok=True)

        # Copy split files
        logger.info(f"Copying split files from {source_splits_dir} to {splits_dir}")
        for file in ["train.tsv", "valid.tsv", "test.tsv"]:
            source_file = source_splits_dir / file
            if not source_file.exists():
                logger.warning(f"Split file not found, skipping: {source_file}")
                continue
            shutil.copy2(source_file, splits_dir / file)
            logger.info(f"Copied {file}")

        # Link the jsons directory when the repository provides one
        if source_jsons_dir.exists():
            # A dangling link makes exists() return False while the name is
            # still taken, so os.symlink would fail; remove it first.
            if jsons_link.is_symlink() and not jsons_link.exists():
                jsons_link.unlink()

            if not jsons_link.exists():
                # Link to the resolved absolute path: a relative target would
                # be interpreted relative to the link's own directory rather
                # than the current working directory.
                os.symlink(
                    source_jsons_dir.resolve(),
                    jsons_link,
                    target_is_directory=True,
                )
                logger.info(f"Created symbolic link to jsons directory: {jsons_link}")
    except OSError as e:
        # Covers copy errors and symlink failures (e.g. missing privilege on
        # Windows); report through the return value the caller checks.
        logger.error(f"Error copying data: {e}")
        return False

    return True

def display_data_info():
    """Log sample counts per split and a short preview of the training split.

    Each split file found under ``RAW_DATA_DIR / "splits"`` is counted line
    by line with its first (header) line excluded; splits that are not
    present are reported as ``N/A``. If the training file exists, its header
    and up to three following lines are logged. An empty training file is
    reported with a warning instead of raising.
    """
    # Count samples in split files
    splits_dir = RAW_DATA_DIR / "splits"
    counts = {}

    for file in ["train.tsv", "valid.tsv", "test.tsv"]:
        file_path = splits_dir / file
        if file_path.exists():
            with open(file_path, 'r', encoding='utf-8') as f:
                # Skip header row
                next(f, None)
                counts[file] = sum(1 for _ in f)

    logger.info("\nArticle-Bias-Prediction Dataset Information:")
    logger.info("------------------------------------------")
    logger.info(f"Training samples: {counts.get('train.tsv', 'N/A')}")
    logger.info(f"Validation samples: {counts.get('valid.tsv', 'N/A')}")
    logger.info(f"Test samples: {counts.get('test.tsv', 'N/A')}")
    logger.info("------------------------------------------")

    # Display the first few lines of the training file
    train_file = splits_dir / "train.tsv"
    if not train_file.exists():
        return

    logger.info("\nSample from training file:")
    with open(train_file, 'r', encoding='utf-8') as f:
        header = next(f, None)
        if header is None:
            # Empty file: there is no header line to strip or samples to show
            logger.warning(f"Training file is empty: {train_file}")
            return
        logger.info(f"Header: {header.strip()}")

        # Show only 3 samples; zip stops once the range is exhausted
        for i, line in zip(range(1, 4), f):
            logger.info(f"Sample {i}: {line.strip()}")

def main():
    """Run the full download workflow for the Article-Bias-Prediction dataset.

    Creates the target directories, then clones (or updates) the repository,
    verifies its layout and copies the data. The first of those three steps
    that reports failure aborts the run.

    Returns:
        int: 0 on success, 1 if any step failed.
    """
    logger.info("Starting download of Article-Bias-Prediction dataset")

    create_directories()

    # Each step returns a truthy value on success; the paired message is
    # logged when it does not.
    pipeline = (
        (clone_repository, "Failed to clone repository"),
        (verify_data_structure, "Data structure verification failed"),
        (copy_data_to_raw_dir, "Failed to copy data"),
    )
    for step, failure_message in pipeline:
        if not step():
            logger.error(failure_message)
            return 1

    # Summarize what was downloaded
    display_data_info()

    logger.info("Article-Bias-Prediction dataset download completed successfully")
    logger.info("You can now proceed with processing the data using the process_article_bias.py script")
    return 0

# Script entry point: main()'s return value becomes the process exit status
if __name__ == "__main__":
    raise SystemExit(main())