#!/usr/bin/env python3
"""
Download PowerPoint files from Internet Archive for PPTArena benchmark.

This script downloads PowerPoint files listed in files.txt.

Requirements:
    pip install requests

Usage:
    python download_data_files.py
"""

import os
import sys
from urllib.parse import urlparse, unquote
import requests

def read_file_urls(file_path: str = "files.txt") -> list[str]:
    """Read download URLs from a text file, one URL per line.

    Blank lines and comment lines (first non-blank character is ``#``) are
    skipped. Surrounding whitespace is removed from every returned URL.

    Args:
        file_path: Path of the URL list to read.

    Returns:
        The URLs in file order.

    Raises:
        SystemExit: With exit status 1 (after printing an error message)
            when ``file_path`` does not exist.
    """
    try:
        # utf-8-sig reads plain UTF-8 too, but drops a leading BOM so it
        # cannot end up glued to the first URL.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            stripped_lines = [line.strip() for line in f]
    except FileNotFoundError:
        print(f"Error: {file_path} not found!")
        sys.exit(1)

    # Test the *stripped* text so an indented "# comment" is ignored as well.
    return [
        entry for entry in stripped_lines
        if entry and not entry.startswith('#')
    ]

def extract_filename_from_url(url: str) -> str:
    """Derive a local ``.pptx`` filename from the last path segment of a URL.

    The last non-empty segment of the URL path is percent-decoded, and
    ``.pptx`` is appended when the name does not already end with it
    (case-insensitive). The query string and fragment are ignored.

    Args:
        url: The download URL.

    Returns:
        A bare filename with no directory components. If the URL path has no
        usable segment the result is just ``".pptx"``.
    """
    parsed = urlparse(url)

    # Use the last non-empty segment so a trailing slash does not yield ''.
    segments = [part for part in parsed.path.split('/') if part]
    filename = unquote(segments[-1]) if segments else ''

    # Percent-decoding can introduce path separators (%2F, %5C) and thereby
    # "../" components. Keep only the final component so the name can never
    # point outside the download directory.
    filename = filename.replace('\\', '/').rstrip('/')
    filename = filename.rsplit('/', 1)[-1]

    # Ensure the file has a .pptx extension
    if not filename.lower().endswith('.pptx'):
        filename += '.pptx'

    return filename

def download_file(url: str, filename: str, download_dir: str = "data") -> bool:
    """Download ``url`` into ``download_dir`` unless the file is already there.

    The body is streamed to a temporary ``<name>.part`` file and renamed to
    its final name only after the transfer completes, so an interrupted
    download is never mistaken for a finished one on the next run.

    Args:
        url: Address to fetch with an HTTP GET.
        filename: Name to store the file under inside ``download_dir``. A
            doubled ``.pptx.pptx`` suffix is collapsed to a single ``.pptx``.
        download_dir: Destination directory; created if it does not exist.

    Returns:
        True if the file already existed or was downloaded successfully,
        False if the request or writing the file failed.
    """
    os.makedirs(download_dir, exist_ok=True)
    file_path = os.path.join(download_dir, filename)

    # Normalise the name *before* the existence check; otherwise the check
    # looks at a path that is never written and the file is fetched again
    # on every run.
    if file_path.endswith(".pptx.pptx"):
        file_path = file_path[:-5]  # Remove the extra .pptx

    # Skip if file already exists
    if os.path.exists(file_path):
        print(f"✓ {filename} already exists, skipping...")
        return True

    partial_path = file_path + ".part"

    try:
        print(f"Downloading {filename}...")
        # The context manager releases the connection even if streaming fails.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            downloaded_size = 0

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded_size += len(chunk)

                    # Show progress when the server reported a size
                    if total_size > 0:
                        # Decoded content can be larger than the reported
                        # (compressed) length, so cap the display at 100%.
                        progress = min((downloaded_size / total_size) * 100, 100.0)
                        print(f"\r  Progress: {progress:.1f}%", end='', flush=True)

        # Publish the finished file under its final name in one step.
        os.replace(partial_path, file_path)
        print(f"\n✓ Downloaded {filename}")
        return True

    except (requests.exceptions.RequestException, OSError) as e:
        # Best-effort cleanup of the incomplete temporary file.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        print(f"\n✗ Failed to download {filename}: {e}")
        return False

def main():
    """Entry point: download every listed file and print a report.

    Reads the URL list via ``read_file_urls()``, downloads each entry with
    ``download_file()``, then prints a success/failure summary followed by
    the manual follow-up steps.
    """
    divider = "-" * 40

    print("PPTArena Data Files Downloader")
    print("=" * 40)

    # Load the URL list from its default location
    urls = read_file_urls()
    print(f"Found {len(urls)} files to download")

    # Make sure the destination folder is present before starting
    os.makedirs("data", exist_ok=True)

    print("\nDownloading files...")
    print(divider)

    # Count outcomes keyed by whether the download succeeded
    tally = {True: 0, False: 0}
    for position, url in enumerate(urls, start=1):
        filename = extract_filename_from_url(url)
        print(f"[{position}/{len(urls)}] {filename}")
        tally[bool(download_file(url, filename))] += 1

    succeeded = tally[True]
    failed = tally[False]

    # Report the totals
    print("\nDownload Summary:")
    print(divider)
    print(f"✓ Successful downloads: {succeeded}")
    print(f"✗ Failed downloads: {failed}")
    print(f"📁 Files saved to: ./data/")

    if failed > 0:
        print(f"\n⚠ {failed} files failed to download. You may need to:")
        for hint in (
            "  1. Check your internet connection",
            "  2. Try running the script again",
            "  3. Download failed files manually",
        ):
            print(hint)

    # Manual follow-up instructions for the user
    for line in (
        "\n📝 Next steps:",
        "1. Upload each .pptx file to your OneDrive",
        "2. Open each file in PowerPoint Online",
        "3. Download both:",
        "   - The .pptx file (File > Create a copy > Download a copy)",
        "   - Images as .zip (File > Export > Export as images)",
        "\nThis ensures proper file formatting for the benchmark evaluation.",
    ):
        print(line)

# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
