import base64
import requests
import json
import urllib.request
import shutil
from pathlib import Path
import time

'''
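Download the files of an NDA (nda.nih.gov) data package -- here OAI data -- to
a local directory (e.g. under /local2). Before running, the user must manually
set `username`, `password`, `packageId`, and `remote_path` (which, despite its
name, is the local destination directory the files are written to).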
'''

# --- User settings: fill these in before running ---------------------------
username = ''  # User name for the nda.nih.gov API (sent via HTTP Basic auth)
password = ''  # Password for the nda.nih.gov API
packageId = 1231469  # ID of the package whose files are listed and downloaded
remote_path = '/local2/acc/OAI/72MonthImages'  # Local directory the files are saved under
max_pages = 3200  # Set this to a positive integer to limit the number of pages, or -1 for no limit
batch_size = 50  # Number of file IDs to process in each batch for presigned URLs

# Encode credentials as the base64 "user:password" token used by HTTP Basic auth
credentials = base64.b64encode(f'{username}:{password}'.encode('utf-8')).decode('utf-8')

# Create headers shared by every API request made by this script
headers = {
    'Authorization': 'Basic ' + credentials,
    'User-Agent': 'Example Client',
    'Accept': 'application/json'
}

# Authenticate. The timeout keeps a stalled connection from hanging the
# script forever (requests waits indefinitely when no timeout is given).
response = requests.get('https://nda.nih.gov/api/package/auth', headers=headers, timeout=60)
response.raise_for_status()  # Ensure authentication was successful

# Function to retrieve files with pagination and retry logic
def retrieve_files(package_id, headers, max_pages, max_attempts=5, timeout=60):
    """List the files of a package, following the API's pagination links.

    Starts at page 1 (100 entries per page). Each page is fetched with a GET
    request; a failing request is retried with exponential backoff. Every
    entry of the page's ``results`` list is recorded, then the ``next`` link
    found under ``_links`` (if any) is followed.

    Args:
        package_id: ID interpolated into the package-files URL.
        headers: HTTP headers sent with every request.
        max_pages: Maximum number of pages to fetch, or -1 for no limit.
        max_attempts: Number of tries per page before giving up (>= 1).
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A dict mapping each ``package_file_id`` to
        ``{'name': <download_alias>}``.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        requests.exceptions.RequestException: If a page still fails on the
            last attempt.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    files = {}
    url = f'https://nda.nih.gov/api/package/{package_id}/files?page=1&size=100'  # Start with the first page
    pages_retrieved = 0

    while url and (max_pages == -1 or pages_retrieved < max_pages):
        for attempt in range(max_attempts):
            try:
                # A timeout turns a stalled connection into a retryable error
                # instead of blocking forever.
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()
                results = response.json()
                break  # Page fetched; leave the retry loop
            except requests.exceptions.RequestException as e:
                if attempt == max_attempts - 1:
                    # Out of attempts: fail immediately rather than sleeping
                    # once more before re-raising.
                    raise
                print(f'Error retrieving page: {e}. Retrying ({attempt + 1}/{max_attempts})...')
                time.sleep(2 ** attempt)  # Exponential backoff: 1, 2, 4, 8, ... seconds

        # Add files to the dictionary
        for f in results['results']:
            files[f['package_file_id']] = {'name': f['download_alias']}

        # Get the next page link, if available. A missing or null `_links` /
        # `next` entry is treated as "no further pages".
        links = results.get('_links') or {}
        next_link = (links.get('next') or {}).get('href')
        if not next_link:
            url = None
        elif next_link.startswith('http'):
            url = next_link
        else:
            # Relative link: prefix it with the API host
            url = 'https://nda.nih.gov' + next_link
        pages_retrieved += 1

    return files

# Function to generate presigned URLs with retry logic
def generate_presigned_urls(package_id, file_ids, headers, max_attempts=5, timeout=60):
    """Request presigned download URLs for a batch of package file IDs.

    POSTs ``file_ids`` as JSON to the package's
    ``batchGeneratePresignedUrls`` endpoint, retrying failed requests with
    exponential backoff.

    Args:
        package_id: ID interpolated into the endpoint URL.
        file_ids: List of file IDs to request URLs for.
        headers: HTTP headers sent with the request.
        max_attempts: Number of tries before giving up (>= 1).
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The ``presignedUrls`` value of the JSON response, or an empty list
        when ``file_ids`` is empty (no request is made in that case).

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        requests.exceptions.RequestException: If the request still fails on
            the last attempt.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')
    if not file_ids:
        # Nothing to sign: skip the round trip (and any retries) entirely.
        return []

    endpoint = f'https://nda.nih.gov/api/package/{package_id}/files/batchGeneratePresignedUrls'
    for attempt in range(max_attempts):
        try:
            # A timeout turns a stalled connection into a retryable error
            # instead of blocking forever.
            response = requests.post(endpoint, json=file_ids, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.json()['presignedUrls']
        except requests.exceptions.RequestException as e:
            if attempt == max_attempts - 1:
                # Out of attempts: fail immediately rather than sleeping once
                # more before re-raising.
                raise
            print(f'Error generating presigned URLs: {e}. Retrying ({attempt + 1}/{max_attempts})...')
            time.sleep(2 ** attempt)  # Exponential backoff: 1, 2, 4, 8, ... seconds

# Retrieve all files from the package
files = retrieve_files(packageId, headers, max_pages)

# Decide which files still need downloading *before* requesting presigned
# URLs, so a re-run does not ask the API for URLs it would never use.
download_root = Path(remote_path)
pending_ids = []
for file_id, data in files.items():
    name = data['name']
    if (download_root / name).exists():
        print(f'File {name} already exists, skipping download.')
    else:
        pending_ids.append(file_id)

# Handle the pending files batch by batch: generate the URLs for one batch and
# download that batch straight away. A failure while generating URLs for a
# later batch therefore no longer prevents earlier batches from downloading.
# NOTE(review): presigned URLs are presumably time-limited, which is another
# reason to use them right after generation -- confirm against the API docs.
presigned_urls = []
for i in range(0, len(pending_ids), batch_size):
    batch_ids = pending_ids[i:i + batch_size]
    batch_urls = generate_presigned_urls(packageId, batch_ids, headers)
    presigned_urls.extend(batch_urls)

    # Add download URLs to the files dictionary
    for url in batch_urls:
        files[url['package_file_id']]['download'] = url['downloadURL']

    # Download this batch directly to the specified path
    for file_id in batch_ids:
        data = files[file_id]
        name = data['name']
        download_url = data.get('download')
        if download_url is None:
            # The API returned no URL for this ID; report it and keep going
            # instead of aborting the whole run with a KeyError.
            print(f'Failed to download {name}: no presigned URL was returned')
            continue

        local_file_path = download_root / name

        # Ensure local directory exists
        local_file_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to a temporary ".part" file and rename only after the copy
        # completes. Otherwise an interrupted download would leave a truncated
        # file that the "already exists" check skips on every later run.
        partial_path = local_file_path.with_name(local_file_path.name + '.part')
        try:
            # Download and save file; the timeout bounds each blocking socket
            # operation so a stalled transfer cannot hang the script.
            with urllib.request.urlopen(download_url, timeout=60) as dl_file:
                with open(partial_path, 'wb') as out_file:
                    shutil.copyfileobj(dl_file, out_file)
            partial_path.replace(local_file_path)
            print(f'Downloaded and saved {name} to {local_file_path}')
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f'Failed to download {name}: {e}')
        finally:
            # Remove a leftover partial file (no-op after a successful rename).
            if partial_path.exists():
                partial_path.unlink()