"""
Spinodal Decomposition - Model Performance Benchmark Testing

Statistics for each model:
- Parameter count
- Inference time (single-step/multi-step)
- Rollout speed (steps/sec)
- Memory usage

Output to model_benchmark.md file

Note: Spinodal decomposition primarily demonstrates long time-step prediction capability,
so we test FluxNet-D models with different neighborhood_size values
"""

import os
import sys
import time
import torch
import numpy as np
from datetime import datetime
from typing import Dict, List

# Put the directory three levels above this file on sys.path so that the
# `experiments` package imported below resolves when this file is run directly.
# NOTE(review): presumably that directory is the repository root -- confirm.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, project_root)

from experiments.common.experiment_runner import create_model, ModelConfig


def count_parameters(model: torch.nn.Module) -> Dict:
    """Tally the parameter elements of ``model``.

    Returns a dict with:
      - ``total_params``: number of elements over all ``model.parameters()``
      - ``trainable_params``: the same count restricted to parameters with
        ``requires_grad`` set
      - ``total_params_M``: ``total_params`` divided by 1e6
    """
    n_total = 0
    n_trainable = 0
    # Single pass over the parameters, accumulating both counters.
    for param in model.parameters():
        n_elements = param.numel()
        n_total += n_elements
        if param.requires_grad:
            n_trainable += n_elements

    stats = {}
    stats['total_params'] = n_total
    stats['trainable_params'] = n_trainable
    stats['total_params_M'] = n_total / 1e6
    return stats


def measure_inference_time(
    model: torch.nn.Module,
    input_shape: tuple,
    device: torch.device,
    num_warmup: int = 10,
    num_runs: int = 100
) -> Dict:
    """Time a single forward pass of ``model`` on a random input.

    The model is switched to eval mode and moved to ``device``. One random
    tensor of shape ``input_shape`` is reused for ``num_warmup`` untimed
    passes followed by ``num_runs`` individually timed passes, all under
    ``torch.no_grad()``.

    Returns a dict with:
      - ``inference_time_ms``: mean wall-clock time of a timed pass, in ms
      - ``inference_time_std``: standard deviation of those times, in ms
      - ``throughput_samples_per_sec``: ``input_shape[0]`` divided by the
        mean time in seconds
    """
    model.eval()
    model.to(device)
    on_cuda = device.type == 'cuda'

    batch = torch.randn(*input_shape).to(device)
    elapsed_ms = []

    with torch.no_grad():
        # Untimed warmup passes.
        for _ in range(num_warmup):
            model(batch)

        # Drain queued GPU work so it does not leak into the first timed pass.
        if on_cuda:
            torch.cuda.synchronize()

        for _ in range(num_runs):
            t_start = time.perf_counter()
            model(batch)
            # CUDA kernels launch asynchronously; wait for them before
            # reading the clock.
            if on_cuda:
                torch.cuda.synchronize()
            t_stop = time.perf_counter()
            elapsed_ms.append((t_stop - t_start) * 1000)

    mean_ms = np.mean(elapsed_ms)
    return {
        'inference_time_ms': mean_ms,
        'inference_time_std': np.std(elapsed_ms),
        'throughput_samples_per_sec': input_shape[0] / (mean_ms / 1000),
    }


def measure_rollout_time(
    model: torch.nn.Module,
    input_shape: tuple,
    device: torch.device,
    num_steps: int = 100,
    num_runs: int = 5
) -> Dict:
    """Time an autoregressive rollout of ``model``.

    The model is switched to eval mode and moved to ``device``. Each of the
    ``num_runs`` runs starts from a fresh random tensor of shape
    ``input_shape`` and applies the model ``num_steps`` times, feeding each
    output back in as the next input. If the model returns a tuple, its
    first element is used as the next input.

    Returns a dict with:
      - ``rollout_total_ms``: mean wall-clock time of one full rollout, in ms
      - ``rollout_per_step_ms``: that mean divided by ``num_steps``
      - ``rollout_steps_per_sec``: ``num_steps`` divided by the mean in seconds
    """
    model.eval()
    model.to(device)
    on_cuda = device.type == 'cuda'

    def _sync():
        # CUDA work is asynchronous; block until it has finished.
        if on_cuda:
            torch.cuda.synchronize()

    run_ms = []
    with torch.no_grad():
        for _run in range(num_runs):
            state = torch.randn(*input_shape).to(device)

            _sync()
            t_start = time.perf_counter()
            for _step in range(num_steps):
                prediction = model(state)
                if isinstance(prediction, tuple):
                    state = prediction[0]
                else:
                    state = prediction
            _sync()
            t_stop = time.perf_counter()

            run_ms.append((t_stop - t_start) * 1000)

    total_time = np.mean(run_ms)
    return {
        'rollout_total_ms': total_time,
        'rollout_per_step_ms': total_time / num_steps,
        'rollout_steps_per_sec': num_steps / (total_time / 1000),
    }


def measure_memory(
    model: torch.nn.Module,
    input_shape: tuple,
    device: torch.device
) -> Dict:
    """Report CUDA memory statistics around one forward pass of ``model``.

    On a non-CUDA device nothing is run and both values are reported as 0.
    Otherwise the model is moved to ``device``, the peak-memory statistics
    are reset and the allocator cache emptied, and one forward pass on a
    random tensor of shape ``input_shape`` is run under ``torch.no_grad()``.

    Returns a dict with:
      - ``memory_MB``: ``torch.cuda.memory_allocated(device)`` after the
        pass, divided by 1e6
      - ``memory_peak_MB``: ``torch.cuda.max_memory_allocated(device)``
        since the reset, divided by 1e6
    """
    on_cuda = device.type == 'cuda'
    if not on_cuda:
        return {'memory_MB': 0, 'memory_peak_MB': 0}

    model.to(device)
    torch.cuda.reset_peak_memory_stats(device)
    torch.cuda.empty_cache()

    sample = torch.randn(*input_shape).to(device)

    with torch.no_grad():
        # Keep the output bound so it is still allocated when the
        # "current" figure is read below.
        _output = model(sample)

    stats = {}
    stats['memory_MB'] = torch.cuda.memory_allocated(device) / 1e6
    stats['memory_peak_MB'] = torch.cuda.max_memory_allocated(device) / 1e6
    return stats


def benchmark_model(
    model_config: ModelConfig,
    dataset_type: str,
    input_shape: tuple,
    device: torch.device
) -> Dict:
    """Build one model and collect all of its benchmark statistics.

    If ``create_model`` raises, the result is just ``{'error': <message>}``.
    Otherwise the result holds ``model_type`` and ``neighborhood_size`` from
    ``model_config``, the parameter counts, and the output of each
    measurement stage. A stage that raises does not abort the benchmark:
    its message is stored under ``inference_error``, ``rollout_error`` or
    ``memory_error`` and the remaining stages still run.
    """
    try:
        model = create_model(model_config, dataset_type)
    except Exception as exc:
        return {'error': str(exc)}

    results = {
        'model_type': model_config.model_type,
        'neighborhood_size': model_config.neighborhood_size,
    }

    # Parameter count
    results.update(count_parameters(model))

    def _inference():
        return measure_inference_time(model, input_shape, device)

    def _rollout():
        return measure_rollout_time(model, input_shape, device, num_steps=50)

    def _memory():
        # Memory statistics are only collected on CUDA devices.
        if device.type == 'cuda':
            return measure_memory(model, input_shape, device)
        return {}

    # (key used to record a failure, measurement to run), in execution order.
    stages = (
        ('inference_error', _inference),
        ('rollout_error', _rollout),
        ('memory_error', _memory),
    )
    for failure_key, run_stage in stages:
        try:
            results.update(run_stage())
        except Exception as exc:
            results[failure_key] = str(exc)

    return results


def generate_benchmark_report(
    results: List[Dict],
    output_path: str,
    dataset_type: str
):
    """Write the benchmark results to a Markdown report.

    Args:
        results: One dict per benchmarked model. Entries containing an
            ``'error'`` key are left out of the parameter and inference
            tables. Entries without ``'inference_time_ms'`` are left out of
            the inference table. The GPU memory section is emitted only if
            at least one entry has ``'memory_MB'``.
        output_path: File to write (UTF-8). Its directory must already exist.
        dataset_type: Dataset name shown in the report header.

    A throughput or rollout speed that is missing from an entry (for example
    because the rollout measurement failed) is rendered as ``N/A`` rather
    than as ``0.0``, which would read like a real measurement.
    """
    def _rate(value) -> str:
        # Missing metric -> "N/A"; otherwise one decimal place.
        return "N/A" if value is None else f"{value:.1f}"

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Collect fragments and join once instead of growing a string with +=.
    parts = [f"""# Spinodal Decomposition Model Performance Benchmark

Dataset: {dataset_type}
Test Time: {timestamp}

Note: Spinodal decomposition demonstrates long time-step prediction capability, testing the effect of different neighborhood_size values

---

## Parameter Count Statistics

| Model | Neighborhood Size | Total Params | Trainable Params | Params (M) |
|-------|-------------------|--------------|------------------|------------|
"""]
    for r in results:
        if 'error' not in r:
            parts.append(
                f"| {r['model_type']} | {r.get('neighborhood_size', 'N/A')} | {r['total_params']:,} | {r['trainable_params']:,} | {r['total_params_M']:.2f} |\n"
            )

    parts.append("""
---

## Inference Speed

| Model | Neighborhood Size | Single-Step (ms) | Throughput (samples/s) | Rollout (steps/s) |
|-------|-------------------|------------------|------------------------|-------------------|
""")
    for r in results:
        if 'error' not in r and 'inference_time_ms' in r:
            inf_time = f"{r['inference_time_ms']:.2f}±{r.get('inference_time_std', 0):.2f}"
            throughput = _rate(r.get('throughput_samples_per_sec'))
            rollout_speed = _rate(r.get('rollout_steps_per_sec'))
            parts.append(
                f"| {r['model_type']} | {r.get('neighborhood_size', 'N/A')} | {inf_time} | {throughput} | {rollout_speed} |\n"
            )

    if any('memory_MB' in r for r in results):
        parts.append("""
---

## GPU Memory Usage

| Model | Neighborhood Size | Current (MB) | Peak (MB) |
|-------|-------------------|--------------|-----------|
""")
        for r in results:
            if 'memory_MB' in r:
                parts.append(
                    f"| {r['model_type']} | {r.get('neighborhood_size', 'N/A')} | {r['memory_MB']:.1f} | {r['memory_peak_MB']:.1f} |\n"
                )

    parts.append("""
---

## Test Configuration

- Input shape: (batch=16, channels=1, H=128, W=128)
- Warmup iterations: 10
- Timing iterations: 100
- Rollout steps: 50

## Time Step Description

- ndt=1: Corresponds to 10dt time span
- ndt=10: Corresponds to 100dt time span
- ndt=100: Corresponds to 1000dt time span

Different ndt values may require different neighborhood_size for effective mass transport

""")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"Benchmark report saved to: {output_path}")


def main():
    """Benchmark the FluxNet-D configurations and write the Markdown report.

    Command-line options:
      --gpu     Index of the CUDA device to use when CUDA is available
                (default 0); otherwise the benchmark runs on the CPU.
      --output  Path of the report file. Defaults to
                ``results/spinodal_decomposition/model_benchmark.md`` under
                ``project_root`` instead of a machine-specific absolute path.
    """
    import argparse

    default_output = os.path.join(
        project_root, "results", "spinodal_decomposition", "model_benchmark.md"
    )

    parser = argparse.ArgumentParser(description='Spinodal Decomposition Model Benchmark')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--output', type=str, default=default_output,
                        help='Path of the Markdown report to write')
    args = parser.parse_args()

    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    input_shape = (16, 1, 128, 128)  # batch, channels, H, W
    dataset_type = 'spinodal_decomposition'

    # Test FluxNet-D models with different neighborhood_size
    # To demonstrate long time-step prediction capability:
    #   5  - small neighborhood (suitable for small time step ndt=1)
    #   9  - medium neighborhood (suitable for medium time step ndt=10)
    #   15 - large neighborhood (suitable for large time step ndt=100)
    neighborhood_sizes = (5, 9, 15)
    configs = [
        ModelConfig(model_type='FluxNet_D', base_channels=64, num_blocks=6, kernel_size=5,
                    neighborhood_size=size, lower_bound=0.0, upper_bound=1.0)
        for size in neighborhood_sizes
    ]

    results = []
    for config in configs:
        print(f"Testing: {config.model_type} (neighborhood={config.neighborhood_size})")
        result = benchmark_model(config, dataset_type, input_shape, device)
        results.append(result)
        if 'error' in result:
            print(f"  Error: {result['error']}")
        else:
            # A result without 'error' always carries the parameter counts.
            # Index it directly: a string fallback such as 'N/A' cannot be
            # formatted with the ',' specifier.
            print(f"  Params: {result['total_params']:,}")

    output_path = args.output
    # abspath keeps dirname non-empty when only a bare file name is given.
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
    generate_benchmark_report(results, output_path, dataset_type)

    print("\nBenchmark test complete!")


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
