#!/usr/bin/env julia

"""
Synthetic Data Generation for CEED Framework

This script generates synthetic experimental data for the CEED (Cost-Efficient Experimental Design) framework.
The data is used for both Value Iteration analysis and CEED experimental design optimization.

Generated Files:
- historical_data.csv: Historical experimental results with feature measurements and target values
- feature_stats.csv: Statistical summary of features (mean, std, min, max) 
- target_coefficients.csv: Linear coefficients relating features to target values
- initial_state.csv: Initial experimental state with features 1 and 2 measured

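Feature values are reproducible for a fixed seed, so every CEED analysis reads the same
feature data. Note that with the default `use_random_coeffs = true` the target coefficients
(and therefore the target values) are drawn from a clock-derived seed and differ between runs.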
"""

using DataFrames, CSV, Distributions, Random, Statistics

"""
    generate_synthetic_data(; n_compounds=200, n_features=6, seed=42,
                            add_noise=false, use_random_coeffs=true,
                            coeff_seed=nothing)

Generate a synthetic historical dataset for CEED analysis.

Feature `i` is drawn from a normal distribution with mean `8 + 8i` and a standard
deviation of 30% of that mean, truncated to `[0, 2 * mean]`. The target of each compound
is the coefficient-weighted sum of its feature values.

This function reseeds the global RNG. Feature values depend only on `seed`; the random
coefficients depend on `coeff_seed` (or on the wall clock when it is `nothing`).

# Arguments
- `n_compounds::Int`: Number of historical compounds (default: 200)
- `n_features::Int`: Number of features (default: 6)
- `seed::Int`: Random seed used for the feature (and noise) draws (default: 42)
- `add_noise::Bool`: Whether to add zero-mean Gaussian noise, with standard deviation
  equal to 10% of the noiseless target, to each target value (default: false)
- `use_random_coeffs::Bool`: Use random coefficients, normalized to sum to 1, instead of
  the fixed six-element set (default: true)
- `coeff_seed::Union{Nothing,Int}`: Seed for the random coefficients. When `nothing`
  (default) a seed in `0:9999` is derived from the current time, so coefficients differ
  between runs; pass the seed printed by an earlier run to reproduce it.

# Returns
- Tuple of (historical_data, target_coefficients, feature_stats), where `historical_data`
  is a `DataFrame` with columns `feature_1 … feature_n` and `target`,
  `target_coefficients` is a vector of length `n_features`, and `feature_stats` is a
  `DataFrame` with columns `feature`, `mean`, `std`, `min`, `max`.

# Throws
- `ArgumentError` if `n_compounds < 1` or `n_features < 1`.
- `ArgumentError` if `use_random_coeffs` is `false` and `n_features != 6`, because the
  fixed coefficient set is only defined for six features.
"""
function generate_synthetic_data(;
    n_compounds::Int = 200,
    n_features::Int = 6,
    seed::Int = 42,
    add_noise::Bool = false,
    use_random_coeffs::Bool = true,
    coeff_seed::Union{Nothing,Int} = nothing
)
    # Weights for the fixed-coefficient mode (decreasing importance).
    fixed_coefficients = [0.3, 0.25, 0.2, 0.15, 0.07, 0.03]

    # Validate up front so bad input fails with a clear message instead of a
    # DimensionMismatch / empty-reduction error deep inside the generation loop.
    n_compounds >= 1 ||
        throw(ArgumentError("n_compounds must be at least 1, got $n_compounds"))
    n_features >= 1 ||
        throw(ArgumentError("n_features must be at least 1, got $n_features"))
    if !use_random_coeffs && n_features != length(fixed_coefficients)
        throw(ArgumentError(
            "fixed coefficients are defined for exactly " *
            "$(length(fixed_coefficients)) features, got n_features = $n_features; " *
            "pass use_random_coeffs = true for other feature counts"
        ))
    end

    println("Generating synthetic data with parameters:")
    println("  - Compounds: $n_compounds")
    println("  - Features: $n_features")
    println("  - Seed: $seed")
    println("  - Add noise: $add_noise")
    println("  - Random coefficients: $use_random_coeffs")

    # One truncated normal per feature. Means are 8 + 8i (16, 24, ..., 56 for the
    # default six features) with a 30% coefficient of variation.
    feature_distributions = map(1:n_features) do i
        mean_val = 8.0 + (i * 8.0)
        std_val = mean_val * 0.3
        return truncated(Normal(mean_val, std_val), 0.0, mean_val * 2)
    end

    # Generate target coefficients
    if use_random_coeffs
        # Without an explicit seed, fall back to a clock-derived one; it is printed so
        # the run can be reproduced later by passing it back as `coeff_seed`.
        effective_coeff_seed =
            coeff_seed === nothing ? Int(round(time() * 1000)) % 10000 : coeff_seed
        Random.seed!(effective_coeff_seed)
        raw_coeffs = rand(n_features)
        target_coefficients = raw_coeffs / sum(raw_coeffs)
        println("Using random coefficients (seed: $effective_coeff_seed):")
    else
        target_coefficients = fixed_coefficients
        println("Using fixed coefficients:")
    end

    for (i, coeff) in enumerate(target_coefficients)
        println("  Feature $i: $(round(coeff, digits=4))")
    end

    # Seed immediately before sampling so the feature draws depend only on `seed`,
    # regardless of how the coefficients were obtained.
    Random.seed!(seed)

    data = zeros(n_compounds, n_features + 1)  # +1 for target

    # Draw order (row by row, features first, then noise) determines the RNG stream
    # and therefore the generated values; keep it stable.
    for i in 1:n_compounds
        for j in 1:n_features
            data[i, j] = rand(feature_distributions[j])
        end

        # Target is a linear combination of this compound's features.
        target_value = sum(target_coefficients .* view(data, i, 1:n_features))

        if add_noise
            target_value += rand(Normal(0.0, target_value * 0.1))  # 10% noise
        end

        data[i, n_features + 1] = target_value
    end

    feature_cols = ["feature_$j" for j in 1:n_features]
    historical_data = DataFrame(data, [feature_cols; "target"])

    # Per-feature summary statistics; `[!, j]` reads the column without copying it.
    feature_stats = DataFrame(
        feature = 1:n_features,
        mean = [mean(historical_data[!, j]) for j in 1:n_features],
        std = [std(historical_data[!, j]) for j in 1:n_features],
        min = [minimum(historical_data[!, j]) for j in 1:n_features],
        max = [maximum(historical_data[!, j]) for j in 1:n_features]
    )

    return historical_data, target_coefficients, feature_stats
end

"""
    generate_initial_state(n_features::Int = 6, seed::Int = 42)

Generate initial experimental state with features 1 and 2 measured.

Reseeds the global RNG with `seed`, then draws the value of feature 1 from
`Uniform(5, 15)` and the value of feature 2 from `Uniform(13, 23)`, in that order.

# Arguments
- `n_features::Int`: Total number of features in the experiment (default: 6)
- `seed::Int`: Random seed for reproducibility (default: 42)

# Returns
- `DataFrame` with one row per measured feature and columns `feature`, `value` and
  `n_features` (the total feature count, repeated on every row).

# Throws
- `ArgumentError` if `n_features < 2`, since the state always reports features 1 and 2
  as measured.
"""
function generate_initial_state(n_features::Int = 6, seed::Int = 42)
    # The state below always references features 1 and 2, so both must exist.
    n_features >= 2 ||
        throw(ArgumentError("n_features must be at least 2, got $n_features"))

    Random.seed!(seed)

    # Draw order matters for reproducibility: feature 1 first, then feature 2.
    feature_1_value = rand(Uniform(5.0, 15.0))   # Feature 1 range
    feature_2_value = rand(Uniform(13.0, 23.0))  # Feature 2 range

    initial_state_df = DataFrame(
        feature = [1, 2],
        value = [feature_1_value, feature_2_value],
        n_features = fill(n_features, 2)
    )

    return initial_state_df
end

"""
    save_synthetic_data(historical_data, target_coefficients, feature_stats,
                        initial_state_df; output_dir = ".")

Save all synthetic data to CSV files.

Writes `historical_data.csv`, `target_coefficients.csv`, `feature_stats.csv` and
`initial_state.csv`, printing a confirmation line after each file.

# Arguments
- `historical_data`: Table of historical compounds (written as-is)
- `target_coefficients`: Vector of coefficients; saved with a 1-based `feature` column
- `feature_stats`: Table of per-feature statistics (written as-is)
- `initial_state_df`: Table describing the initial state (written as-is)

# Keywords
- `output_dir::AbstractString`: Directory to write into (default: `"."`, the current
  directory). It is created if it does not exist.

# Returns
- `nothing`
"""
function save_synthetic_data(
    historical_data,
    target_coefficients,
    feature_stats,
    initial_state_df;
    output_dir::AbstractString = "."
)
    # Create the destination first so a missing directory fails (or is fixed) before
    # any file has been written.
    mkpath(output_dir)

    # Save historical data
    CSV.write(joinpath(output_dir, "historical_data.csv"), historical_data)
    println("✓ Saved historical_data.csv ($(nrow(historical_data)) compounds)")

    # Save target coefficients, pairing each one with its 1-based feature number
    coeff_df = DataFrame(
        feature = 1:length(target_coefficients),
        coefficient = target_coefficients
    )
    CSV.write(joinpath(output_dir, "target_coefficients.csv"), coeff_df)
    println("✓ Saved target_coefficients.csv ($(length(target_coefficients)) coefficients)")

    # Save feature statistics
    CSV.write(joinpath(output_dir, "feature_stats.csv"), feature_stats)
    println("✓ Saved feature_stats.csv ($(nrow(feature_stats)) features)")

    # Save initial state
    CSV.write(joinpath(output_dir, "initial_state.csv"), initial_state_df)
    println("✓ Saved initial_state.csv ($(nrow(initial_state_df)) measured features)")

    return nothing
end

"""
Script entry point: build the synthetic datasets with default settings, write them to
CSV, and print a short report of what was produced.
"""
function main()
    rule = "="^60

    println(rule)
    println("CEED FRAMEWORK - SYNTHETIC DATA GENERATION")
    println(rule)

    # Build the datasets using the generators' default parameters.
    history, coeffs, stats = generate_synthetic_data()
    state = generate_initial_state()

    # Persist everything to disk.
    println("\nSaving synthetic data files...")
    save_synthetic_data(history, coeffs, stats, state)

    println("\n", rule)
    println("✅ SYNTHETIC DATA GENERATION COMPLETE!")
    println(rule)

    # Inventory of the files that were just written.
    n_rows = nrow(history)
    n_cols = ncol(history)
    println("\nGenerated files:")
    println("  - historical_data.csv: $n_rows compounds × $n_cols columns")
    println("  - feature_stats.csv: $(nrow(stats)) features with statistics")
    println("  - target_coefficients.csv: $(length(coeffs)) feature coefficients")
    println("  - initial_state.csv: $(nrow(state)) measured features")

    # Observed value ranges, rounded to two decimals for display.
    target_lo = round(minimum(history.target), digits=2)
    target_hi = round(maximum(history.target), digits=2)
    println("\nData summary:")
    println("  - Target range: [$target_lo, $target_hi]")
    println("  - Feature ranges:")
    for (i, (lo, hi)) in enumerate(zip(stats.min, stats.max))
        println("    Feature $i: [$(round(lo, digits=2)), $(round(hi, digits=2))]")
    end

    return nothing
end

# Invoke `main` only when this file is the program being executed, not when it is
# pulled in with `include`; in the latter case the expression evaluates to `nothing`.
(@__FILE__) == abspath(PROGRAM_FILE) ? main() : nothing