package config

import (
	"os"
	"path/filepath"
	"strconv"
)

// SWEBenchRepos is the set of "owner/name" repository identifiers that belong
// to the SWE-Bench benchmark and must be filtered out. Every listed repository
// maps to true; any other key yields the zero value false.
var SWEBenchRepos = func() map[string]bool {
	benchmarkRepos := [...]string{
		"astropy/astropy",
		"django/django",
		"matplotlib/matplotlib",
		"mwaskom/seaborn",
		"pallets/flask",
		"psf/requests",
		"pydata/xarray",
		"pylint-dev/pylint",
		"pytest-dev/pytest",
		"scikit-learn/scikit-learn",
		"sphinx-doc/sphinx",
		"sympy/sympy",
	}

	excluded := make(map[string]bool, len(benchmarkRepos))
	for _, repo := range benchmarkRepos {
		excluded[repo] = true
	}
	return excluded
}()

// Config holds all configuration for the pipeline.
//
// LoadConfig populates every field from environment variables, falling back to
// a built-in default when a variable is unset or empty. The directory fields
// other than DataDir are not read from the environment: each is a fixed
// subdirectory of DataDir. The environment variable and default noted on each
// field below are the ones LoadConfig uses.
type Config struct {
	// Network settings
	ProxyBaseURL   string // PROXY_BASE_URL (default "http://localhost:8080")
	MaxConcurrency int    // MAX_CONCURRENCY (default 10)
	MaxIdleConns   int    // MAX_IDLE_CONNS (default 10)

	// Batch settings
	BatchSize     int // BATCH_SIZE (default 50000)
	FlushInterval int // seconds; FLUSH_INTERVAL (default 30)

	// Storage settings. DataDir comes from DATA_DIR; the other directories
	// are derived from it.
	DataDir             string // root data directory (default "/mnt/hdd/github_data")
	CheckpointDir       string // <DataDir>/checkpoints
	RawIndexDir         string // <DataDir>/raw_index
	FilteredReposDir    string // <DataDir>/filtered_repos
	RawPRsDir           string // <DataDir>/raw_prs
	EnrichedPRsDir      string // <DataDir>/enriched_prs
	RenderedTextDir     string // <DataDir>/rendered_text
	TokenizedDatasetDir string // <DataDir>/tokenized_dataset
	TokenStatsDir       string // <DataDir>/token_stats

	// File rotation
	MaxFileSize int64 // bytes; MAX_FILE_SIZE (default 500*1024*1024, i.e. 500 MiB)

	// Checkpoint settings
	CheckpointInterval int // batches; CHECKPOINT_INTERVAL (default 1000)

	// Task 1 settings
	SinceID int64 // SINCE_ID (default 0)

	// Filtering criteria
	MinStars       int    // MIN_STARS (default 5)
	TargetLanguage string // TARGET_LANGUAGE (default "Python")
	MaxPyFiles     int    // MAX_PY_FILES (default 5)
	MinPyFiles     int    // MIN_PY_FILES (default 1)
	MaxTotalFiles  int    // MAX_TOTAL_FILES (default 20)

	// Task 3 settings
	PRBatchSize int // Number of PRs to process per batch; PR_BATCH_SIZE (default 1000)

	// Task 4 & 5 settings (offline, CPU-intensive)
	OfflineConcurrency int    // Concurrency for offline tasks; OFFLINE_CONCURRENCY (default 32)
	TokenizerModel     string // Tokenizer model name; TOKENIZER_MODEL (deprecated, kept for compatibility)

	// Task 5 settings
	MaxTokens               int    // Maximum tokens per PR; MAX_TOKENS (default 32000)
	RustTokenizerPath       string // Path to Rust tokenizer worker binary; RUST_TOKENIZER_PATH
	RustWorkers             int    // Number of Rust worker threads; RUST_WORKERS (default 128)
	RustThreadsPerTokenizer int    // Threads per tokenizer group; RUST_THREADS_PER_TOKENIZER (default 16)
	MaxMessageSize          int    // Maximum IPC message size in bytes; MAX_MESSAGE_SIZE (default 100*1024*1024)

	// Task 6 LLM Enhancement settings
	LLMBaseURL        string // LLM API base URL (vLLM/SGLang endpoint); LLM_BASE_URL (default "http://localhost:8000")
	LLMAPIKey         string // LLM API key; LLM_API_KEY (default empty)
	LLMModel          string // LLM model name; LLM_MODEL (default "Qwen/Qwen2.5-Coder-32B-Instruct")
	LLMConcurrency    int    // Number of concurrent LLM requests; LLM_CONCURRENCY (default 64)
	LLMEnhancedPRsDir string // Output directory for LLM enhanced PRs: <DataDir>/llm_enhanced_prs
	LLMTimeoutSeconds int    // Timeout for LLM requests in seconds; LLM_TIMEOUT_SECONDS (default 120)
}

// LoadConfig builds a Config from environment variables, substituting the
// built-in default for any variable that is unset or empty (and, for numeric
// settings, any value that does not parse as an integer).
//
// Only DataDir is read from the environment (DATA_DIR). Every other directory
// is a fixed subdirectory of it, assembled with filepath.Join so that a
// DATA_DIR carrying a trailing or doubled separator still yields clean paths
// and the platform's separator is used. DataDir itself is stored exactly as
// supplied.
//
// It always returns a non-nil *Config and never reports an error.
func LoadConfig() *Config {
	cfg := &Config{
		ProxyBaseURL:       getEnv("PROXY_BASE_URL", "http://localhost:8080"),
		MaxConcurrency:     getEnvInt("MAX_CONCURRENCY", 10),
		MaxIdleConns:       getEnvInt("MAX_IDLE_CONNS", 10),
		BatchSize:          getEnvInt("BATCH_SIZE", 50000),
		FlushInterval:      getEnvInt("FLUSH_INTERVAL", 30),
		DataDir:            getEnv("DATA_DIR", "/mnt/hdd/github_data"),
		MaxFileSize:        getEnvInt64("MAX_FILE_SIZE", 500*1024*1024), // 500MB
		CheckpointInterval: getEnvInt("CHECKPOINT_INTERVAL", 1000),
		SinceID:            getEnvInt64("SINCE_ID", 0),
		MinStars:           getEnvInt("MIN_STARS", 5),
		TargetLanguage:     getEnv("TARGET_LANGUAGE", "Python"),
		MaxPyFiles:         getEnvInt("MAX_PY_FILES", 5),
		MinPyFiles:         getEnvInt("MIN_PY_FILES", 1),
		MaxTotalFiles:      getEnvInt("MAX_TOTAL_FILES", 20),
		OfflineConcurrency: getEnvInt("OFFLINE_CONCURRENCY", 32),
		TokenizerModel:     getEnv("TOKENIZER_MODEL", "Qwen/Qwen2.5-Coder-32B-Instruct"),
	}

	// Derived paths: each stage gets its own subdirectory under DataDir.
	// filepath.Join (rather than string concatenation) normalizes separators,
	// so DATA_DIR="/data/" gives "/data/checkpoints", not "/data//checkpoints".
	cfg.CheckpointDir = filepath.Join(cfg.DataDir, "checkpoints")
	cfg.RawIndexDir = filepath.Join(cfg.DataDir, "raw_index")
	cfg.FilteredReposDir = filepath.Join(cfg.DataDir, "filtered_repos")
	cfg.RawPRsDir = filepath.Join(cfg.DataDir, "raw_prs")
	cfg.EnrichedPRsDir = filepath.Join(cfg.DataDir, "enriched_prs")
	cfg.RenderedTextDir = filepath.Join(cfg.DataDir, "rendered_text")
	cfg.TokenizedDatasetDir = filepath.Join(cfg.DataDir, "tokenized_dataset")
	cfg.TokenStatsDir = filepath.Join(cfg.DataDir, "token_stats")

	// Task 3 and Task 5 settings.
	cfg.PRBatchSize = getEnvInt("PR_BATCH_SIZE", 1000)
	cfg.MaxTokens = getEnvInt("MAX_TOKENS", 32000)
	cfg.RustTokenizerPath = getEnv("RUST_TOKENIZER_PATH", "./tokenizer/rust_worker/target/release/tokenizer_worker")
	cfg.RustWorkers = getEnvInt("RUST_WORKERS", 128)
	cfg.RustThreadsPerTokenizer = getEnvInt("RUST_THREADS_PER_TOKENIZER", 16)
	cfg.MaxMessageSize = getEnvInt("MAX_MESSAGE_SIZE", 100*1024*1024) // 100MB default

	// Task 6 LLM Enhancement settings
	cfg.LLMBaseURL = getEnv("LLM_BASE_URL", "http://localhost:8000")
	cfg.LLMAPIKey = getEnv("LLM_API_KEY", "")
	cfg.LLMModel = getEnv("LLM_MODEL", "Qwen/Qwen2.5-Coder-32B-Instruct")
	cfg.LLMConcurrency = getEnvInt("LLM_CONCURRENCY", 64)
	cfg.LLMEnhancedPRsDir = filepath.Join(cfg.DataDir, "llm_enhanced_prs")
	cfg.LLMTimeoutSeconds = getEnvInt("LLM_TIMEOUT_SECONDS", 120)

	return cfg
}

// getEnv returns the value of the environment variable named key, or
// defaultValue when that variable is unset or set to the empty string.
func getEnv(key, defaultValue string) string {
	fromEnv := os.Getenv(key)
	if fromEnv == "" {
		return defaultValue
	}
	return fromEnv
}

// getEnvInt reads the environment variable named key as a base-10 int.
// It returns defaultValue when the variable is unset, empty, or cannot be
// parsed by strconv.Atoi; a parse failure is deliberately not reported.
func getEnvInt(key string, defaultValue int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}

	parsed, err := strconv.Atoi(raw)
	if err != nil {
		// Best-effort: an unparsable value falls back to the default.
		return defaultValue
	}
	return parsed
}

// getEnvInt64 reads the environment variable named key as a base-10 int64.
// It returns defaultValue when the variable is unset, empty, or rejected by
// strconv.ParseInt (malformed or out of the 64-bit range); a parse failure is
// deliberately not reported.
func getEnvInt64(key string, defaultValue int64) int64 {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}

	parsed, err := strconv.ParseInt(raw, 10, 64)
	if err != nil {
		// Best-effort: an unparsable value falls back to the default.
		return defaultValue
	}
	return parsed
}
