import warnings
from typing import Dict

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge, LogisticRegression, HuberRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler


def to_dense(X):
    """Return ``X.toarray()`` if ``X`` exposes ``toarray``; otherwise ``X`` itself.

    Kept as a module-level function so it can be handed to
    ``FunctionTransformer`` by reference.
    """
    if hasattr(X, "toarray"):
        return X.toarray()
    return X


class OrdinalLogitRegressor(BaseEstimator, RegressorMixin):
    """Ordinal regressor made of one binary logistic model per threshold.

    For every ``thr`` in ``thresholds`` a ``LogisticRegression`` is trained on
    the binary target ``y >= thr``. ``predict`` returns
    ``1 + sum_k P(y >= thr_k)``, i.e. the expected rating when ratings are the
    integers ``1 .. len(thresholds) + 1`` (the default thresholds correspond
    to a 1-5 scale).

    Parameters
    ----------
    thresholds : sequence of float
        Cut points; one binary model is fitted per entry.
    C, class_weight, max_iter, solver, random_state
        Forwarded unchanged to every ``LogisticRegression``.

    Attributes
    ----------
    models_ : list or None
        Fitted per-threshold classifiers, in the order of ``thresholds``;
        ``None`` until ``fit`` has been called.
    """

    def __init__(self, thresholds=(2.0, 3.0, 4.0, 5.0), C=1.0, class_weight="balanced", max_iter=1000, solver="liblinear", random_state=42):
        self.thresholds = thresholds
        self.C = C
        self.class_weight = class_weight
        self.max_iter = max_iter
        self.solver = solver
        self.random_state = random_state
        self.models_ = None

    def fit(self, X, y, sample_weight=None):
        """Fit one binary classifier per threshold and return ``self``.

        Thresholds whose binary target contains a single class (for example
        no training sample reaches the top rating) are handled with a
        constant-probability model instead of raising.
        """
        y = np.asarray(y)
        # Build the full list first so ``models_`` is only replaced once
        # every threshold has been fitted successfully.
        fitted = [
            self._fit_threshold(X, (y >= thr).astype(int), sample_weight)
            for thr in self.thresholds
        ]
        self.models_ = fitted
        return self

    def _fit_threshold(self, X, y_bin, sample_weight):
        """Fit and return the classifier for a single binarised target."""
        if np.unique(y_bin).size < 2:
            # LogisticRegression refuses single-class targets; a prior-based
            # dummy yields the matching constant probability (0 or 1).
            constant = DummyClassifier(strategy="prior")
            constant.fit(X, y_bin, sample_weight=sample_weight)
            return constant

        lr = LogisticRegression(
            C=self.C,
            class_weight=self.class_weight,
            max_iter=self.max_iter,
            solver=self.solver,
            random_state=self.random_state,
        )
        if sample_weight is None:
            lr.fit(X, y_bin)
            return lr
        try:
            lr.fit(X, y_bin, sample_weight=sample_weight)
        except TypeError:
            # Best-effort fallback kept from the original implementation, but
            # no longer silent: the caller's weights are being dropped.
            warnings.warn(
                "fit with sample_weight raised TypeError; "
                "retrying without sample weights",
                RuntimeWarning,
                stacklevel=2,
            )
            lr.fit(X, y_bin)
        return lr

    @staticmethod
    def _positive_proba(model, X):
        """Return ``P(target == 1)`` for each row of ``X`` under ``model``."""
        if not hasattr(model, "predict_proba"):
            # fallback via decision function -> sigmoid
            z = model.decision_function(X)
            return 1.0 / (1.0 + np.exp(-z))

        proba = model.predict_proba(X)
        classes = getattr(model, "classes_", None)
        if classes is None:
            return proba[:, 1]
        # Look the positive class up explicitly: a single-class model has
        # only one probability column, so index 1 may not exist.
        hits = np.flatnonzero(np.asarray(classes) == 1)
        if hits.size == 0:
            return np.zeros(proba.shape[0])
        return proba[:, hits[0]]

    def predict(self, X):
        """Return ``1 + sum_k P(y >= thr_k)`` for every sample in ``X``.

        Raises
        ------
        ValueError
            If ``models_`` is empty or ``None`` (``fit`` not called).
        """
        if not self.models_:
            raise ValueError("Model not fitted")
        # Expected rating E[R] = 1 + sum_{k in thresholds} P(R >= thr_k)
        ps = np.vstack([self._positive_proba(m, X) for m in self.models_])  # (n_thr, n_samples)
        return 1.0 + np.sum(ps, axis=0)


def make_models(task: str, random_state: int = 42) -> Dict[str, Pipeline]:
    """Build the named candidate pipelines for ``task``.

    Every pipeline starts with a sparse ``DictVectorizer`` and ends with an
    ``"est"`` step. Linear models on sparse input are scaled without
    centering; the remaining models receive a densified matrix, optionally
    followed by a centering ``StandardScaler``.

    Parameters
    ----------
    task : str
        Either ``"regression"`` or ``"classification"``.
    random_state : int
        Seed passed to every estimator that accepts one.

    Returns
    -------
    Dict[str, Pipeline]
        Freshly constructed, unfitted pipelines keyed by model name.

    Raises
    ------
    ValueError
        If ``task`` is neither of the two supported values.
    """

    def sparse_scaled(estimator):
        # vectorise -> scale (no centering, keeps sparsity) -> estimator
        return Pipeline([
            ("vec", DictVectorizer(sparse=True)),
            ("scale", StandardScaler(with_mean=False)),
            ("est", estimator),
        ])

    def densified(estimator, centered=False):
        # vectorise -> dense array -> (optional centering scaler) -> estimator
        steps = [
            ("vec", DictVectorizer(sparse=True)),
            ("to_dense", FunctionTransformer(to_dense, accept_sparse=True)),
        ]
        if centered:
            steps.append(("scale", StandardScaler(with_mean=True)))
        steps.append(("est", estimator))
        return Pipeline(steps)

    if task == "regression":
        return {
            "ridge": sparse_scaled(Ridge(alpha=1.0, random_state=random_state)),
            "huber": densified(HuberRegressor(), centered=True),
            "ordinal_logit": densified(
                OrdinalLogitRegressor(C=2.0, class_weight="balanced", random_state=random_state),
                centered=True,
            ),
            "rf_reg": densified(
                RandomForestRegressor(n_estimators=400, random_state=random_state, n_jobs=-1)
            ),
            "hgb_reg": densified(HistGradientBoostingRegressor(random_state=random_state)),
        }

    if task == "classification":
        return {
            "logreg": sparse_scaled(
                LogisticRegression(
                    max_iter=1000,
                    solver="liblinear",
                    class_weight="balanced",
                    random_state=random_state,
                )
            ),
            "logreg_l1": sparse_scaled(
                LogisticRegression(
                    max_iter=1000,
                    solver="liblinear",
                    penalty="l1",
                    C=1.0,
                    class_weight="balanced",
                    random_state=random_state,
                )
            ),
            "rf_clf": densified(
                RandomForestClassifier(
                    n_estimators=400,
                    random_state=random_state,
                    n_jobs=-1,
                    class_weight="balanced_subsample",
                )
            ),
            "hgb_clf": densified(HistGradientBoostingClassifier(random_state=random_state)),
        }

    raise ValueError(f"Unknown task: {task}")
