import numpy as np
import pandas as pd
import anndata as ad
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.metrics

# Read the merged dataset (single-cell and spatial observations together)
INPUT_H5AD = "./input/TMSD_LUNG_1.h5ad"
adata_merged = ad.read_h5ad(INPUT_H5AD)


# Split into single cell and spatial data
def split_sc_and_sp(adata_merged: ad.AnnData, batch_key: str = "modality"):
    """Separate a merged AnnData into single-cell and spatial parts.

    Observations whose ``obs[batch_key]`` equals ``"sp"`` make up the spatial
    part; every remaining observation goes to the single-cell part. Both
    parts are returned as independent copies, not views of the input.

    Args:
        adata_merged: Object holding both kinds of observations.
        batch_key: Name of the ``obs`` column that marks the modality.

    Returns:
        A ``(adata_sc, adata_sp)`` tuple: single-cell first, spatial second.
    """
    spatial_mask = adata_merged.obs[batch_key] == "sp"
    single_cell = adata_merged[~spatial_mask, :].copy()
    spatial = adata_merged[spatial_mask, :].copy()
    return single_cell, spatial


adata_sc, adata_sp = split_sc_and_sp(adata_merged)

# Prepare the data for modeling: the single-cell expression matrix is the
# feature matrix and the "label" column of its observations is the target.
X = adata_sc.X
y = adata_sc.obs["label"].values

# Encode labels as one-hot.
# `cell_types` is the single source of truth for the column order. Wrapping
# the labels in a Categorical with exactly these categories guarantees that
# one-hot column i corresponds to cell_types[i]. Without this, a categorical
# label column would be encoded in its own category order (and would include
# unobserved categories), which need not match np.unique(y) and would
# mislabel - or break - any table whose header is built from `cell_types`.
cell_types = np.unique(y)
y_one_hot = pd.get_dummies(
    pd.Categorical(y, categories=cell_types),
    prefix="cell_type",
    dtype=float,  # explicit numeric targets instead of version-dependent bool/uint8
).values

# Train-test split (fixed seed so the held-out rows are reproducible)
X_train, X_val, y_train, y_val = train_test_split(
    X, y_one_hot, test_size=0.2, random_state=42
)

# Fit random forest regressor on the one-hot targets (multi-output regression)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Use the held-out single-cell rows, which were previously split off but never
# looked at, as an in-domain sanity check. Scoring does not modify the model,
# so the spatial predictions below are unaffected.
val_r2 = model.score(X_val, y_val)
print(f"Validation R² (held-out single cells): {val_r2}")

# Predict proportions for spatial data; columns follow the order of `cell_types`
proportions_pred = model.predict(adata_sp.X)

# Store predictions in adata_sp
# NOTE(review): assumes the columns of obsm["proportions_true"] use this same
# cell-type order - confirm against the dataset before trusting the score.
adata_sp.obsm["proportions_pred"] = proportions_pred


# Evaluation
def r2(adata):
    """Return the R² score of predicted versus true proportions.

    Reads ``adata.obsm["proportions_true"]`` and
    ``adata.obsm["proportions_pred"]`` and scores them with
    ``sklearn.metrics.r2_score``, averaging the per-output scores uniformly
    and applying no sample weights.
    """
    reference = adata.obsm["proportions_true"]
    estimate = adata.obsm["proportions_pred"]
    score = sklearn.metrics.r2_score(
        y_true=reference,
        y_pred=estimate,
        sample_weight=None,
        multioutput="uniform_average",
    )
    return score


# Score the predictions stored on the spatial data against its true proportions
r2_score = r2(adata_sp)
print(f"R² Score: {r2_score}")

# Write the predictions for grading: one column per cell type, no index column
column_names = [f"cell_type_{ct}" for ct in cell_types]
submission_df = pd.DataFrame(data=proportions_pred, columns=column_names)
submission_df.to_csv("./working/submission.csv", index=False)
