import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the train and test AnnData objects from disk.
adata_train = sc.read("./input/cxg_immune_5k_train.h5ad")
adata_test = sc.read("./input/cxg_immune_5k_test.h5ad")

# Training features come from the "normalized" layer; the targets are the
# per-cell labels stored in obs["cell_type"].
X_train = adata_train.layers["normalized"]
y_train = adata_train.obs["cell_type"].values

# Test features are taken from the same layer so that both sets are
# represented the same way.
X_test = adata_test.layers["normalized"]

# Train a linear-kernel SVM classifier.
# NOTE: probability calibration is intentionally left off. Only hard labels
# from predict() are used below, and enabling `probability=True` makes fit()
# run an additional internal cross-validation without affecting predict().
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)

# Predict a label for every test cell.
y_pred = svm_model.predict(X_test)

# Store the predictions next to the test cells so that later steps can read
# them from obs["celltype_label"].
adata_test.obs["celltype_label"] = y_pred


# Evaluation function
def evaluate_predictions(adata_test):
    """Score the predicted labels stored on ``adata_test`` against the truth.

    Ground-truth labels are read from ``adata_test.obs["cell_type"]`` and
    predicted labels from ``adata_test.obs["celltype_label"]``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1_macro"``, ``"f1_micro"``
        and ``"f1_weighted"``.
    """
    obs = adata_test.obs
    truth = obs["cell_type"].tolist()
    predicted = obs["celltype_label"].tolist()

    # Fit a single encoder on the union of both label lists so that a given
    # label maps to the same integer code on the truth and prediction side.
    encoder = LabelEncoder()
    encoder.fit(np.unique(truth + predicted))
    truth_codes = encoder.transform(truth)
    predicted_codes = encoder.transform(predicted)

    # Accuracy first, then one F1 entry per averaging mode.
    scores = {"accuracy": accuracy_score(truth_codes, predicted_codes)}
    scores.update(
        (
            f"f1_{averaging}",
            f1_score(truth_codes, predicted_codes, average=averaging),
        )
        for averaging in ("macro", "micro", "weighted")
    )
    return scores


# Score the stored predictions and report the resulting metrics dict.
metrics = evaluate_predictions(adata_test)
print(metrics)

# Build the submission table: one row per test cell, pairing the obs index
# (written as "cell_id") with its predicted label.
submission_columns = {
    "cell_id": adata_test.obs.index,
    "celltype_label": adata_test.obs["celltype_label"],
}
submission_df = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the CSV; the cell identifiers
# are already present in the "cell_id" column.
submission_df.to_csv("./working/submission.csv", index=False)
