import numpy as np
import pandas as pd
import scanpy as sc
import umap
from sklearn.neighbors import NearestNeighbors

# Read the two modality files from the input directory.
adata_mod1 = sc.read("./input/scicar_mouse_kidney_mod1_subset.h5ad")
adata_mod2 = sc.read("./input/scicar_mouse_kidney_mod2_subset.h5ad")

# Take the matrices stored under the "normalized" layer of each object.
# No additional normalization is performed in this script.
X_mod1, X_mod2 = adata_mod1.layers["normalized"], adata_mod2.layers["normalized"]

# Embed each modality into 50 UMAP components with a fixed seed. The same
# estimator object is used for both calls, each call fitting on the matrix it
# is given, so the two embeddings are produced independently of one another.
umap_model = umap.UMAP(n_components=50, random_state=42)
X_integrated_mod1 = umap_model.fit_transform(X_mod1)
X_integrated_mod2 = umap_model.fit_transform(X_mod2)

# Attach each embedding to its own object under the "X_integrated" key, which
# is where compute_knn_auc() looks for it.
adata_mod2.obsm["X_integrated"] = X_integrated_mod2
adata_mod1.obsm["X_integrated"] = X_integrated_mod1


# Define the evaluation function
def compute_knn_auc(adata_mod1, adata_mod2, k=30):
    """Return the mean k-nearest-neighbour overlap between two embeddings.

    For every row index, the ``k`` nearest neighbours are looked up
    separately in ``adata_mod1.obsm["X_integrated"]`` and in
    ``adata_mod2.obsm["X_integrated"]``. The per-row score is the fraction of
    neighbour indices the two lists have in common, and the function returns
    the mean of those fractions. Despite the function's name, no area under a
    curve is computed.

    Rows are matched purely by position: row ``i`` of the first embedding is
    compared with row ``i`` of the second.

    Args:
        adata_mod1: Object whose ``obsm["X_integrated"]`` holds the first
            embedding as a 2-D array (rows are observations).
        adata_mod2: Object whose ``obsm["X_integrated"]`` holds the second
            embedding, with the same number of rows as the first.
        k: Number of neighbours per row, not counting the row itself.

    Returns:
        The mean overlap fraction, a value between 0 and 1.

    Raises:
        ValueError: If the two embeddings have different numbers of rows, or
            if ``k`` does not satisfy ``1 <= k < n_cells``.
    """
    emb1 = adata_mod1.obsm["X_integrated"]
    emb2 = adata_mod2.obsm["X_integrated"]

    n_cells = emb1.shape[0]
    if emb2.shape[0] != n_cells:
        raise ValueError(
            "Embeddings must have the same number of rows, got "
            f"{n_cells} and {emb2.shape[0]}."
        )
    if not 1 <= k < n_cells:
        raise ValueError(
            f"k must satisfy 1 <= k < n_cells (n_cells={n_cells}), got k={k}."
        )

    # When kneighbors() is called without a query array, scikit-learn already
    # leaves each point out of its own neighbour list. Asking for exactly k
    # neighbours and keeping every column therefore yields the k nearest
    # *other* points; requesting k + 1 and dropping the first column would
    # throw away the true nearest neighbour instead of the point itself.
    neighbors1 = (
        NearestNeighbors(n_neighbors=k).fit(emb1).kneighbors(return_distance=False)
    )
    neighbors2 = (
        NearestNeighbors(n_neighbors=k).fit(emb2).kneighbors(return_distance=False)
    )

    # Fraction of shared neighbour indices for each row, compared as sets so
    # that neighbour order does not matter.
    overlaps = [
        len(set(row1) & set(row2)) / k
        for row1, row2 in zip(neighbors1, neighbors2)
    ]
    return np.mean(overlaps)


# Score the two stored embeddings using the function's default k.
auc_score = compute_knn_auc(adata_mod1, adata_mod2)
print(f"KNN AUC Score: {auc_score}")

# Write the score to a one-row, one-column CSV without the index column.
# Note that this file holds only the single score, not per-cell predictions.
submission_df = pd.DataFrame([[auc_score]], columns=["KNN_AUC_Score"])
submission_df.to_csv(path_or_buf="./working/submission.csv", index=False)
