from sklearn.cluster import BisectingKMeans
import numpy as np
import os, json
import argparse

# Command-line interface: the single option is the path to the JSON config.
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, default='/path/to/config', help='Config Path')
args = parser.parse_args()

# Load the run configuration. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(args.config, encoding="utf-8") as config_file:
    config = json.load(config_file)

# Number of clusters requested from the clustering step below.
k = config["clustering_k"]

# Fixed seed so repeated runs use the same random state; also applied to
# NumPy's global generator.
RANDOM_SEED = 23
np.random.seed(RANDOM_SEED)


if __name__ == "__main__":
    # Script entry point: cluster the stored embeddings with bisecting
    # k-means, then write the per-row labels and the centroids into the
    # working directory named by the config.
    work_dir = config["working_dir"]
    embeddings_path = os.path.join(work_dir, "full_dataset_embeddings.npy")

    # Map the embeddings file as a (dataset_len, 768) float32 array rather
    # than reading it fully into memory.
    # NOTE(review): np.memmap interprets the file as raw bytes from offset 0;
    # confirm the file was written as a raw dump and not via np.save, which
    # prepends a header.
    data = np.memmap(
        embeddings_path,
        dtype='float32',
        mode="r+",
        shape=(config["dataset_len"], 768),
    )
    print("Data shape: ", data.shape, flush=True)

    print("Seed = ", RANDOM_SEED, flush=True)
    np.random.seed(RANDOM_SEED)

    # NOTE(review): copy_x=False combined with the writable "r+" mapping
    # presumably lets the estimator modify the mapped file in place while
    # fitting -- confirm this is intended.
    estimator_options = dict(
        n_clusters=k,
        n_init=3,
        random_state=RANDOM_SEED,
        verbose=1,
        init="random",
        copy_x=False,
        bisecting_strategy="largest_cluster",
    )
    kmeans = BisectingKMeans(**estimator_options)
    kmeans.fit(data)
    labels = kmeans.labels_

    # Persist the results: labels first, then centroids.
    assignments_path = os.path.join(work_dir, "cluster_assignments.npy")
    centroids_path = os.path.join(work_dir, "github_cluster_centroids.npy")
    np.save(assignments_path, labels)
    np.save(centroids_path, kmeans.cluster_centers_)

    # Summarise how many rows landed in each cluster that received any.
    _, cluster_sizes = np.unique(labels, return_counts=True)
    smallest = cluster_sizes.min()
    largest = cluster_sizes.max()
    average = cluster_sizes.mean()
    print(f"Clustering completed with {k} clusters.")
    print(f"Cluster distribution: min={smallest}, max={largest}, avg={average:.1f}")
    print(f"Loss: {kmeans.inertia_:.2f}")
    print(f"Saved cluster assignments and centroids to {config['working_dir']}")

