import pyspark.sql.functions as F
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.types import *
import numpy as np
import sys
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col
from pyspark.sql.functions import row_number

# Reuse the active Spark session if one exists, otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Load the two parquet inputs named by the first two command-line arguments.
# NOTE(review): presumably argv[1] holds the clusters and argv[2] the
# transformed data, in that order -- confirm against the job that writes them.
df = spark.read.parquet(sys.argv[1])
df2 = spark.read.parquet(sys.argv[2])

# Distance UDF
def func(x, y):
    """Return the squared Euclidean distance between ``x`` and ``y``.

    Walks the positions of ``x`` and sums ``(x[i] - y[i]) ** 2``; no square
    root is taken. Only the first ``len(x)`` entries of ``y`` are read, so
    extra trailing entries in ``y`` are ignored, while a ``y`` shorter than
    ``x`` fails when it is indexed past its end. An empty ``x`` yields ``0``.
    """
    return sum((x_i - y[pos]) ** 2 for pos, x_i in enumerate(x))

# Squared-distance UDF. The return type is declared explicitly because F.udf
# defaults to StringType: that would make "dist" a string column, and any
# ordering on it would be lexicographic (e.g. "10.0" sorts before "9.0")
# instead of numeric. DoubleType comes from the pyspark.sql.types star import.
# float() normalises int or numpy scalar results to a plain Python float so
# the returned value matches the declared DoubleType.
distance = F.udf(lambda x, y: float(func(x, y)), DoubleType())

# Pair up rows of df and df2 where "id" equals "prediction", then attach the
# squared distance between each row's "features" and "centroid" as "dist".
res = df.join(df2, F.col("id") == F.col("prediction")).withColumn(
    "dist", distance(F.col("features"), F.col("centroid"))
)

# Per-cluster rank threshold: half of the count given as the third argument.
# True division is used, so an odd count gives a fractional threshold
# (e.g. 5 -> 2.5, which keeps ranks 1 and 2).
half_count = int(sys.argv[3]) / 2

# Both rankings are computed within each "prediction" group.
per_cluster = Window.partitionBy(res['prediction'])

# Get inner datapoints: smallest "dist" first within each cluster.
# NOTE(review): rank() gives tied rows the same rank, so a cluster can
# contribute more rows than the threshold when distances tie -- confirm
# that this is intended.
window = per_cluster.orderBy(res['dist'].asc())
inner_ranked = res.select('*', rank().over(window).alias('rank'))
topvals1 = inner_ranked.filter(col('rank') <= half_count)

# Get outer datapoints: largest "dist" first within each cluster.
window = per_cluster.orderBy(res['dist'].desc())
outer_ranked = res.select('*', rank().over(window).alias('rank'))
topvals2 = outer_ranked.filter(col('rank') <= half_count)

# Write out the "_c0" / "_c1" columns of both selections as tab-delimited CSV.
# NOTE(review): union() keeps duplicates, so a row selected by both rankings
# (possible in small clusters) is written twice -- verify this is acceptable.
inner_out = topvals1.select(F.col("_c0"), F.col("_c1"))
outer_out = topvals2.select(F.col("_c0"), F.col("_c1"))
combined = inner_out.union(outer_out)
combined.write.option("delimiter", "\t").csv(sys.argv[4])

