

"""
Compute length-based percentiles of a large JSONL file (per-line JSON strings).

- Length = number of characters excluding line endings (handles \\n and \\r\\n).
- By default computes percentiles at 5,10,...,100 and the max.
- Default percentile definition is "nearest-rank".
"""

import argparse 
import io 
import math 
import itertools 
from typing import List ,Dict ,Iterable ,Optional 

def parse_percents(s: str) -> List[int]:
    """
    Parse a comma-separated list of integer percents.

    Blank entries (e.g. from a trailing comma) are skipped, duplicates are
    collapsed, and the result is returned in ascending order.

    Raises:
        argparse.ArgumentTypeError: if a value lies outside 0..100 or no
            value is given at all.
        ValueError: if an entry is not an integer literal (raised by int()).
    """
    unique = set()
    for token in map(str.strip, s.split(",")):
        if not token:
            continue
        value = int(token)
        if value < 0 or value > 100:
            raise argparse.ArgumentTypeError(f"Percent out of range: {value}")
        unique.add(value)

    if not unique:
        raise argparse.ArgumentTypeError("Empty --percents.")
    return sorted(unique)

def nearest_rank_percentile(sorted_vals: List[int], p: int) -> int:
    """
    Return the p-th percentile of ascending ``sorted_vals`` (nearest-rank).

    The 1-based rank is ceil(p/100 * n), clamped to 1..n. Any p <= 0 yields
    the first element and any p >= 100 yields the last one.

    Raises:
        IndexError: if ``sorted_vals`` is empty.
    """
    # The extremes are answered directly, without computing a rank.
    if p <= 0:
        return sorted_vals[0]
    if p >= 100:
        return sorted_vals[-1]

    count = len(sorted_vals)
    rank = math.ceil(p * count / 100.0)
    # Clamp to the top first, then to the bottom.
    rank = min(count, rank)
    rank = max(1, rank)
    return sorted_vals[rank - 1]

def linear_percentile(sorted_vals: List[int], p: int) -> float:
    """
    Linear interpolation between closest ranks (like numpy.percentile(method='linear')).

    Args:
        sorted_vals: values in ascending order (must be non-empty).
        p: percent; values <= 0 map to the first element and values >= 100
            to the last one.

    Returns:
        The interpolated percentile as a float.

    Raises:
        IndexError: if ``sorted_vals`` is empty.
    """
    n = len(sorted_vals)
    if n == 1:
        return float(sorted_vals[0])
    if p <= 0:
        return float(sorted_vals[0])
    if p >= 100:
        return float(sorted_vals[-1])

    # Multiply before dividing: for integer p the product is exact, so a
    # position that falls exactly on a rank stays an integer. The previous
    # (p / 100.0) * (n - 1) form rounded p/100 first and could produce
    # e.g. 55.00000000000001 for p=55, n=101, yielding a slightly-off result.
    pos = p * (n - 1) / 100.0
    lo = math.floor(pos)
    hi = math.ceil(pos)
    if lo == hi:
        return float(sorted_vals[lo])

    frac = pos - lo
    return sorted_vals[lo] * (1.0 - frac) + sorted_vals[hi] * frac


class P2Quantiles:
    """
    Multi-quantile P^2 estimator for percentiles in percents (0..100).
    Keeps constant memory per requested percentile.
    Good when we cannot store all values; returns approximate results.

    One independent five-marker estimator is kept per requested percent.
    The exact minimum and maximum are tracked as well and are reported for
    percents <= 0 and >= 100 respectively, since those need no estimation.
    """

    def __init__(self, percents: List[int]):
        # Keep the caller's percents verbatim: they are the keys of results().
        # Rebuilding them from the float quantiles (int(q * 100)) truncates,
        # e.g. 29 -> 28, and makes the entry unreachable for the caller.
        self.percents = list(percents)
        # Quantiles as fractions, clamped into [0, 1].
        self.qs = [max(0.0, min(1.0, p / 100.0)) for p in self.percents]
        self.estimators = [self._P2(q) for q in self.qs]
        self.count = 0
        self.min_seen: Optional[int] = None
        self.max_seen: Optional[int] = None

    class _P2:
        """
        Single-quantile P^2 estimator.

        The first five observations are buffered in ``x``. After that, five
        markers are maintained: heights ``qi``, actual positions ``ni``
        (1-based ranks) and desired positions ``di``. The middle marker's
        height is the quantile estimate.
        """

        def __init__(self, q: float):
            self.q = q
            self.n = 0          # observations buffered so far (stops at 5)
            self.x = []         # the first (up to five) raw observations
            self.qi = [0.0] * 5
            self.ni = [0] * 5
            self.di = [0.0] * 5
            # Amount each desired position advances per new observation.
            self.dn = (0.0, q / 2, q, (1 + q) / 2, 1.0)

        def add(self, v: int):
            """Feed one observation into the estimator."""
            if self.n < 5:
                self.x.append(v)
                self.n += 1
                if self.n == 5:
                    self.x.sort()
                    self.qi = [float(x) for x in self.x]
                    self.ni = [1, 2, 3, 4, 5]
                    q = self.q
                    # Initial desired positions of the P^2 algorithm. These
                    # are distinct from the per-step increments in self.dn;
                    # starting from the increments instead makes every
                    # desired position lag behind by a constant.
                    self.di = [1.0, 1.0 + 2.0 * q, 1.0 + 4.0 * q, 3.0 + 2.0 * q, 5.0]
                return

            qi, ni, di = self.qi, self.ni, self.di

            # Locate the cell k with qi[k] <= v < qi[k + 1], stretching the
            # extreme markers when v falls outside the current range.
            if v < qi[0]:
                qi[0] = float(v)
                k = 0
            elif v >= qi[4]:
                qi[4] = float(v)
                k = 3
            else:
                k = 0
                while k < 4 and v >= qi[k + 1]:
                    k += 1

            # Every marker above the cell moves one rank up.
            for i in range(k + 1, 5):
                ni[i] += 1
            for i, step in enumerate(self.dn):
                di[i] += step

            # Nudge the three interior markers towards their desired ranks.
            for i in range(1, 4):
                d = di[i] - ni[i]
                if d >= 1 and ni[i + 1] - ni[i] > 1:
                    dsign = 1
                elif d <= -1 and ni[i - 1] - ni[i] < -1:
                    dsign = -1
                else:
                    continue

                right_gap = ni[i + 1] - ni[i]
                left_gap = ni[i] - ni[i - 1]
                right_slope = (qi[i + 1] - qi[i]) / right_gap
                left_slope = (qi[i] - qi[i - 1]) / left_gap
                # Piecewise-parabolic prediction of the new height.
                qip = qi[i] + dsign * (
                    (left_gap + dsign) * right_slope
                    + (right_gap - dsign) * left_slope
                ) / (ni[i + 1] - ni[i - 1])
                if qi[i - 1] < qip < qi[i + 1]:
                    qi[i] = qip
                else:
                    # The parabola would break marker ordering; fall back to
                    # linear interpolation towards the neighbour.
                    qi[i] = qi[i] + dsign * (qi[i + dsign] - qi[i]) / (ni[i + dsign] - ni[i])
                ni[i] += dsign

        def value(self) -> Optional[float]:
            """Return the current estimate, or None if nothing was added."""
            if self.n < 5:
                if self.n == 0:
                    return None
                # Too few samples for markers: pick from the sorted buffer.
                xs = sorted(self.x)
                idx = int(round(self.q * (len(xs) - 1)))
                return float(xs[idx])
            return float(self.qi[2])

    def add(self, v: int):
        """Feed one observation to every per-percent estimator."""
        self.count += 1
        if self.min_seen is None or v < self.min_seen:
            self.min_seen = v
        if self.max_seen is None or v > self.max_seen:
            self.max_seen = v
        for est in self.estimators:
            est.add(v)

    def results(self) -> Dict[int, float]:
        """
        Return ``{percent: estimate}`` keyed by the percents passed to the
        constructor. The mapping is empty when no value has been added.
        """
        out = {}
        for p, est in zip(self.percents, self.estimators):
            if self.count and p <= 0:
                val = float(self.min_seen)   # exact, no estimation needed
            elif self.count and p >= 100:
                val = float(self.max_seen)   # exact, no estimation needed
            else:
                val = est.value()
            if val is not None:
                out[p] = val
        return out

def main():
    """
    Command-line entry point.

    Reads the file line by line, measures each line without its trailing
    line-ending characters, and prints a small header followed by a CSV
    table of percentiles. In exact mode all lengths are kept and sorted; in
    ``--approx p2`` mode they are streamed into a P2Quantiles estimator.
    """
    parser = argparse.ArgumentParser(description="Length-based percentiles of JSONL lines.")
    parser.add_argument("path", help="Path to the JSONL file")

    percent_default = ",".join(str(step) for step in range(5, 101, 5))
    parser.add_argument(
        "--percents",
        type=parse_percents,
        default=percent_default,
        help=f"Comma-separated percents (0–100). Default: {percent_default}",
    )
    parser.add_argument(
        "--encoding",
        default="utf-8",
        help="Text encoding (default: utf-8)",
    )
    parser.add_argument(
        "--method",
        default="nearest",
        choices=["nearest", "linear"],
        help="Percentile method: nearest (default) or linear (interpolation)",
    )
    parser.add_argument(
        "--approx",
        default="off",
        choices=["off", "p2"],
        help="Use approximate estimator (P^2) to avoid storing all lengths. Default: off",
    )
    args = parser.parse_args()

    estimator: Optional[P2Quantiles] = (
        P2Quantiles(args.percents) if args.approx == "p2" else None
    )
    exact_lengths: List[int] = []
    line_count = 0
    longest = 0
    length_sum = 0

    # newline="" keeps line endings untranslated so they can be stripped here.
    with io.open(args.path, "r", encoding=args.encoding, newline="") as handle:
        for raw in handle:
            size = len(raw.rstrip("\r\n"))
            line_count += 1
            longest = max(longest, size)
            length_sum += size
            if estimator is not None:
                estimator.add(size)
            else:
                exact_lengths.append(size)

    if line_count == 0:
        print("No lines found.")
        return

    print(f"# File: {args.path}")
    print(f"# Total lines: {line_count}")
    print(f"# Max length: {longest}")
    if estimator is None and length_sum > 0:
        print(f"# Total length sum: {length_sum}")

    # Approximate mode: report whatever the estimator produced.
    if estimator is not None:
        estimates = estimator.results()
        print("# Approximation method: P^2 (constant memory)")
        print("# Cumulative sum percentage is not available in approximation mode.")
        print("percent,percentile_length_estimate")
        for pct in args.percents:
            estimate = estimates.get(pct, None)
            shown = "NA" if estimate is None else estimate
            print(f"{pct},{shown}")
        return

    exact_lengths.sort()

    if args.method != "nearest":
        print("percent,percentile_length")
        print("# Cumulative sum percentage is only shown for 'nearest' method.")
        for pct in args.percents:
            print(f"{pct},{linear_percentile(exact_lengths, pct)}")
        return

    # Nearest-rank mode also reports which share of all characters lies in
    # lines up to (and including) the percentile's rank.
    running_totals = list(itertools.accumulate(exact_lengths))
    how_many = len(exact_lengths)
    print("percent,percentile_length,cumulative_sum_perc")
    for pct in args.percents:
        length_at = nearest_rank_percentile(exact_lengths, pct)
        rank = max(1, min(how_many, math.ceil(pct * how_many / 100.0)))
        covered = running_totals[rank - 1]
        if length_sum > 0:
            share = covered / length_sum * 100.0
        else:
            share = 0.0
        print(f"{pct},{length_at},{share:.2f}")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ =="__main__":
    main ()
