

import numpy as np
# import pandas as pd
import scipy.special as ssp
import scipy.stats as sst
import scipy.optimize as sop
from scipy.interpolate import InterpolatedUnivariateSpline as inter
from scipy import sparse as spa
import itertools as it
import sympy as sy
from IPython.display import display, Math
from sklearn import linear_model as lm
from sklearn.metrics import mutual_info_score # https://stackoverflow.com/questions/20491028/optimal-way-to-compute-pairwise-mutual-information-using-numpy
import statsmodels.api as sm
import matplotlib.pyplot as pp
from matplotlib import cm
from subprocess import DEVNULL, STDOUT, check_call # to call shell scripts from python
import re
from tqdm.contrib.concurrent import process_map # requires ipywidgets
from collections import abc
from tqdm import tqdm
import warnings

# Timeout function if it takes too long to finish
# https://stackoverflow.com/questions/2281850/timeout-function-if-it-takes-too-long-to-finish
import errno
import os
import signal
import functools
# helper function for below
class TimeoutError(Exception):
    """Signals that a time-limited call did not finish within its budget.

    NOTE: inside this module the name shadows Python's built-in ``TimeoutError``;
    this class derives directly from ``Exception``.
    """
# this function throws an error if applied to a function that takes longer than 'seconds'
# e.g. :
    # @timeout_deco(5)
    # def long_running_function2():
    #     ...
# note that this is not thread-safe: if you're using multithreading, the signal will get caught by a random thread. For single-threaded programs though, this is the easiest solution.
# ... it may only work in main thread, raising 'ValueError: signal only works in main thread' in worker thread
def timeout_deco(seconds=10, error_message=os.strerror(errno.ETIME)):
    """Decorator factory that aborts the decorated call after `seconds` seconds.

    The time limit is enforced with ``signal.alarm`` / ``SIGALRM``: when the alarm
    fires, ``TimeoutError(error_message)`` is raised inside the running call.

    Parameters
    ----------
    seconds : int
        Time budget passed to ``signal.alarm``.
    error_message : str
        Message carried by the raised ``TimeoutError``.

    Returns
    -------
    callable
        A decorator; the wrapped function returns whatever the original returns.

    Notes
    -----
    Whatever SIGALRM handler was installed before the call is put back afterwards,
    so using this decorator no longer permanently replaces a handler set elsewhere.
    Timed sections must not be nested: there is a single process-wide alarm.
    """
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutError(error_message)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # signal.signal returns the handler that was active before
            previous_handler = signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                return func(*args, **kwargs)
            finally:
                signal.alarm(0) # cancel a still-pending alarm
                if previous_handler is not None: # None: handler was not installed from Python, cannot be re-installed
                    signal.signal(signal.SIGALRM, previous_handler)

        return wrapper

    return decorator
# same (and I think smarter) for with-environment
class timeout:
    """Context manager that raises ``TimeoutError`` if its body runs longer than `seconds`.

    Uses ``signal.alarm`` / ``SIGALRM``. The SIGALRM handler that was active on
    entry is restored on exit, so a handler installed elsewhere is not lost.
    Blocks must not be nested: there is a single process-wide alarm.

    Parameters
    ----------
    seconds : int
        Time budget passed to ``signal.alarm``.
    error_message : str
        Message carried by the raised ``TimeoutError``.
    """
    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message
        self._previous_handler = None # handler to put back in __exit__
    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)
    def __enter__(self):
        # signal.signal returns the handler that was active before
        self._previous_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)
        return self
    def __exit__(self, type, value, traceback):
        signal.alarm(0) # cancel a still-pending alarm
        if self._previous_handler is not None: # None: handler was not installed from Python, cannot be re-installed
            signal.signal(signal.SIGALRM, self._previous_handler)
            self._previous_handler = None


# central moving average (window is smaller at boundaries such that output shape is same as input shape (so less averaging at boundaries))
# window size to left and right is wsize (2*wsize+1 is total size), x is input of arbitrary shape, last axis will be averaged
def cma(x,wsize,edge='centred'):
    """Moving average along the last axis; output has the same shape as `x`.

    Parameters
    ----------
    x : numpy array of arbitrary shape
        The last axis is averaged.
    wsize : int
        Half window width; a full window covers ``2*wsize+1`` samples.
    edge : str
        How windows are treated near the boundaries:
        'centred' - window stays centred and shrinks symmetrically at the edges;
        'sized'   - window keeps its size and becomes off-centre at the edges;
        'part'    - only the part of the centred window inside the data is used;
        'right'   - sample position is the right end of the window (grows in from the left);
        'left'    - sample position is the left end of the window (slides out on the right).
        Any other value leaves the result filled with NaN.

    Returns
    -------
    numpy float array of ``x.shape``.

    Notes
    -----
    Window bounds are clipped to the data. Without clipping, a window wider than
    the data produced a negative start index, which wrapped around to the end of
    the array and averaged the wrong samples.
    """
    n = x.shape[-1]
    xavgd = np.full(x.shape, np.nan)
    for ix in range(n):
        if edge=='centred':
            wstep = min(wsize, ix, n-1-ix) # largest symmetric half width that fits on both sides
            lo, hi = ix-wstep, ix+wstep+1
        elif edge=='sized':
            if ix < n/2: # in first half: what is missing on the left is added on the right
                wstepl = min(ix,wsize)
                wstepr = 2*wsize - wstepl
            else: # in second half: what is missing on the right is added on the left
                wstepr = min(n-1-ix,wsize)
                wstepl = 2*wsize - wstepr
            lo, hi = ix-wstepl, ix+wstepr+1
        elif edge=='part':
            lo, hi = ix-wsize, ix+wsize+1
        elif edge=='right':
            lo, hi = ix-2*wsize, ix+1
        elif edge=='left':
            lo, hi = ix, ix+2*wsize+1
        else:
            break # unknown edge mode: keep NaN output
        lo, hi = max(0,lo), min(n,hi) # clip to data, never let a negative start wrap around
        xavgd[...,ix] = x[...,lo:hi].mean(-1)
    return xavgd


def ols_eval(X,y): # from MontgomeryPeckVining
    """Ordinary least squares fit of `y` on design matrix `X` plus summary statistics.

    `X` is expected as an n x p design matrix (observations in rows). The
    result is a namespace class `res` with attributes
    C (inverse of X'X), P (C X'), SStot, westi (weight estimates), SSres,
    vesti (residual sum of squares divided by n), westi_var, Rsq and Rbarsq.
    """
    n_obs, n_par = X.shape
    gram_inv = np.linalg.inv( X.T @ X )
    projector = gram_inv @ X.T
    weights = projector @ y
    ss_total = n_obs*y.var() # total sum of squares
    ss_resid = ( y.T - weights.T @ X.T ) @ y # residual sum of squares, eq (3.16) in MPV
    noise_var = ss_resid / n_obs # divides by n (not n-p): MLE-type estimate of the noise variance
    r_squared = 1 - ss_resid/ss_total
    r_squared_adj = 1 - (1-r_squared) * (n_obs-1)/(n_obs-n_par-1)

    class res:
        C = gram_inv
        P = projector
        SStot = ss_total
        westi = weights
        SSres = ss_resid
        vesti = noise_var
        westi_var = noise_var * gram_inv.diagonal() # variance of the weight estimates
        Rsq = r_squared
        Rbarsq = r_squared_adj
    return res

  

# general basis function generator, with limiting power for individual factors, and collective for terms
def Kgen(x,maxdeg_fac=3,maxdeg_term=5,pex=None,eqstr=False,prefac='maxfac',polar='none',polarix=0,known=None,knownstr=None,constterm=False,silent=False):
    if prefac=='maxfac':
        facfuc = lambda coll,maxx : np.math.factorial( maxx )
    elif prefac=='colfac':
        facfuc = lambda coll,maxx : np.math.factorial( coll )
    elif prefac=='max':
        facfuc = lambda coll,maxx : maxx
    elif prefac=='col':
        facfuc = lambda coll,maxx : coll
    elif prefac=='one':
        facfuc = lambda coll,maxx : 1
    else:
        raise Exception("Kgen: prefac not understood, must be 'maxfac', 'colfac', 'max', 'col', 'one', ...")
    Mx, N = x.shape
    if polar=='none':
        X_ = x.copy()
        feateqlist = ['*']
    elif polar=='basic':
        sinx = np.sin(x[[polarix],:])
        cosx = np.cos(x[[polarix],:])
        X_ = np.vstack( ( x[:polarix,:] , x[polarix+1:,:] , sinx , cosx ) ) # cut out angle is feature outside trigo functions if basic polar coordinates
        Mx -= 1
        pex = (  np.hstack( ( pex[0][:,:polarix] , pex[0][:,polarix+1:] ) )  ,  np.hstack( ( pex[1][:,:polarix] , pex[1][:,polarix+1:] ) )  )
        feateqlist = ['*','sin(*)','cos(*)'] # '*' is placeholder for the x-features, must not be used in knownstr etc
    elif polar=='basic_all':
        sinx = np.sin(x)
        cosx = np.cos(x)
        X_ = np.vstack( ( x , sinx , cosx ) )
        feateqlist = ['*','sin(*)','cos(*)']
    else:
        raise Exception("polar set not known, use 'none', 'basic', or 'basic_all'")
    if known is None:
        X = X_
        Mk = 0
    else:
        X = np.vstack( ( X_ , known ) )
        feateqlist += knownstr
        Mk = known.shape[0]
    MX = X.shape[0]
    K = [] # init
    pws = [] # to store all powers (as tuples)
    nex = ([],[]) # boolean True where pws matches powers given in pex
    if not pex is None:
        nex = ( np.full(len(pex[0]),np.nan) , np.full(len(pex[1]),np.nan) ) # to store indices corresponding to true terms in kernel matrix (correct order!)
    eqlist = []
    must = []
    mustnot = []
    nix = -1
    for n in it.product(range(maxdeg_fac+1),repeat=MX): # all combinations of powers up to maxdeg
        
        collpow = np.array(n).sum() # collective power of all factors in one term
        maxpow = np.array(n).max() # maximum power in one term
        if collpow<=maxdeg_term and (constterm or np.array(n).sum()>0): # only add to kernel matrix if collective power of term does not exceed specified max
            
            nix += 1 # current index
            
            ## build kernel matrix            
            nfac = facfuc( collpow , maxpow )
            K.append( ( 1/nfac * X**np.array(n)[:,None] ).prod(0) ) # raise to powers, multiply components, divide by factorial, append to K
            pws.append(n)
            
            ## check if basis function is part of groundtruth model (if known)
            if not pex is None:
                nex[0][np.where((pex[0]==n).all(1))[0]] = nix # numerator, if term n is not in pex, then nothing is stored
                if len(pex[1])>=1:
                    nex[1][np.where((pex[1]==n).all(1))[0]] = nix # same for denominator
               
            ## build latex expression for this term
            if eqstr:
                if nfac > 1:
                    pre = '1/' + str(nfac) + '*'
                else:
                    pre = ''
                eq = ''
                for ix,nn in enumerate(n):
                    featix = max(0,ix-Mx+1)
                    xix = ix % Mx + 1
                    if polar=='basic' and ix>=Mx:
                        xix = polarix # always chosen feature in normal 'basic'
                    if nn>0:
                        if nn==1:
                            eq += feateqlist[featix].replace('*','x_'+str(xix)) + ' '
                        elif nn>1:
                            eq += feateqlist[featix].replace('*','x_'+str(xix))+'^'+str(nn) + ' '
                eqlist.append(pre+eq[:-1])
    
            if Mk>=1 and (np.array(n)[-Mk]>=1).all(): # if terms are known to be part of true model and feature in this term ...
                must.append(True) # ... add True to indicate this term can be part of true model
            else:
                must.append(False) # ... otherwise False    
    
    if np.isnan(np.concatenate(nex)).any() and not silent:
        print("Warning: Given ground truth model not covered by dictionary -- nex incomplete")
    
    if pex is None:
        return np.vstack(K).T , np.array(eqlist) , np.array(must), np.array(mustnot), None , None
    else:
        return np.vstack(K).T , np.array(eqlist) , np.array(must), np.array(mustnot), (nex[0].astype(int), nex[1].astype(int)) , np.array(pws)


def cosel(sel,mat):
    """Spread the values of selected entries of `sel` to the entries linked to them in `mat`.

    For every index where `sel` is non-zero, the positions where the matching
    row of `mat` is non-zero receive the larger of that `sel` value and the
    value already accumulated at that index. Returns a new float array shaped
    like `sel`.
    """
    spread = np.zeros(sel.shape)
    for chosen in np.where(sel)[0]:
        linked = np.where(mat[chosen])[0]
        value = max( sel[chosen] , spread[chosen] )
        spread[linked] = value
    return spread


def calc_mi(x, y, bins):
    """Mutual information of two samples, estimated from their 2-D histogram with `bins` bins."""
    joint_counts = np.histogram2d(x, y, bins)[0]
    return mutual_info_score(None, None, contingency=joint_counts)
    
def calc_mimat(K,bins=np.nan,silent=False):
    """Symmetric matrix of pairwise mutual information between the columns of `K`.

    Parameters
    ----------
    K : 2-D array; every column is treated as one variable, rows as its samples
    bins : number of histogram bins; NaN (default) selects ``int(sqrt(number of rows))``
    silent : bool, disable the progress bar

    Returns
    -------
    (p, p) array with p the number of columns of `K`.
    """
    N, p = K.shape # rows are samples, columns are variables (was unpacked the other way round, contradicting the column indexing below)
    if np.isnan(bins):
        bins = int(np.sqrt(N))
    mi_mat = np.full((p,p),np.nan)
    for ix1 in tqdm(range(p),desc='mutual information matrix',disable=silent):
        for ix2 in range(ix1,p): # upper triangle including diagonal, mirrored below
            mi_mat[ix1,ix2] = calc_mi(K[:,ix1],K[:,ix2],bins)
            mi_mat[ix2,ix1] = mi_mat[ix1,ix2]
    return mi_mat
    
def calc_mivec(K,y,bins=np.nan):
    """Mutual information between every column of `K` and `y`.

    `bins` is the number of histogram bins; NaN (default) selects
    ``int(sqrt(number of rows of K))``. Returns a float vector with one entry
    per column of `K`.
    """
    n_rows, n_cols = K.shape
    if np.isnan(bins):
        bins = int(np.sqrt(n_rows))
    scores = ( calc_mi(K[:,col],y,bins) for col in range(n_cols) )
    return np.fromiter(scores, dtype=float, count=n_cols)

# unify equations for better comparison, assume true model as last equation
# each mdleq entry is array of equations even if sized one (to allow ensembles), below always the first is taken
# puts true terms first in same order, leaves weights as variables, cuts if more than nt_max terms, unifies notation
def uni_eq(mdleq,nt_max,translt={},compl=False):
    """Bring model equations into one common LaTeX form for side-by-side comparison.

    Parameters
    ----------
    mdleq : sequence of arrays of equations; only the first equation of each entry
        is used. The LAST entry is taken as the true model (read with ``.item()``).
    nt_max : int, maximum number of terms shown per equation; excess terms are
        replaced by a ``\\dots`` marker carrying the number of omitted terms.
    translt : dict mapping old variable names to new ones; applied to whole,
        space-delimited names only. The default dict is only read, never modified.
    compl : bool, also return the term counts.

    Returns
    -------
    algos : object array of strings, each equation wrapped in ``$...$``
    ntrue, nterms : (only if compl) per equation, the number of terms that also
        occur in the true model and the number of remaining terms.

    Notes
    -----
    Terms are obtained by splitting the LaTeX string at ' + '; the first
    whitespace-separated token of each term is dropped as its weight.
    """
    truemdl = mdleq[-1].item()
    truemdl_tex_ls = sy.latex(truemdl).split(' + ')
    truemdl_tex_no_w = [" ".join(e.split()[1:]) for e in truemdl_tex_ls] # remove weights w, as numbering can be different in learned models
    algos = np.empty(len(mdleq),dtype=object)
    ntrue = np.zeros(len(mdleq))
    nterms = np.zeros(len(mdleq))
    for ix,seleq in enumerate(mdleq):
        if isinstance( seleq[0] , str ): # if seleq[0] is a str, then it is not from chs (but from, e.g., pysindy)
            seleq_tex_ = sy.latex(seleq[0]).replace('\\textasciicircum',' ^') # turn the escaped circumflex back into a power sign
            if 'mathtt' in seleq_tex_ and 'text' in seleq_tex_: # str will be wrapped by \mathtt{} and \text{}
                # NOTE(review): fixed offsets 15 / -2 presumably strip that wrapper -- verify against the sympy version in use
                seleq_tex = seleq_tex_[15:-2] + ' ' # cut that out, and add final space for better translation of variable names below
            else:
                seleq_tex = seleq_tex_ # otherwise try and leave as is (might need other cases later)
        else:
            seleq_tex = sy.latex(seleq[0])
        for old, new in translt.items(): # apply dictionary to unify notation (in case features are named differently)
            seleq_tex = seleq_tex.replace(' '+old+' ',' '+new+' ') # add spaces to ensure only whole variables get replaced
        seleq_tex_ls = seleq_tex.split(' + ') # terms as list of strings
        seleq_tex_no_w = [" ".join(e.split()[1:]) for e in seleq_tex_ls] # remove weights w, as numbering can be different in learned models
        eq = []
        for t in truemdl_tex_no_w: # collect true terms first, in the order of the true model
            if t in seleq_tex_no_w:
                eq += [t]
                seleq_tex_no_w.remove(t) # what remains afterwards are the terms not in the true model
                ntrue[ix] += 1
        nt = len(seleq_tex_no_w) # number of terms that are not part of the true model
        if nt + len(eq) > nt_max: # if more than nt_max terms in total
            seleq_tex_no_w = seleq_tex_no_w[:nt_max-len(eq)] + ["\\dots\\;({0})".format(nt+len(eq)-nt_max)]
        eq += seleq_tex_no_w
        # eq_w = [ 'w_{'+str(i+1)+'} '+e if 'x' in e else e for i,e in enumerate(eq)]
        eq_w = [ 'w_{'+str(i+1)+'} '+e if not '(' in e else e for i,e in enumerate(eq)] # to also include constant, assume I don't need to check for x
        algos[ix] = '$' + " + ".join(eq_w) + '$'
        nterms[ix] = nt #+ ntrue[ix] # better to know how many extra terms have been selected, otherwise I also need to say how many terms true model has
    if compl:
        return algos, ntrue, nterms # ntrue: how many true terms have been found, nterms: how many other terms not part of true model have been found 
        # --> if ntrue[ix]=4 and nterms[ix]=0, and the true model has 4 terms, then the exact model has been found by method ix ...
        # ... ntrue[ix] + nterms[ix] is sum of terms found. 
    else:
        return algos


def plot_latex(algos,feats,tit=None):
    """Show a striped table figure: one row per entry of `algos` with the matching `feats` text below it.

    `algos` supplies the bold row captions (a colon is appended), `feats` the
    text drawn underneath, and `tit` an optional figure title.
    """
    background = [23/255,23/255,23/255,0]
    stripe_grey = (191./255., 209./256., 212./255.)
    row_count = len(algos)

    # figure with one axes spanning the unit square, tick labels hidden
    pp.figure(figsize=(6, 7),facecolor=background)
    pp.axes([0.01, 0.01, 0.98, 0.90], frameon=True, facecolor=background)
    pp.gca().set_xlim(0., 1.)
    pp.gca().set_ylim(0., 1.)
    pp.gca().set_xticklabels("", visible=False)
    pp.gca().set_yticklabels("", visible=False)

    # height of one row in axes coordinates
    row_height = (1. / (row_count))

    for row in range(row_count):
        upper = 1. - (row)*row_height
        lower = upper - row_height*1.
        caption = algos[row] + ":"
        stripe = stripe_grey if row % 2 else 'white' # alternate row colours
        pp.fill_between([0., 1.], [upper, upper],
                         [lower, lower],
                         color=stripe, alpha=0.5)
        pp.annotate(caption,
                     xy=(0.07, upper - 0.3*row_height),
                     xycoords='data', color='black', weight='bold')
        pp.annotate(feats[row],
                     xy=(0.05, upper - 0.75*row_height),
                     xycoords='data', color='black',
                     fontsize=16)
    if tit is not None:
        pp.title(tit)
    pp.show()
    
    

def export_latex(algos,feats,westis=None,nam="selected_features"):
    """Write the selected features as a LaTeX enumerate list to ``<nam>.tex`` and compile it with pdflatex.

    Parameters
    ----------
    algos : sequence of item labels
    feats : sequence of feature strings; every unescaped ``$`` is doubled to ``$$``
    westis : optional sequence; if given, one extra item labelled 'estimates' per entry
    nam : file name without extension

    Raises
    ------
    subprocess.CalledProcessError if pdflatex exits with a non-zero status.
    """
    filnam = nam+".tex"
    with open(filnam, "w") as fd: # closed even if writing one of the entries fails
        fd.write("\\documentclass{article}\n")
        fd.write("\\usepackage{amsmath, amssymb}\n")
        fd.write("\\begin{document}\n")
        fd.write("\\begin{enumerate}\n")
        for i, s0 in enumerate(algos):
            s1 = re.sub(r"(?<!\\)\$", "$$", feats[i])
            fd.write("\\item[%s] %s\n" % (s0,s1))
            if westis is not None:
                fd.write("\\item[%s] %s\n" % ('estimates',westis[i]))
        fd.write("\\end{enumerate}\n")
        fd.write("\\end{document}\n")
    check_call(['pdflatex', filnam], stdout=DEVNULL, stderr=STDOUT) # stdout to DEVNULL sends output into void (don't need it printed, only when error)


# find distribution matching data best, log-error for tail sensitivity, penalty on parameter number
def seldist(dat,ds,bins=42,dispen='npar'):
    """Pick the scipy.stats distribution that matches the histogram of `dat` best.

    The score of a candidate is the mean squared difference between the log of
    the histogram density and the log of the fitted pdf at the bin centres
    (empty bins are ignored), plus a penalty.

    Parameters
    ----------
    dat : data passed to ``numpy.histogram`` and to every candidate's ``fit``
    ds : iterable of scipy.stats distribution names
    bins : histogram bins
    dispen : penalty plan: a list (one penalty per candidate), 'npar' (number of
        fitted parameters), any other string (converted with ``float``), or None (no penalty)

    Returns
    -------
    (distribution, fitted parameters) of the candidate with the smallest score.
    Candidates whose score is NaN are skipped; previously a single NaN score
    made the selection fail with an IndexError.

    Raises
    ------
    Exception if `dispen` is of none of the forms above.
    """
    ds = np.array(list(ds))
    p, be = np.histogram(dat,bins=bins,density=True)
    b = ( (be[1:] + be[:-1]) / 2 ) # bin centres
    p[p==0] = np.nan # empty bins carry no information for the log-error
    logp = np.log(p)
    msle = np.full(ds.size,np.nan)
    fitpars = {}
    for ix,d in enumerate(ds):
        dist = getattr(sst, d)
        dpars = dist.fit(dat)
        if isinstance(dispen,list): # use penalties as given
            pen = dispen[ix]
        elif dispen=='npar': # use number of parameters
            pen = len(dpars)
        elif isinstance(dispen,str):
            pen = float(dispen)
        elif dispen is None:
            pen = 0
        else:
            raise Exception("Penalty plan for heuristic prior fitting not understood ('h_dispen')")
        msle[ix] = pen + np.nanmean( ( logp - np.log(dist.pdf(b,*list(dpars))) )**2 ) # mean-square-log-error for tail sensitivity
        fitpars[d] = dpars
    best = ds[np.nanargmin(msle)] # first candidate with the smallest non-NaN score
    return getattr(sst, best), fitpars[best]

# take a distribution, parameters for that distribution, apply a new scale (e.g. to broaden it), re-centre such that mode is still the same
def broadcentred(d,dpars,newscale):
    """Rescale a distribution to `newscale` while keeping its mode in place.

    `d` is a scipy.stats distribution and `dpars` its parameters with location
    and scale as the last two entries. Returns the frozen, rescaled and
    re-located distribution together with its numerically determined mode.
    """
    def _mode(frozen, start):
        # maximise the pdf numerically, starting the search at `start`
        return sop.minimize(lambda z : -frozen.pdf(z), start).x

    original = d(*dpars)
    widened = d(*dpars[:-1],scale=newscale) # same shape and location, new scale
    mode_original = _mode(original, original.stats('m')) # mean as starting point
    mode_widened = _mode(widened, widened.stats('m'))
    shifted_loc = dpars[-2] - ( mode_widened - mode_original ) # move location by the drift of the mode
    recentred = d(*dpars[:-2],loc=shifted_loc,scale=newscale)
    return recentred, _mode(recentred, mode_original)

# step detection, here only for sudden decrease in criteria, 
# other idea might be this (but doesn't really work here) https://stackoverflow.com/questions/48000663/step-detection-in-one-dimensional-data
def step_detect(x,fr=0.5):
    """Locate sudden decreases in `x`.

    A step is reported wherever the difference of neighbouring values is below
    `fr` times the smallest difference. Returns the indices of the samples
    right after each such step together with the size of the steps.
    """
    steps = np.diff(x)
    threshold = fr*steps.min()
    hits = np.where(steps < threshold)[0]
    return hits+1, steps[hits] # +1: diff() entry i belongs to the change from sample i to i+1


##################################################################################################################################################################################################


class modsel:

    from sklearn.utils._testing import ignore_warnings
    from sklearn.exceptions import ConvergenceWarning

    def __init__(self, X=np.array([[np.nan]]), y=np.array([np.nan]), njobs=1, fractions=True, frac=1.0, silent=False, darkmode=False):   
        """Store the data and initialise all bookkeeping attributes.

        Parameters
        ----------
        X : 2-D array; its shape is unpacked as (self.n, self.N), and the kernel
            matrix K is created with self.N rows
        y : response vector
        njobs : number of parallel jobs, stored as self.njobs
        fractions : bool; if True the feature dimension is doubled (self.fac_p = 2)
        frac : not used in this method
        silent : bool, suppress printed output
        darkmode : bool, switch matplotlib and sympy printing to a dark style
        """
        self.X = X
        self.y = y
        self.n, self.N = X.shape
        self.p = 1 # init value
        self.fractions = fractions
        self.fac_p = 1 + self.fractions # 2 if fractions are used, else 1
        self.K = np.full((self.N,self.fac_p*self.p),np.nan) # empty kernel matrix for methods like stdise to work on X before K generated
        self.njobs = njobs # use of parallel cpus
        self.chunks = 1000 # make that many chunks, good for many parallel tasks, maybe need more chunks for high-performance computing
        self.initdat()
        self.initrank()
        self.inittop()
        self.init_evipars()
        self.critunit = {}
        self.plotmode = {}
        self.terms = np.array(['']) # init list of terms (substituted later when candidate models are generated)
        self.exw = np.array([]) # init true weights (substituted later if ground truth known)
        self.exv = np.array([]) # init true weights also for denominator
        self.exix = (np.array([]),np.array([])) # init indices pointing to true weights (substituted later if ground truth known)
        self.exfeat = np.zeros(self.fac_p*self.p) # init empty bool vector for true features (1's put later where true features are if ground truth known)
        self.keepSM = {} # dictionary to store all ols criteria from statsmodel if one of them is computed (to avoid re-computing, since all are always computed)
        self.comat = np.zeros(self.K.shape) # initially no features are assumed to be correlated
        self.selmat = self.comat > 0.5 # ... will therefore all be False, and all features can be deactivated (call calc_comat and coselmat to use to prevent deactivation of connected features)
        self.comat_props = ('not yet calculated', -1., dict(vmin=-1., vmax=1., cmap='PuOr_r'))
        self.olscritnames = ['rsq', 'rsq-adj', 'aic', 'bic', 'f_pvalue'] # names for OLS criteria obtained from statsmodels OLD
        self.olscritunits = ['$R^2$', '$R_\\mathrm{adj}^2$', '$-\\mathrm{AIC}$', '$-\\mathrm{BIC}$', '$-\\ln P(F)$']
        self.isstdised = False # keep track of whether data has been standardised
        self.silent = silent
        if darkmode:
            pp.style.use('dark_background')
            sy.init_printing(use_latex=True,forecolor="White")    
            self.bgcol = [23/255,23/255,23/255,0]
            self.dark = True
        else:
            self.bgcol = [1,1,1,1]
            self.dark = False
        if not silent:
            print('\n')

    def initdat(self):
        """(Re)compute the summary statistics of the response `self.y`."""
        response = self.y
        variance = response.var()
        total_ss = self.N * variance
        self.yvar = variance
        self.SStot = total_ss
        self.logSStot = np.log( total_ss ) # needed for Rsq (now new formula with yssq assuming y centred to 0)
        self.yssq = response.T @ response # sum of squares of y (equals N*var(y) for centred y)

    def initrank(self):
        """Reset the feature rankings: empty rank table, no names, nothing active."""
        self.rank, self.ranknams = np.array([]), np.array([]) # empty rank table and empty list of rank names
        self.active_rank = self.active_rankno = None # neither name nor row index of an active rank

    def inittop(self,tops=1):
        """Reset the top-model listings to a single empty listing named 'main' with `tops` rows."""
        width = self.fac_p*self.p
        self.top = {'main': spa.csr_matrix((tops,width),dtype=bool)} # top model listings (sparse boolean rows)
        self.topcrit = {'main': np.full(tops,-np.inf)} # criteria of the listed models, -inf so any real value compares larger
        self.exfound_ix = {} # row index pointing to model that is correct model (if known)
        self.active_top = 'main' # name of active top model listings, that's where top models are kept

    def bfe(self,maxdeg=3, maxdeg_term=3, ker='Kgen', prefac='one', polar='none', polarix=0, known=None,knownstr=None, constterm=False, stdise='normal', pex=None, frac=1.0, verbose=False):
        """Basis function expansion: build the kernel matrix from `self.X` and reset derived state.

        The generator named by `ker` is looked up among the module globals and
        called with the expansion settings. The resulting kernel matrix is
        standardised via `self.stdise` and, when `self.fractions` is set,
        extended by the columns ``y * K``. Rankings, top listings and
        ground-truth placeholders are re-initialised afterwards.

        Returns ``(self.terms, nex)`` when the generator reports ground-truth
        indices `nex`, otherwise None.
        """
        build_kernel = globals()[ker] # getattr(chs,ker) does not work as chs is current module, globals has all of current module as dictionary
        generated = build_kernel(
            self.X , maxdeg_fac=maxdeg , maxdeg_term=maxdeg_term, pex=pex, eqstr=True,
            prefac=prefac, polar=polar, polarix=polarix, known=known, knownstr=knownstr,
            constterm=constterm, silent=self.silent,
        )
        self.K, self.terms, must, self.mustnot, nex, pws = generated
        self.p = self.K.shape[1] # total number of kernels
        width = self.fac_p*self.p
        self.must = np.tile(must,self.fac_p)
        self.feats = np.arange(width) # list of active features (basis functions) set to all initially
        # keep non-standardised copies (e.g. for fit of found model on original data)
        self.y_nonstand = self.y.copy()
        self.K_nonstand = self.K.copy()
        self.stdise(verbose=verbose,method=stdise,frac=frac)
        if self.fractions:
            self.K = np.hstack( ( self.K , self.y[:,None] * self.K ) ) # expanded Kernel matrix for implicit regression
        for reset in (self.initdat, self.initrank, self.inittop):
            reset()
        self.exw = np.full(self.p,np.nan)
        self.exv = np.full(self.p,np.nan)
        self.exfeat = np.zeros(width)
        # keep bfe parameters to reproduce K at any time
        self.bfepars = dict(maxdeg=maxdeg, maxdeg_term=maxdeg_term, ker=ker, prefac=prefac, polar=polar, polarix=polarix, known=known, knownstr=knownstr, stdise=stdise, pex=pex)
        if nex is None:
            return None
        return self.terms, nex

    def telltruth(self,trueterms,w):
        """Register the ground-truth model: which dictionary terms are true and their weights.

        Parameters
        ----------
        trueterms : sequence of string arrays; entry 0 holds the true numerator terms,
            entry 1 the true denominator terms (entry 1 is only read if `self.fractions`)
        w : sequence of weight sequences matching `trueterms`; a weight vector of
            length `self.p` is taken as already ordered like `self.terms`

        Sets `self.exix`, `self.exfeat`, `self.exw` and `self.exv`. Without
        fractions the denominator is fixed to the first term with weight one.
        A warning is printed (unless `self.silent`) if some of the given terms
        are not contained in `self.terms`.
        """
        def _locate(kind, names):
            # indices of the dictionary terms contained in `names`, warn if some names are missing
            found = np.isin(self.terms, names).nonzero()[0]
            if names.size > found.size and not self.silent:
                print("Warning, not all given true {0} terms have been found in dictionary ({1:d} out of {2:d})".format(kind,found.size,names.size))
            return found

        def _weights(names, values):
            # weight vector over all dictionary terms
            if len(values) == self.p: # assume values are already in correct shape and order
                return np.array([values]).flatten()
            placed = np.zeros(self.p)
            for ix in range(len(values)): # go through weights
                placed[ np.where(names[ix]==self.terms)[0] ] = values[ix] # save weight where true term matches kernel term
            return placed

        if self.fractions:
            self.exix = ( _locate('numerator', trueterms[0]) , _locate('denominator', trueterms[1]) )
            self.exfeat[       self.exix[0]] = 1
            self.exfeat[self.p+self.exix[1]] = 1 # denominator features live in the second half
            self.exw = _weights(trueterms[0], w[0])
            self.exv = _weights(trueterms[1], w[1])
        else:
            self.exv = np.zeros(self.p)
            self.exv[0] = 1 # the first term must be present in the denominator with weight one
            # the warning now reports the number of numerator terms (it used to read trueterms[1])
            self.exix = ( _locate('numerator', trueterms[0]) , np.array([0]) )
            self.exfeat[       self.exix[0]] = 1
            self.exw = _weights(trueterms[0], w[0])

    def stdise(self,verbose=False,method='normal',frac=1.0):
        """Standardise the kernel matrix `self.K` and the response `self.y`.

        Parameters
        ----------
        verbose : bool, print a short notice afterwards
        method : str
            'normal'   - every column of K and y to zero mean and unit standard deviation
            'normglob' - same, but with the global mean and standard deviation of K
            'centre'   - only subtract the column means (and the mean of y)
            'centglob' - only subtract the global mean of K (and the mean of y)
            'unit'     - every column of K to the unit interval, y as in 'normal'
            'none'     - leave the data untouched
            Any other value also leaves the data untouched.
        frac : not used in this method

        Constant columns of K (e.g. a constant term) and a constant y are left
        unchanged by 'normal', 'centre' and 'unit'.

        Nothing happens (apart from a notice unless `self.silent`) when
        `self.isstdised` is already set.

        Note: 'centre' and 'unit' used to be swallowed by the branch shared with
        'normal' and therefore silently behaved like 'normal'; they now do what
        is described above.
        """
        if self.isstdised:
            if not self.silent:
                print("Data has already been standardised. Reload data and apply basis function expansion (bfe).")
            return

        if method in ['normal','centre','unit']: # column-wise statistics shared by these methods
            Kmeanvec = self.K.mean(0)
            Kstdvec = self.K.std(0)
            zK = np.isclose(Kstdvec,0) # index where std(K) is numerically zero (e.g. for constant term)
            Kmeanvec[zK] = 0 # mean zero to leave this K invariant (otherwise problems later with Rsq etc)
            Kstdvec[zK] = 1  # st.d. one to leave this K invariant
            ymeanvec = self.y.mean(0)
            ystdvec = self.y.std(0)
            if np.isclose(ystdvec,0): # same for std(y)
                ymeanvec = 0
                ystdvec = 1

        if method=='normal': # K to standard normal distribution
            self.K = (self.K - Kmeanvec) / Kstdvec
            self.y = (self.y - ymeanvec) / ystdvec
            # NOTE(review): isstdised has never been set for 'normal'; kept that way because
            # bfe() calls stdise() on every freshly generated K -- confirm whether that is intended.
        elif method=='normglob': # K to standard normal distribution, but global rescaling
            self.K = (self.K - self.K.mean()) / self.K.std()
            self.y = (self.y - self.y.mean()) / self.y.std()
            self.isstdised = True
        elif method=='centre': # just centre to zero mean, no re-scaling
            self.K = (self.K - Kmeanvec)
            self.y = (self.y - ymeanvec)
            self.isstdised = True
        elif method=='centglob': # centre to zero mean, no re-scaling, but use global mean for centring
            self.K = (self.K - self.K.mean())
            self.y = (self.y - self.y.mean())
            self.isstdised = True
        elif method=='unit': # K to unit interval, maybe better for LASSO
            Kminvec = self.K.min(0)
            Krangevec = self.K.max(0) - Kminvec
            flat = Krangevec==0 # constant columns would divide by zero ...
            Kminvec[flat] = 0   # ... so they are left invariant, as for 'normal'
            Krangevec[flat] = 1
            self.K = (self.K - Kminvec) / Krangevec
            self.y = (self.y - ymeanvec) / ystdvec
            self.isstdised = True
        elif method=='none':
            pass
        self.initdat() # statistics of y depend on the (possibly rescaled) data
        if verbose: # can later write summaries into attributes than can be displayed by user, also giving option to label columns for dataframe
            print("Response variable and kernel space features is standardised:")
            print('\n')

    def newrank(self,nam): # generate a new ranking, and give it a name, and make active
        """Append an all-zero ranking row called `nam` to the rank table and activate it."""
        width = self.fac_p*self.p
        is_first = self.rank.shape == (0,)
        if is_first:
            self.rank = np.zeros((1,width)) # the table starts with this row
        else:
            self.rank = np.vstack((self.rank,np.zeros(width))) # fresh row below the existing ones
        self.ranknams = np.append(self.ranknams,nam) # names are kept in row order
        self.active_rank = nam # active ranking by name ...
        self.active_rankno = self.ranknams.size-1 # ... and by row number (the last row)

    def actrank(self,nam):
        """Activate an existing ranking, addressed by name (str) or by row number.

        Numbers may be Python or numpy integers/floats and are converted with
        ``int``. A float used to be applied directly as an index (raising an
        IndexError) and numpy integers were ignored. Arguments of any other
        type leave the active ranking unchanged.
        """
        if isinstance(nam, str):
            self.active_rank = nam # make active (name)
            self.active_rankno = np.where(self.ranknams==nam)[0][0] # make active (row number)
        elif isinstance(nam, (int, float, np.integer, np.floating)):
            rankno = int(nam)
            name = self.ranknams[rankno] # look up first, so nothing changes if the row does not exist
            self.active_rankno = rankno # make active (row number)
            self.active_rank = name # make active (name)

    def newtop(self,nam,ntop=np.nan,critunit=None,plotmode=''): # new top model listing that keeps the best ntop models
        """Create an empty top-model listing `nam` and make it the active one.

        `ntop` is the number of models kept (rounded to an integer); NaN takes
        over the size of the currently active listing. `critunit` and
        `plotmode` are stored for the listing; without `critunit` both are set
        to an empty string.
        """
        if np.isnan(ntop):
            rows = self.top[self.active_top].shape[0] # same size as the active listing
        else:
            rows = int(np.round(ntop))
        self.top[nam] = spa.lil_matrix((rows,self.fac_p*self.p),dtype=bool) # sparse boolean rows, one per kept model
        self.topcrit[nam] = np.full(rows,-np.inf) # -inf so any real criterion compares larger
        unit_given = critunit is not None
        self.critunit[nam] = critunit if unit_given else '' # empty string as default to prevent later errors from a missing unit
        self.plotmode[nam] = plotmode if unit_given else ''
        self.active_top = nam

    def acttop(self,nam):
        """Make the top-model listing stored under `nam` the active one.

        Only the name is recorded in ``self.active_top``; no check is made
        that a listing of that name exists.
        """
        self.active_top = nam

    @ignore_warnings(category=ConvergenceWarning)
    def lasso(self,boots=1,bootsize=0.2,nam=None,cv=5):
        """Score features with cross-validated lasso regression (``LassoCV``).

        If `nam` is a string, the ranking of that name is activated (or created
        first when the name is unknown); results go into the active ranking row.

        With ``self.fractions`` set, each active feature is regressed in turn on
        all other active features. The absolute coefficients of every fit are
        weighted by that fit's ``-ln(1-R^2)`` relative to the largest such value
        and summed per feature; the sums replace the scores of the active features.

        Otherwise ``self.y`` is regressed on ``self.K`` for `boots` random
        subsamples drawn without replacement, and the count of every column with
        a non-zero coefficient is incremented.

        Parameters
        ----------
        boots : int
            Number of subsample fits (non-fractions mode only).
        bootsize : float
            Proportion of the ``self.N`` data points used per subsample.
        nam : str, optional
            Name of the ranking to write to.
        cv : int
            Number of cross-validation splits; capped at the subsample size in
            non-fractions mode.
        """
        if isinstance(nam,str):
            already_known = len(self.ranknams)>0 and nam in self.ranknams
            if already_known:
                self.actrank(nam)
            else:
                self.newrank(nam)
        max_iter = 1000 # maximum number of iterations of regression
        tolerance = 1e-4 # tolerance
        n_active = self.feats.size
        if not self.fractions:
            n_sub = int(np.round(bootsize*self.N))
            cv = min(cv,n_sub) # cannot have number of splits greater than number of samples
            for _ in tqdm(range(boots),desc='featsel: '+self.active_rank,leave=True,disable=self.silent):
                model = lm.LassoCV(fit_intercept=False,max_iter=max_iter,tol=tolerance,cv=cv,n_jobs=self.njobs)
                rows = np.random.choice(np.arange(self.N),n_sub,replace=False)
                model.fit(self.K[rows,:], self.y[rows])
                picked = np.where(model.coef_ != 0)[0]
                self.rank[self.active_rankno,picked] += 1 # count how often each column is selected
            return
        lnrsq = np.full(n_active,np.nan)
        coefs = np.zeros((n_active,n_active))
        for target in tqdm(range(n_active),desc='lasso: ',disable=self.silent):
            others = np.delete(np.arange(self.feats.size),target) # every active feature except the current response
            y_t = self.K[ : , self.feats[target] ]
            K_t = self.K[ : , self.feats[others] ]
            model = lm.LassoCV(fit_intercept=False,max_iter=max_iter,tol=tolerance,cv=cv,n_jobs=self.njobs)
            model.fit(K_t, y_t)
            lnrsq[target] = -np.log( 1 - model.score(K_t, y_t) )
            coefs[target,others] = model.coef_ # keep actual coefficient values rather than selection frequencies
        weights = lnrsq[:,None]/lnrsq.max()
        self.rank[self.active_rankno,self.feats] = ( weights * np.abs( coefs ) ).sum(0)

    @ignore_warnings(category=ConvergenceWarning)
    def lars(self,boots=1,bootsize=0.2,nam=None,cv=5):
        """Score features with cross-validated least-angle regression (``LarsCV``).

        Mirrors the structure of the lasso-based selection but fits ``LarsCV``
        models. If `nam` is a string, the ranking of that name is activated or
        created; results are written to the active ranking row.

        In fractions mode every active feature is regressed on the remaining
        active features, and the absolute coefficients, weighted by each fit's
        ``-ln(1-R^2)`` divided by the largest such value, are summed per feature.
        Otherwise ``self.y`` is fitted on `boots` random subsamples of the rows
        of ``self.K`` and the columns with non-zero coefficient are counted.

        Parameters
        ----------
        boots : int
            Number of subsample fits (non-fractions mode only).
        bootsize : float
            Proportion of the ``self.N`` data points used per subsample.
        nam : str, optional
            Name of the ranking to write to.
        cv : int
            Number of cross-validation splits; capped at the subsample size in
            non-fractions mode.
        """
        if isinstance(nam,str):
            if len(self.ranknams)>0 and nam in self.ranknams:
                self.actrank(nam)
            else:
                self.newrank(nam)
        iter_cap = 1000 # maximum number of iterations of regression
        n_feats = self.feats.size
        if self.fractions:
            fit_weight = np.full(n_feats,np.nan)
            coef_table = np.zeros((n_feats,n_feats))
            all_pos = np.arange(self.feats.size)
            for pos in tqdm(range(n_feats),desc='lasso: ',disable=self.silent):
                rest = np.delete(all_pos,pos) # positions of all active features but the current one
                response = self.K[ : , self.feats[pos] ]
                design = self.K[ : , self.feats[rest] ]
                reg = lm.LarsCV(fit_intercept=False,max_iter=iter_cap,cv=cv,n_jobs=self.njobs)
                reg.fit(design, response)
                fit_weight[pos] = -np.log( 1 - reg.score(design, response) )
                coef_table[pos,rest] = reg.coef_ # actual values instead of selection frequencies
            scaled = fit_weight[:,None]/fit_weight.max() * np.abs( coef_table )
            self.rank[self.active_rankno,self.feats] = scaled.sum(0)
        else:
            subsize = int(np.round(bootsize*self.N))
            cv = min(cv,subsize) # cannot have number of splits greater than number of samples
            for _ in tqdm(range(boots),desc='featsel: '+self.active_rank,leave=True,disable=self.silent):
                reg = lm.LarsCV(fit_intercept=False,max_iter=iter_cap,cv=cv,n_jobs=self.njobs)
                take = np.random.choice(np.arange(self.N),subsize,replace=False)
                reg.fit(self.K[take,:], self.y[take])
                nonzero = np.where(reg.coef_ != 0)[0]
                self.rank[self.active_rankno,nonzero] += 1

    @ignore_warnings(category=ConvergenceWarning)
    def lassolars(self,boots=1,bootsize=0.2,nam=None,cv=5):
        """Score features with lasso-LARS models chosen by an information criterion.

        Fits ``LassoLarsIC`` models (criterion ``'aic'``). If `nam` is a string,
        the ranking of that name is activated or created first; results are
        written to the active ranking row.

        In fractions mode every active feature is regressed on the remaining
        active features, and the absolute coefficients, weighted by each fit's
        ``-ln(1-R^2)`` divided by the largest such value, are summed per feature.
        Otherwise ``self.y`` is fitted on `boots` random subsamples of the rows
        of ``self.K`` (drawn without replacement) and the columns with non-zero
        coefficient are counted.

        Parameters
        ----------
        boots : int
            Number of subsample fits (non-fractions mode only).
        bootsize : float
            Proportion of the ``self.N`` data points used per subsample.
        nam : str, optional
            Name of the ranking to write to.
        cv : int
            Unused. ``LassoLarsIC`` selects the regularisation by information
            criterion and takes no cross-validation argument; the parameter is
            kept so that the signature matches the sibling selection methods.
        """
        if isinstance(nam,str):
            if len(self.ranknams)>0 and nam in self.ranknams:
                self.actrank(nam)
            else:
                self.newrank(nam)
        nit = 1000 # maximum number of iterations of regression
        criterion = 'aic'
        pact = self.feats.size
        if self.fractions:
            lnrsq = np.full(pact,np.nan)
            selmat = np.full((pact,pact),0.)
            for nnix,nn in enumerate(tqdm(range(pact),desc='lasso: ',disable=self.silent)):
                Kix = np.concatenate( ( np.arange(nn) , np.arange(nn+1,self.feats.size) ) )
                ynow = self.K[ : , self.feats[nn] ]
                Know = self.K[ : , self.feats[Kix] ]
                # LassoLarsIC has no `cv` argument; passing one raised a TypeError here
                sparsereg = lm.LassoLarsIC(criterion=criterion,fit_intercept=False,max_iter=nit)
                sparsereg.fit(Know, ynow)
                lnrsq[nnix] = -np.log( 1 - sparsereg.score(Know, ynow) )
                selmat[nnix,Kix] = sparsereg.coef_ # sum up actual values instead of selection frequencies (more flexible feature selection)
            self.rank[self.active_rankno,self.feats] = ( lnrsq[:,None]/lnrsq.max() * np.abs( selmat ) ).sum(0)
        else:
            nboot = int(np.round(bootsize*self.N))
            for i in tqdm(range(boots),desc='featsel: '+self.active_rank,leave=True,disable=self.silent):
                # same estimator settings as in the fractions branch: explicit criterion, boolean fit_intercept
                sparsereg = lm.LassoLarsIC(criterion=criterion,fit_intercept=False,max_iter=nit)
                bootix = np.random.choice(np.arange(self.N),nboot,replace=False)
                sparsereg.fit(self.K[bootix,:], self.y[bootix])
                sel = np.where(sparsereg.coef_ != 0)[0]
                self.rank[self.active_rankno,sel] += 1

    @ignore_warnings(category=ConvergenceWarning)
    def Rsq(self,nn):
        """Return a log-R^2 based score for the model made of the active features `nn`.

        Parameters
        ----------
        nn : array-like of int
            Positions into ``self.feats`` selecting the terms of the model.

        Returns
        -------
        float
            * Without fractions: ``-ln(1 - R^2)``, with the explained sum of
              squares ``y'K (K'K)^-1 K'y`` divided by ``self.yssq``.
            * With fractions (and at least one term with index below ``self.p``
              and one at or above it, excluding the pair ``[0, self.p]``): one
              term is used as response (the first, or the second if the first
              is term 0) and the value is
              ``ln(SS_tot - y'y + y'K (K'K)^-1 K'y)`` for that response.
            * ``-inf`` if the model does not qualify or if evaluating it raises
              an exception (e.g. a singular normal matrix).

        Notes
        -----
        NumPy's ``invalid`` and ``divide`` floating-point warnings are switched
        off globally by this call (via ``np.seterr``) and are not restored.
        """
        np.seterr(invalid='ignore',divide='ignore') # global setting; restore with np.seterr(**errset) where errset is the output of np.seterr()
        try:
            nn_ = np.array(nn)
            Nnum = (self.feats[nn_]<self.p).sum()
            Nden = (self.feats[nn_]>=self.p).sum()
            if self.fractions and Nnum>=1 and Nden>=1 and not ( self.feats[nn_].size==2 and (self.feats[nn_]==[0,self.p]).all() ): # at least one term in numerator and denominator are non-zero each
                if self.feats[nn_][0]>=1: # if first term is not constant, use that as response variable
                    ynow = self.K[ : , self.feats[nn_][0 ] ]
                    Know = self.K[ : , self.feats[nn_][1:] ]
                elif self.feats[nn_][0]==0: # if it is constant, use second as response, also works if that term is from the denominator
                    ynow = self.K[ : , self.feats[nn_][1 ] ]
                    Know = np.hstack( ( self.K[ : , self.feats[nn_][[0] ] ] , self.K[ : , self.feats[nn_][2:] ] ) )

                KTy = Know.T @ ynow
                rsq = KTy.T @ np.linalg.inv( Know.T @ Know ) @ KTy
                lnrsq = np.log( (ynow-ynow.mean()).T @ (ynow-ynow.mean()) - ynow.T @ ynow  +  rsq  ) # Rsq factor, i.e. in reference to a reference model, where Rsq of reference model is arbitrary and constant and can be chucked out
            elif not self.fractions:
                Know = self.K[ : , self.feats[nn_]]
                KTy = Know.T @ self.y
                rsq = KTy.T @ np.linalg.inv( Know.T @ Know ) @ KTy
                lnrsq = - np.log( 1 - rsq / self.yssq ) # simplified if y centred to zero mean
            else: # if numerator zero, return -inf (Rsq=0)
                lnrsq = - np.inf
        except Exception: # e.g. singular matrices (inversion): the model is dropped; KeyboardInterrupt/SystemExit still propagate
            lnrsq = - np.inf
        return lnrsq
    
    def swsel(self,crit='bic',nam=None,maxiter=10): # not yet programmed for fractions
        """Stepwise (forward/backward) feature selection on the first ``self.p`` features.

        Starting from an empty model, forward steps switch on the feature that
        gives the best criterion value, and backward steps switch off the one
        whose removal gives the best value. Each direction is repeated until a
        step leaves the selection unchanged; both directions are alternated
        until neither changes anything or `maxiter` rounds have been done. The
        selected features are marked with 1 in the active ranking row.

        While the selection runs, ``self.feats`` is temporarily replaced by the
        full index range ``arange(fac_p*p)``; the previous value is restored on
        exit, including when an error is raised.

        Parameters
        ----------
        crit : str
            ``'evi'`` to score models with ``self.evi``, otherwise the name of an
            attribute of the statsmodels OLS result: ``'aic'``, ``'bic'``,
            ``'f_pvalue'`` (smaller is better) or ``'llf'``, ``'rsquared'``,
            ``'rsquared_adj'`` (larger is better).
        nam : str, optional
            Name of the ranking to write to; activated, or created if unknown.
        maxiter : int
            Maximum number of forward/backward rounds.
        """
        feats_copy = self.feats.copy() # copy list of active features to restore afterwards
        self.feats = np.arange(self.fac_p*self.p) # go through all features here (deactivation of features ignored)
        try:
            if isinstance(nam,str):
                if len(self.ranknams)>0 and nam in self.ranknams:
                    self.actrank(nam)
                else:
                    self.newrank(nam)
            # sign that turns every criterion into "larger is better"
            critsign = {'aic':-1, 'bic':-1, 'f_pvalue':-1, 'llf':1, 'rsquared':1, 'rsquared_adj':1}

            def score(mask): # criterion value of the model given by boolean mask
                if crit=='evi':
                    return self.evi(mask,np.nan)[0]
                res = sm.OLS(self.y,self.K[:,mask]).fit()
                return critsign[crit] * getattr(res,crit)

            def step(current,state): # one forward (state=True) or backward (state=False) step
                critvec = np.full(self.p,fill_value=-np.inf)
                for i in range(self.p):
                    trial = current.copy()
                    trial[i] = state # features already in that state reproduce the current model
                    critvec[i] = score(trial)
                updated = current.copy()
                updated[np.argmax(critvec)] = state # unchanged if the current model is already the best
                return updated

            selvec = np.zeros(self.p,dtype=bool)
            change = True
            niter = 0
            # forward-backward-selection loop until nothing changes
            while change and niter<maxiter:
                niter += 1
                change = False # set to True if forward or backward selection below changes anything
                while selvec.sum()<self.p:
                    candidate = step(selvec,True)
                    if candidate.sum() == selvec.sum():
                        break
                    selvec = candidate
                    change = True
                while selvec.sum()>1: # always keep at least one feature
                    candidate = step(selvec,False)
                    if candidate.sum() == selvec.sum():
                        break
                    selvec = candidate
                    change = True
            self.rank[self.active_rankno,selvec] = 1
        finally:
            self.feats = feats_copy # restore list of active features, also if the selection failed
        
    def ols(self,nn,Know=None,ynow=None):
        """Fit an ordinary least squares model and return its parameter vector.

        If both `Know` and `ynow` are given, copies of them are used as design
        matrix and response. Otherwise the non-standardised data
        (``self.K_nonstand``, ``self.y_nonstand``) are used:

        * without fractions, `nn` is treated as a mask and the columns where it
          is non-zero form the design matrix, with ``y_nonstand`` as response;
        * with fractions, the kernel matrix is expanded to ``[K, y*K]`` and `nn`
          is used as an index array into it: the first term is the response, or
          the second one if the first term is term 0.

        Returns
        -------
        The ``params`` of the fitted statsmodels OLS result.
        """
        nn_ = np.where(np.array(nn))[0] ### ols only for final parameter fit, nn assumed to be after feats (i.e. on original feature space)
        if Know is not None and ynow is not None:
            # work on copies to prevent simultaneous manipulation of the inputs outside this scope
            response = ynow.copy()
            design = Know.copy()
        else:
            y_raw = self.y_nonstand.copy()
            K_raw = self.K_nonstand.copy()
            if not self.fractions:
                response = y_raw.copy()
                design = K_raw[ : , nn_ ]
            else:
                # expanded kernel matrix for implicit regression (non-standardised are saved not expanded)
                K_full = np.hstack( ( K_raw , y_raw[:,None] * K_raw ) )
                if nn[0]>=1: # first term is not the constant: it becomes the response
                    response = K_full[ : , nn[0 ] ]
                    design = K_full[ : , nn[1:] ]
                elif nn[0]==0: # constant first: second term is the response, constant stays in the design
                    response = K_full[ : , nn[1 ] ]
                    design = np.hstack( ( K_full[ : , nn[[0] ] ] , K_full[ : , nn[2:] ] ) )
        fitted = sm.OLS(response,design).fit()
        return fitted.params
    
    def olscrits(self,nn):
        """Fit the model given by `nn` with OLS and return several selection criteria.

        Parameters
        ----------
        nn : array-like of int
            Positions into ``self.feats`` selecting the terms of the model
            (array, list or tuple).

        Returns
        -------
        list of float
            ``[rsquared, rsquared_adj, -aic, -bic, -ln(f_pvalue)]`` of the
            statsmodels OLS fit, i.e. all oriented so that larger is better.
            In fractions mode a model without any term of index below
            ``self.p`` gets ``-inf`` for every criterion.
        """
        nn_ = np.array(nn) # always index with an array: a tuple would be read as a multi-dimensional index
        if self.fractions:
            if not (self.feats[nn_]<self.p).any(): # no term in numerator: reject model
                return [-np.inf, -np.inf, -np.inf, -np.inf, -np.inf]
            terms = self.feats[nn_]
            if terms[0]>=1: # if first term is not constant, use that as response variable
                ynow = self.K[ : , terms[0 ] ]
                Know = self.K[ : , terms[1:] ]
            elif terms[0]==0: # if it is constant, use second as response, also works if that term is from the denominator
                ynow = self.K[ : , terms[1 ] ]
                Know = np.hstack( ( self.K[ : , terms[[0] ] ] , self.K[ : , terms[2:] ] ) )
        else:
            ynow = self.y
            Know = self.K[ : , self.feats[nn_] ]
        res = sm.OLS(ynow,Know).fit()
        return [res.rsquared, res.rsquared_adj, -res.aic, -res.bic, -np.log(res.f_pvalue)]
        
    def init_evipars(self):
        """Reset ``self.evipars`` to the default settings of the evidence estimation.

        Keys starting with ``h_`` and the key ``heur`` configure the heuristic
        prior; ``eta``, ``nu``, ``m`` and ``v`` are the fixed prior parameters.
        """
        defaults = {}
        # fixed prior parameters
        defaults['eta'] = 0.
        defaults['nu'] = 1.
        defaults['m'] = 1.
        defaults['v'] = 1.
        # estimator settings
        defaults['T'] = 2
        defaults['dt'] = 0.001
        defaults['L'] = 100
        defaults['reps'] = 1
        defaults['bound'] = 'metr'
        defaults['tol'] = 0.1
        defaults['Nix'] = 5
        defaults['conf'] = 0.95
        defaults['ntol'] = 2
        defaults['maxruns'] = 42
        defaults['plotit'] = False
        defaults['silent'] = True
        defaults['usana'] = True
        defaults['fixedtau'] = False
        # heuristic prior settings
        defaults['heur'] = True
        defaults['h_norma'] = True
        defaults['h_scalfac'] = 1
        defaults['h_unitscal'] = False
        defaults['h_dist'] = 'norm'
        defaults['h_dispen'] = 'npar'
        defaults['h_rboot'] = 0.1
        defaults['h_nboot'] = None
        defaults['h_plot'] = False
        self.evipars = defaults
        
    def eviana1(self,nn,eta,nu,m,v,fixedtau=False):
        """Closed-form log model evidence per data point for a Gaussian weight prior.

        Parameters
        ----------
        nn : array-like of int
            Positions into ``self.feats`` selecting the terms of the model.
        eta : 1-D array
            Prior means of the weights.
        nu : 1-D array
            Prior precisions of the weights (diagonal of the prior precision matrix).
        m, v : 1-D arrays
            Parameters of the noise-precision prior. With `fixedtau`, ``m[0]``
            is used directly as the noise precision and `v` is ignored;
            otherwise the gamma prior has shape ``m/v + 1`` and scale ``v``.
        fixedtau : bool
            Use a fixed noise precision instead of the gamma prior.

        Returns
        -------
        float or array
            The evidence value, or ``-inf`` if the model contains no term with
            index below ``self.p``.

        Notes
        -----
        Both branches use ``slogdet`` for the log-determinants, so large or
        badly scaled matrices do not overflow as ``log(det(...))`` would. The
        sign returned by ``slogdet`` is ignored, as the determinants are
        expected to be positive.
        """
        N = self.N
        nn_ = np.array(nn)
        if (self.feats[nn_]<self.p).any(): # at least one term in numerator non-zero (actually redundant here, already filtered in evi)
            if self.fractions:
                if self.feats[nn_][0]>=1: # if first term is not constant, use that as response variable
                    y = self.K[ : , self.feats[nn_][0 ] ]
                    K = self.K[ : , self.feats[nn_][1:] ]
                elif self.feats[nn_][0]==0: # if it is constant, use second as response, also works if that term is from the denominator
                    y = self.K[ : , self.feats[nn_][1 ] ]
                    K = np.hstack( ( self.K[ : , self.feats[nn_][[0] ] ] , self.K[ : , self.feats[nn_][2:] ] ) )
            else:
                y = self.y.copy()
                K = self.K[:,self.feats[nn_]]
            M = np.diag(nu) # precision matrix for prior
            if fixedtau: # for fixed tau (limit of zero-width gamma prior distribution for tau)
                tau = m[0]
                A = tau * K.T @ K + M
                b = tau * K.T @ y + M.T @ eta
                logdetM = np.linalg.slogdet(M)[1] # log-determinant directly, avoids overflow of det()
                logdetA = np.linalg.slogdet(A)[1] # same here
                Q = b.T @ np.linalg.inv(A) @ b
                evi = 1/2 * ( np.log( tau ) - np.log( 2*np.pi ) )  +  1/(2*N) * ( logdetM - logdetA - tau * y.T @ y - eta.T @ M @ eta + Q )
            else:
                A = K.T @ K + M
                b = K.T @ y + M @ eta
                Q = b.T @ np.linalg.inv(A) @ b
                logdetM = np.linalg.slogdet(M)[1] # first returned value is sign, but here required to be positive anyway
                logdetA = np.linalg.slogdet(A)[1] # same here
                k = m/v + 1 # resulting shape parameter of gamma distribution, fixes mode to m (expect ~1 due to tau~yprec)
                th = v # use v as scale parameter (equations to get from variance are a bit messy)
                z = y.T @ y + eta.T @ M @ eta - Q
                evi = 1/(2*N) * ( logdetM - logdetA ) - 1/2*np.log(2*np.pi) - (1/2+k/N)*np.log(z/2+1/th) - k/N*np.log(th) + 1/N*ssp.gammaln(N/2+k) - 1/N*ssp.gammaln(k)
        else:
            evi = -np.inf
        return evi
    
    def evi(self,nn,Rs):
        """Estimate the model evidence of the model made of the active features `nn`.

        The prior parameters of the weights are either taken from
        ``self.evipars`` (``eta``, ``nu``) or, with ``evipars['heur']`` set,
        derived from the data: from an OLS fit when ``h_norma`` is set,
        otherwise from distributions fitted to OLS weights of random subsamples.
        The evidence itself is computed by ``self.eviana1`` from the prior
        means and precisions; the fitted prior distributions and their
        log-gradients are prepared here but not passed on to ``eviana1``.

        Parameters
        ----------
        nn : array-like of int
            Positions into ``self.feats`` selecting the terms of the model.
        Rs :
            Passed through unchanged as third return value.

        Returns
        -------
        tuple
            ``(evidence, np.zeros(2), Rs)``; the evidence is ``-inf`` if the
            model has no term with index below ``self.p``.
        """
        nn_ = np.array(nn)
        if not (self.feats[nn_]<self.p).any(): # no term in numerator: reject model
            return -np.inf, np.zeros(2), Rs
        if self.fractions:
            if self.feats[nn_][0]>=1: # if first term is not constant, use that as response variable
                ynow = self.K[ : , self.feats[nn_][0 ] ]
                Know = self.K[ : , self.feats[nn_][1:] ]
            elif self.feats[nn_][0]==0: # if it is constant, use second as response, also works if that term is from the denominator
                ynow = self.K[ : , self.feats[nn_][1 ] ]
                Know = np.hstack( ( self.K[ : , self.feats[nn_][[0] ] ] , self.K[ : , self.feats[nn_][2:] ] ) )
        else:
            ynow = self.y.copy()
            Know = self.K[:,self.feats[nn_]]
        p = Know.shape[1] # dimension of feature space for this model (number of kernel functions in model) -- this is not self.p = the number of basis functions considered
        N = self.N
        K4me = Know.T[:,None,:] # evidence estimator expects shape (p,M,N), p features per M dependent equations (p*M total features), N data-points
        y4me = ynow[None,:] # evidence estimator expects shape (M,N)
        # prior parameters
        fixedtau = self.evipars['fixedtau']
        heur = self.evipars['heur'] # heuristic Bayes instead of fixed prior parameters below?
        h_norma = self.evipars['h_norma'] # assume normality and use known mean and variance from OLS? if not, use bootstrapping
        h_dist = self.evipars['h_dist'] # define distribution to be fitted from bootstrap data. if None, best distribution is selected.
        h_dispen = self.evipars['h_dispen'] # penalty plan for fitting prior from data, 'npar' uses number of distribution parameters, list gives penalties explicitly
        h_scalfac = self.evipars['h_scalfac'] # factor by which scale of prior is multiplied for broadening (re-centred after)
        h_unitscal = self.evipars['h_unitscal'] # if True, unify scales for each model to at least st.d. = h_scalfac
        h_rboot = self.evipars['h_rboot'] # ratio of datapoints used for each bootstrap
        h_plot = self.evipars['h_plot'] # plot fitted distributions to bootstrap data
        if self.evipars['h_nboot'] is None: # number of times bootstrapping is done
            h_nboot = int( min( 100 / h_rboot , 1e5 ) ) # if defined like this, chances are good that most of data is used (cut off at 1e5 for feasibility)
        else:
            h_nboot = self.evipars['h_nboot']
        eta = self.evipars['eta'] * np.ones((p,1)) # mean of gaussian prior for weights w, zero because of standardisation
        nu = self.evipars['nu'] * np.ones((p,1)) # precision of gaussian prior for weights w, <1 because of standardisation
        m = self.evipars['m'] * np.ones((1,1)) # mode of gamma distribution for prior for precision tau, ~1 because of standardisation
        v = self.evipars['v'] * np.ones((1,1)) # scale parameter for gamma distribution for prior for precision tau, >1 because of standardisation
        priordist = None # if normal prior, set to None which triggers use of optimised formula for normal distribution
        lnw_priordist = None # same for gradient
        eqlatex = ''
        # empirical bayes
        distnames = np.empty(p,dtype=object)
        dpars = np.empty(p,dtype=object)
        if heur:
            if h_norma: # use known normal parameters (mean and variance) from OLS, assuming normality
                ols_res = ols_eval(K4me[:,0,:].T,y4me[0,:])
                eta = ols_res.westi[:,None]
                if h_unitscal: # normalise scales to minimum=1 first, then apply h_scalfac to change that minimum
                    nu = 1/h_scalfac**2 * ( ols_res.westi_var.min() / ols_res.westi_var[:,None] ) # factor in brackets is 1 / ( variances / min(variance) )
                else:
                    nu = 1 / ( h_scalfac**2 * ols_res.westi_var[:,None] )
            else: # determine parameters if distribution specified in h_dist from bootstrapping
                rng_boot = np.random.default_rng()
                ntake = int(h_rboot * N)
                w_boot = np.full((h_nboot,p),np.nan)
                for ix in range(h_nboot):
                    bootix = rng_boot.choice(np.arange(N),ntake,replace=False)
                    w_boot[ix,:] = ols_eval(K4me[:,0,bootix].T,y4me[0,bootix]).westi
                # now loop through features and fit specified distribution, with plot if flag set
                eta = np.full((p,1),np.nan)
                nu = np.full((p,1),np.nan)
                priordist = np.empty(p,dtype=object)
                lnw_priordist = np.empty(p,dtype=object)
                if h_dist is None: # no distribution specified: choose the best one out of this list for every feature
                    h_dist = ['norm','t','lognorm','skewnorm','powerlognorm','exponweib','invgamma','loggamma']
                for ix in range(p):
                    if isinstance(h_dist,list):
                        dist_, dpars[ix] = seldist(dat=w_boot[:,ix],ds=h_dist,dispen=h_dispen) # dist_ is distribution object without parameters
                    else:
                        dist_ = getattr(sst, h_dist) # dist_ is distribution object without parameters
                        dpars[ix] = dist_.fit(w_boot[:,ix])
                    dist0 = dist_(*dpars[ix]) # make distribution with frozen parameters from distribution object
                    if isinstance(h_scalfac,str):
                        newscale = float(h_scalfac) # fixed scale if given as string
                    else:
                        newscale = h_scalfac*dpars[ix][-1] * np.sqrt(h_rboot) # broaden scale, and also compensate that for bootstrap factor h_rboot less data has been used
                        # above factor with h_rboot ensures that (for enough datapoints) the fitted scale is same as st.d. from normality (calculated in ols_eval())
                    distnames[ix] = dist_.name
                    priordist[ix], m2 = broadcentred(dist_,dpars[ix],newscale) # frozen prior, broadened and re-centred, also returns mode of distribution
                    # determine gradient of log-prior numerically from cubic interpolation object
                    wvec = np.linspace( m2-10*newscale , m2+10*newscale , int(1e3)).flatten() # should cover range occurring in MALA
                    ln_priordist = inter(wvec, priordist[ix].logpdf(wvec), k=3) # make spline object from log of distribution
                    lnw_priordist[ix] = ln_priordist.derivative(1) # use derivative object of that as derivative function
                    eta[ix,0] = priordist[ix].kwds['loc'] # keep location (=mean) in case readily normal distribution will be used
                    nu[ix,0] = 1/priordist[ix].kwds['scale']**2 # also scale (=st.d.)
                    if h_plot:
                        normdist_ = sst.norm
                        normpars = normdist_.fit(w_boot[:,ix])
                        normdist = normdist_(*normpars)
                        wvec = np.linspace(w_boot[:,ix].min(),w_boot[:,ix].max(),100) # weight vector for plot
                        pw, be = np.histogram(w_boot[:,ix],bins=42,density=True) # histogram to compare with
                        wb = ( (be[1:] + be[:-1]) / 2 ) # bin centres
                        fig, ax = pp.subplots() # now plot it
                        fig.set_facecolor(self.bgcol)
                        ax.set_facecolor(self.bgcol)
                        ax.plot(wb,pw,'o',mfc='none',label='hist')
                        ax.plot(wvec,normdist.pdf(wvec),'--',color=[0.5,0.5,0.5],label='norm'+' fit')
                        ax.plot(wvec,dist0.pdf(wvec),label=dist_.name+' fit')
                        ax.text(x=0.5, y=0.03, s=eqlatex, ha='center', va='bottom', transform=ax.transAxes)
                        ax.set_xlabel('$w_{'+str(ix+1)+'}$')
                        ax.set_yscale('log')
                        ax.legend()
                        pp.show()
                if h_unitscal: # normalise scales to minimum=1 first, then apply h_scalfac to change that minimum
                    uniscal = h_scalfac * np.sqrt( nu.min() / nu ) # inside sqrt it is 1 / ( variances / min(variance) )
                    for ix,dname in enumerate(distnames):
                        dist_ = getattr(sst, dname)
                        # frozen prior, scale changed to unified scale and re-centred; keep the mode of THIS
                        # feature's prior so that the interpolation grid below is centred on it
                        priordist[ix], m2 = broadcentred(dist_,dpars[ix],uniscal[ix,0])
                        # determine gradient of log-prior numerically from cubic interpolation object
                        wvec = np.linspace( m2-10*uniscal[ix,0] , m2+10*uniscal[ix,0] , int(1e3)).flatten() # should cover range occurring in MALA
                        ln_priordist = inter(wvec, priordist[ix].logpdf(wvec), k=3) # make spline object from log of distribution
                        lnw_priordist[ix] = ln_priordist.derivative(1) # use derivative object of that as derivative function
                    nu = 1/uniscal**2 # keep precision from unified scales
        if (distnames=='norm').all(): # test if all fitted distributions are normal distributions (if empty because no fits done, priordist and lnw_priordist remain None)
            priordist = None # if normal prior, set to None which triggers use of optimised formula for normal distribution
            lnw_priordist = None # same for gradient
        # estimate evidence
        qana = self.eviana1(nn=nn_,eta=eta.flatten(),nu=nu.flatten(),m=m.flatten(),v=v.flatten(),fixedtau=fixedtau)
        return qana, np.zeros(2), Rs
        
    def modcrit(self,nt=np.nan,crit='lnrsq',presel='',ntopex=np.nan,pltprog=False): # calculate criterion crit for all models with nt terms, can set aim to find exact best top ('ntopex') if error margin known (e.g. evi)
        pact = self.feats.size # number of currently active features        
        # next we create a new listing using either all or selected features as specified above,
        # or we re-evaluate existing top model listing and cut to ntop models (if presel specifies existing listing).
        critvec = None # initiliase, to remove possible existing values from other routines
        if not presel in self.top.keys() and nt<pact: # if no existing top model listing is used, enter here to go through features (all or selected)
            # ... therefore need to automatically restrict to certain number of models here
            nam = self.active_top # name of active top model listing, new listing will be saved under that name
            ntop = self.top[nam].shape[0] # number of top models that will be saved in new listing
            nall = list(it.combinations(np.arange(pact),nt)) # all models, each row is a model, model given in terms of indices. 
            if self.must.any(): # if there are terms that must be part of model ...
                nall = np.array(nall)[ self.must[self.feats][nall].any(1) , : ] # ... only consider models where at least one known term is present
            else:
                nall = np.array(nall)
            ncombis = nall.shape[0] # number of possible feature combinations
            if ntop>ncombis:
                ntop = ncombis
            # next block goes through all combinations and computes criterion
            if crit=='lnrsq': # use a quarter of specified cores because one Rsq worker occupies 4 cores on average
                if self.njobs==1 or ncombis<10*self.njobs: # if not enough models to go through, chunking for multiprocessing will not work, can just as well go through it sequentially
                    critvec = np.full(ncombis,np.nan)
                    for nnix,nn in enumerate(tqdm(nall, desc=crit+': features('+presel+'-'+str(nt)+') -> '+nam,disable=self.silent)): ### for debugging!
                        critvec[nnix] = self.Rsq(nn)
                else:
                    critvec = np.array( process_map( self.Rsq, nall, max_workers=int(np.ceil(self.njobs/4)), total=ncombis, chunksize=int(np.ceil(ncombis/self.chunks)), tqdm_class=tqdm, desc=crit+': features('+presel+'-'+str(nt)+') -> '+nam,disable=self.silent) )
                self.critunit[nam] = '$-\\ln(1-R^2)$'
                self.plotmode[nam] = ''
            elif crit=='evi':
                # ignore some warnings, as invalid steps of stochastic algorithm cause invalid value warnings before being rejected by Metropolis step
                np.seterr(invalid='ignore') # can disable all invalid value warnings, potentially dangerous though. restore with np.seterr(**errset) where errset is output of np.seterr()
                np.seterr(divide='ignore') # disable all divide-by-zero warnings (also for log)
                # run estimator
                if self.evipars['usana']:
                    critvec = np.array( process_map( self.evi, nall, max_workers=self.njobs, total=ncombis, chunksize=int(np.ceil(ncombis/self.chunks)), tqdm_class=tqdm, desc=crit+': features('+presel+'-'+str(nt)+') -> '+nam,disable=self.silent) )
                else:
                    critvec = np.full(ncombis, np.nan)
                    for nix,n in enumerate(tqdm(nall,desc=crit+': features('+presel+') -> '+nam,disable=self.silent)):
                        critvec[nix] = self.evi(n)
                self.critunit[nam] = '$\\frac{1}{N} \, \\ln p(y|M)$'
                self.plotmode[nam] = ''
            elif crit in self.olscritnames:
                preselstr = presel + '-' + str(nt)
                if preselstr in self.keepSM:
                    critvec = self.keepSM[preselstr][crit]
                    if not self.silent:
                        print(crit+': models('+preselstr+') -> '+nam)
                else:
                    SMvec = np.array( process_map( self.olscrits, nall, max_workers=self.njobs, total=ncombis, chunksize=int(np.ceil(ncombis/self.chunks)), tqdm_class=tqdm, desc=crit+': features('+presel+'-'+str(nt)+') -> '+nam,disable=self.silent) )
                    critvec = SMvec[: , self.olscritnames.index(crit) ]
                    self.keepSM[preselstr] = {}
                    for ix,olsnam in enumerate(self.olscritnames):
                        self.keepSM[preselstr][olsnam] = SMvec[:,ix]    
                self.critunit[nam] = self.olscritunits[ self.olscritnames.index(crit) ]
                self.plotmode[nam] = ''    
            critvec[np.logical_not(np.isfinite(critvec))] = -np.inf
            rankixsort = np.argsort(critvec)[::-1] # sort accoring to new computed criterion
            self.topcrit[nam] = np.array(critvec)[rankixsort][:ntop] # save new crits into listing, but only first ntop ones
            nall_top = self.feats[ np.array(nall)[rankixsort,:][:ntop,:] ] # indices pointing to features that define top models, mapped back to original feature space
            nall_rows = np.repeat(np.arange(ntop)[:,None],nt,axis=1) # auxiliary array which will be row index for model listing matrix below
            allmods = spa.coo_matrix((np.repeat(True,ntop*nt),(nall_rows.flatten(),nall_top.flatten())),shape=(ntop,self.fac_p*self.p),dtype=bool)
            self.top[nam] = allmods.tocsr() # transform sparse coordinate matrix into sparse row matrix efficient for row slicing            
        elif presel in self.top.keys(): # do model evaluation for existing top model listing, also enter here if presel is a known rankname
            feats_copy = self.feats.copy() # copy list of active feature to restore after model loop below
            self.feats = np.arange(self.fac_p*self.p) # since going through models here, term indices will not restrict to active features (feats restored after loop with above)
            nam = self.active_top # name of active top model listing, new listing will be saved under that name
            # presel is name of top model listing we copy from
            ntop = self.top[nam].shape[0] # number of top models for this new listing
            ntop_avble = self.top[presel].shape[0] # number of available top models we copy from, may be larger than ntop
            if ntop>ntop_avble and not self.silent:
                print('More models requested in '+nam+' (='+str(ntop)+') than models available in '+presel+' selected features (='+str(ntop_avble)+').\n --> Reducing model listing '+nam+' to size '+str(ntop_avble)+'.')
                ntop = ntop_avble
            self.top[nam] = self.top[presel].copy() # copy pre-selected top models into new top model listing, presel -> active
            if np.isnan(ntopex): # if no desired number of clear-cut defined top models is given ...
                ntopex = ntop # ... use number of total top models
            # next block goes through all models under name presel and calculates new criterion
            if crit=='lnrsq':
                critvec = np.array( process_map( self.Rsq, self.top[nam].toarray(), max_workers=int(np.ceil(self.njobs/4)), total=ntop_avble, chunksize=int(np.ceil(ntop_avble/self.chunks)), tqdm_class=tqdm, desc=crit+': models('+presel+') -> '+nam,disable=self.silent) )
                self.critunit[nam] = '$-\\ln(1-R^2)$'
                self.plotmode[nam] = ''
            elif crit=='evi':
                # ignore some warnings, as invalid steps of stochastic algorithm cause invalid value warnings before being rejected by Metropolis step
                np.seterr(invalid='ignore') # can disable all invalid value warnings, potentially dangerous though. restore with np.seterr(**errset) where errset is output of np.seterr()
                np.seterr(divide='ignore') # disable all divide-by-zero warnings (also for log)
                # run estimator
                mods = self.top[nam].toarray()
                critvec = np.full(ntop_avble, np.nan)
                crit_err = np.full((2,ntop_avble), np.nan)
                Rs = np.full(ntop_avble, None, dtype=object)                
                # while loop, with max runs for abort (effectively moving while loop in esti to here I guess)
                conv = np.full(ntop_avble,False)
                for nix in tqdm(np.where(conv==False)[0],total=ntop_avble-conv.sum(),desc=crit+': models('+presel+') -> '+nam,disable=self.silent):
                    critvec[nix], crit_err[:,nix], Rs[nix] = self.evi(mods[nix],Rs[nix]) # crit_err[0] is lower error (negative valued!), crit_err[1] is upper error                    
                self.critunit[nam] = '$\\frac{1}{N} \, \\ln p(y|M)$'
                self.plotmode[nam] = ''
            elif crit in self.olscritnames:
                if presel in self.keepSM:
                    critvec = self.keepSM[presel][crit]
                    if not self.silent:
                        print(crit+': models('+presel+') -> '+nam)
                else:                   
                    if self.njobs==1:
                        SMvec = np.full((ntop_avble,5),np.nan)
                        critvec = np.full(ntop_avble,np.nan)
                        for nnix,nn in enumerate(tqdm(self.top[nam].toarray(), desc=crit+': models('+presel+') -> '+nam,disable=self.silent)): ### for debugging!
                            SMvec[nnix,:] = self.olscrits(nn)
                    else:
                        SMvec = np.array( process_map( self.olscrits, self.top[nam].toarray(), max_workers=self.njobs, total=ntop_avble, chunksize=int(np.ceil(ntop_avble/self.chunks)), tqdm_class=tqdm, desc=crit+': models('+presel+') -> '+nam,disable=self.silent) )
                    critvec = SMvec[: , self.olscritnames.index(crit) ]
                    self.keepSM[presel] = {}
                    for ix,olsnam in enumerate(self.olscritnames):
                        self.keepSM[presel][olsnam] = SMvec[:,ix]       
                self.critunit[nam] = self.olscritunits[ self.olscritnames.index(crit) ]
                self.plotmode[nam] = ''             
            critvec[np.logical_not(np.isfinite(critvec))] = -np.inf
            rankixsort = np.argsort(critvec)[::-1] # sort according to new computed criterion
            self.topcrit[nam] = np.array(critvec)[rankixsort][:ntop] # save new crits into listing, but only first ntop ones
            self.top[nam] = self.top[nam][rankixsort,:][:ntop,:] # save new top model listing, but only first ntop ones
            self.feats = feats_copy.copy() # restore list of active features for possible later use
        self.exfound_ix[nam] = np.array((self.top[nam] == self.exfeat)).all(1).nonzero()[0] # row index of exact model found in top (otherwise empty)
        
    def val_lnrsq(self, nt=np.nan, presel='all'):
        """Shortcut for model selection using the R-squared based 'lnrsq' criterion.

        Forwards ``nt`` and ``presel`` unchanged to ``self.modcrit`` with
        ``crit='lnrsq'``.
        """
        forwarded = dict(nt=nt, crit='lnrsq', presel=presel)
        self.modcrit(**forwarded)
        
    def val_evi(self, nt=np.nan, presel='all', ntopex=np.nan, pltprog=False):
        """Shortcut for model selection using the model evidence ('evi').

        Forwards ``nt``, ``presel``, ``ntopex`` and ``pltprog`` unchanged to
        ``self.modcrit`` with ``crit='evi'``.
        """
        forwarded = dict(nt=nt, crit='evi', presel=presel, ntopex=ntopex, pltprog=pltprog)
        self.modcrit(**forwarded)
        
    def val_aic(self, nt=np.nan, presel='all'):
        """Shortcut for model selection using the Akaike information criterion.

        Forwards ``nt`` and ``presel`` unchanged to ``self.modcrit`` with
        ``crit='aic'``.
        """
        forwarded = dict(nt=nt, crit='aic', presel=presel)
        self.modcrit(**forwarded)
        
    def val_bic(self, nt=np.nan, presel='all'):
        """Shortcut for model selection using the Bayesian information criterion.

        Forwards ``nt`` and ``presel`` unchanged to ``self.modcrit`` with
        ``crit='bic'``.
        """
        forwarded = dict(nt=nt, crit='bic', presel=presel)
        self.modcrit(**forwarded)
        
    def val_rsq(self, nt=np.nan, presel='all'):
        """Shortcut for model selection using the 'rsq' (R-squared) criterion.

        Forwards ``nt`` and ``presel`` unchanged to ``self.modcrit`` with
        ``crit='rsq'``.
        """
        forwarded = dict(nt=nt, crit='rsq', presel=presel)
        self.modcrit(**forwarded)
        
    def val_rsqadj(self, nt=np.nan, presel='all'):
        """Shortcut for model selection using the adjusted R-squared criterion.

        Forwards ``nt`` and ``presel`` unchanged to ``self.modcrit`` with
        ``crit='rsq-adj'``.
        """
        forwarded = dict(nt=nt, crit='rsq-adj', presel=presel)
        self.modcrit(**forwarded)
        
    def val_fpval(self, nt=np.nan, presel='all'):
        """Shortcut for model selection using the p-value of the F-statistic.

        Forwards ``nt`` and ``presel`` unchanged to ``self.modcrit`` with
        ``crit='f_pvalue'``.
        """
        forwarded = dict(nt=nt, crit='f_pvalue', presel=presel)
        self.modcrit(**forwarded)
        
    def comb_top(self,topnams=None,topkeeps=None,nam='main',sortit=True,ntop=None):
        """Combine several top model listings into the listing ``nam``.

        Parameters
        ----------
        topnams : None, scalar, or iterable
            Selects the source listings in ``self.top``:
            ``None`` uses every listing except ``nam`` and ``'main'``;
            a scalar number uses that many of the last listings;
            a numeric iterable is used as indices into the listing keys;
            any other iterable (including a single string) is used as keys.
        topkeeps : None, scalar, or iterable
            Number of leading models taken from each source listing:
            ``None`` takes all, a scalar takes the same number from each,
            an iterable gives one number per listing. Values larger than the
            number of available models are reduced to that number.
        nam : str
            Name of the listing the combination is stored under (created via
            ``self.newtop``).
        sortit : bool
            If true, sort the combined listing by the stored criteria in
            descending order and keep only the first ``ntop`` models.
        ntop : int or None
            Number of models kept after sorting; ``None`` keeps all combined
            models. Ignored when ``sortit`` is false.

        Raises
        ------
        Exception
            If ``topnams`` is of a type that is not understood.
        ValueError
            If the selection of source listings is empty.
        """
        if isinstance(topnams,abc.Iterable): # check if array-like type is given (which is iterable in broadest sense)
            # extra brackets keep a single string from being split into characters
            topnams_in = np.array(list([topnams])).flatten()
            if np.issubdtype( topnams_in.dtype , np.number ): # numeric content: use as indices into keys
                topnams = np.array(list(self.top.keys()))[topnams_in]
            else: # otherwise assume the content is keys of the top model dictionary
                topnams = topnams_in
        elif np.isscalar(topnams): # assume it's numeric (None is not scalar)
            topnams = np.array(list(self.top.keys()))[-topnams:] # use last topnams many top model listings
        elif topnams is None: # default: all listings but the target one and 'main'
            topnams = np.array([s for s in self.top.keys() if not (s==nam or s=='main')])
        else: # if none of those, throw error as not understood
            raise Exception('comb_top() did not understand topnams')
        if topnams.size == 0:
            # previously this case ended in a NameError on the loop variable after the (empty) loop below
            raise ValueError('comb_top() found no top model listings to combine')
        mods = np.array([self.top[s].shape[0] for s in topnams]) # number of models in each source listing
        if np.isscalar(topkeeps): # same number of models from each listing
            topkeeps = np.repeat(topkeeps,topnams.size)
        elif topkeeps is None: # take everything (copy, so clipping below never touches mods)
            topkeeps = mods.copy()
        else: # take input as given, just make a numpy array from it
            topkeeps = np.array(list(topkeeps))
        too_many = topkeeps > mods
        topkeeps[too_many] = mods[too_many] # if more models requested than exist, reduce to available number
        ixs = np.concatenate(([0],np.cumsum(topkeeps))) # row boundaries of each source block in the target
        self.newtop(nam,ntop=topkeeps.sum())
        self.top[nam] = self.top[nam].tolil() # lil format for row-block assignment
        for i,t in enumerate(topnams):
            self.top[nam][ixs[i]:ixs[i+1]] = self.top[t][:topkeeps[i]]
            self.topcrit[nam][ixs[i]:ixs[i+1]] = self.topcrit[t][:topkeeps[i]]
        self.top[nam] = self.top[nam].tocsr()
        # unit and plot mode are taken over from the last source listing
        last = topnams[-1]
        self.critunit[nam] = self.critunit[last]
        self.plotmode[nam] = self.plotmode[last]
        if sortit:
            if ntop is None:
                ntop = topkeeps.sum()
            rankixsort = np.argsort(self.topcrit[nam])[::-1] # sort according to stored criterion
            self.topcrit[nam] = self.topcrit[nam][rankixsort][:ntop] # keep only first ntop criteria
            self.top[nam] = self.top[nam][rankixsort,:][:ntop,:] # keep only first ntop models
        if not self.silent:
            # report the number of models actually stored; ntop may be None (sortit=False)
            # or larger than what is available, both of which broke or falsified the message
            n_stored = int(self.top[nam].shape[0])
            mess = ""
            if topkeeps.size <= 7:
                for i in range(topkeeps.size):
                    mess += "{0:s}({1:d}) ".format(topnams[i],topkeeps[i])
            else:
                mess = "{0:d} listings ({1:d} models) ".format(topkeeps.size,topkeeps.sum())
            mess += "-> {0:s}({1:d})".format(nam,n_stored)
            print(mess)
        # row indices of models in the new listing that equal self.exfeat (empty if none)
        self.exfound_ix[nam] = np.array((self.top[nam] == self.exfeat)).all(1).nonzero()[0]
        
    def mod2feat(self,mod=None,nam=None,topnum=np.nan,weight=False,step=False,step_interval=None):
        """Turn a top model listing into a feature ranking.

        The ranking counts (or, with ``weight``, criterion-weights) how often
        each feature is selected among the first ``topnum`` models of the
        listing ``mod`` and stores it in row ``self.active_rankno`` of
        ``self.rank``.

        Parameters
        ----------
        mod : str or None
            Name of the source model listing. If it is not a key of
            ``self.top``, the active listing ``self.active_top`` is used.
        nam : str or None
            Name of the feature ranking. An existing name is activated via
            ``self.actrank``, a new one is created via ``self.newrank``;
            ``None`` creates a new ranking named like ``mod``.
        topnum : number
            Number of leading models to use; NaN (default) uses
            ``self.fac_p*self.p``. Always cut at the number of available models.
        weight : bool
            If true, weight each model by its stored selection criterion.
        step : bool
            If true, take ``topnum`` from the first step found by
            ``step_detect`` in the criterion sequence (overrides ``topnum``).
        step_interval : int or None
            If given, only the first ``step_interval`` criteria are searched
            for a step.
        """
        if isinstance(mod,str) and mod in self.top.keys(): # if valid mod is given, use that model listing as source
            self.acttop(mod)
        else: # use active model listing
            mod = self.active_top
        if isinstance(nam,str): # if name for feature ranking is given ...
            if nam in self.ranknams: # ... make active if already exists
                self.actrank(nam)
            else: # ... make new if not exists
                self.newrank(nam)
        else: # if no name for new feature ranking is given, make new with mod name
            nam = mod
            self.newrank(nam)
        if step: # number of models from first large decrease in criterion, overwrites topnum
            # criteria belong to the model listing, so they are looked up under mod (not the rank name)
            if step_interval is None:
                sgl = self.topcrit[mod]
            else:
                sgl = self.topcrit[mod][:step_interval]
            step_ixs, step_vals = step_detect(sgl,fr=0.5) # steps decreasing more than fr of max decrease
            topnum = step_ixs[0] # take first step for now (could also take max step using step_vals)
        elif np.isnan(topnum):
            topnum = self.fac_p*self.p # generic default when nothing else is requested
        topnum = min( topnum , self.top[mod].shape[0] ) # cut number of models at available number of models
        if weight:
            # weights must match the rows of self.top[mod], hence the criteria stored under mod
            w = self.topcrit[mod][:topnum].copy()
            rank = w @ self.top[mod][:topnum,:] # weight with selection criterion, then sum up
        else:
            rank = self.top[mod][:topnum,:].sum(axis=0) # sum up how often selected
        self.rank[self.active_rankno,:] = rank.copy() # stored unnormalised to preserve original counts
        if not self.silent:
            print("mod2feat: {0:s} -> {1:s}".format(mod,nam))
        
    def calc_comat(self, meth='corr'):
        """Compute the feature co-dependence matrix and its plot settings.

        ``meth='corr'`` stores the correlation coefficients of the columns of
        ``self.K`` in ``self.comat``; ``meth='mi'`` stores the output of
        ``calc_mimat`` scaled by its maximum. In both cases ``self.comat_props``
        is set to ``(title, vmin, colormap keyword dict)``. Any other value of
        ``meth`` leaves the object untouched.
        """
        if meth == 'mi':
            raw = calc_mimat(self.K.T)
            self.comat = raw / raw.max()
            lower = 0.
            self.comat_props = (
                'Mutual Information Matrix',
                lower,
                dict(vmin=lower, vmax=1., cmap='Oranges'),
            )
            return
        if meth != 'corr':
            return
        # corrcoef may warn (e.g. for constant columns); silence that locally
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.comat = np.corrcoef(self.K.T)
        lower = -1.
        self.comat_props = (
            'Correlation Matrix',
            lower,
            dict(vmin=lower, vmax=1., cmap='PuOr_r'),
        )
    def coselmat(self, threshold=0.5):
        """Flag feature pairs whose absolute ``self.comat`` entry exceeds ``threshold``.

        The boolean result is stored in ``self.selmat``.
        """
        magnitude = np.absolute(self.comat)
        self.selmat = np.greater(magnitude, threshold)
        
    def deactfeat(self, nr=2, cmin=0, featmin=10):
        """Shrink the set of active features based on the last ``nr`` rankings.

        Each row of ``self.rank`` is normalised by its maximum. A feature is
        dropped when its mean normalised rank over the last ``nr`` rows is at
        most ``cmin``. Every feature flagged in ``self.selmat`` for a kept
        feature is kept as well. If fewer than ``featmin`` features remain, the
        ``featmin`` features with the highest mean rank are kept instead.
        ``self.feats`` becomes its intersection with the kept features.
        """
        row_max = np.nanmax(self.rank, axis=1)[:, None]
        row_max[row_max == 0] = np.inf  # rows without any selection normalise to zero
        recent_mean = (self.rank / row_max)[-nr:, :].mean(0)
        survivors = np.arange(self.fac_p * self.p)[~(recent_mean <= cmin)]
        keep = survivors.copy()
        if survivors.size:
            # add every feature linked to a survivor through the selection matrix
            linked = [np.where(self.selmat[ft])[0] for ft in survivors]
            keep = np.unique(np.concatenate([survivors] + linked))
        if keep.size < featmin:
            keep = np.argsort(recent_mean)[::-1][:featmin]
        n_before = self.feats.size
        self.feats = np.intersect1d(self.feats, keep)  # only previously active features can stay active
        n_after = self.feats.size
        if not self.silent:
            print("deactfeat: {0:d} -> {1:d} active features".format(n_before, n_after))
        
    def feat2mod(self, nam='main', nr=2, cmin=0.8, nt=None):
        """Write consistently selected features as a model into listing ``nam``.

        Each row of ``self.rank`` is normalised by its maximum. Features whose
        normalised rank exceeds ``cmin`` in all of the last ``nr`` rankings are
        written into the single empty row of ``self.top[nam]``, but only if no
        feature exceeds ``cmin`` in just some of those rankings and exactly one
        empty row exists. ``self.exfound_ix[nam]`` is refreshed in any case.
        ``nt`` is accepted but not used.
        """
        row_max = np.nanmax(self.rank, axis=1)[:, None]
        row_max[row_max == 0] = np.inf  # rows without any selection normalise to zero
        recent = (self.rank / row_max)[-nr:, :]
        hits = (recent > cmin).sum(0)  # per feature: number of recent rankings above cmin
        always = hits == nr
        sometimes = np.isin(hits, np.arange(1, nr))
        if always.any() and not sometimes.any():
            empty_rows = np.flatnonzero(self.top[nam].toarray().sum(1) == 0)
            if empty_rows.size == 1:
                self.top[nam][empty_rows[0], always] = True
                if not self.silent:
                    print("feat2mod: last {0:d} feature ratings -> {1:s}".format(nr, nam))
        self.exfound_ix[nam] = np.array((self.top[nam] == self.exfeat)).all(1).nonzero()[0]
        
    def showmdl(self,mdlix=np.nan):
        """Build a sympy expression for the model made of the terms ``mdlix``.

        Parameters
        ----------
        mdlix : array-like of int, or NaN
            Term indices of the model. Indices below ``self.p`` are numerator
            terms, indices from ``self.p`` on are denominator terms (only used
            when ``self.fractions`` is true). If it contains NaN (default), the
            indices are taken from ``self.exix[0]`` and ``self.exix[1]``.

        Returns
        -------
        The expression parsed by ``sympy.parsing.sympy_parser.parse_expr``.
        Numerator weights are named ``w_1, w_2, ...`` and denominator weights
        ``v_1, v_2, ...``; a model without numerator terms uses ``0`` as
        numerator.
        """
        if np.isnan(np.array(mdlix)).any():
            mdlix_ = np.concatenate( ( self.exix[0] , self.exix[1]+self.p ) ) # truth, if known
        else:
            mdlix_ = np.array(mdlix, ndmin=1) # copy; also accepts plain sequences

        def _weighted_sum(ixs, symbol, offset):
            # one 'symbol_k' or 'symbol_k*term' summand per index, joined by '+'
            parts = []
            for i, ix in enumerate(ixs):
                coef = symbol + '_' + str(i+1)
                term = self.terms[ix-offset]
                parts.append(coef if term == '' else coef + '*' + term.replace(' ','*'))
            return '+'.join(parts).replace('^','**')

        has_denominator = bool(self.fractions and (mdlix_>=self.p).any())
        numerator = _weighted_sum(mdlix_[mdlix_<self.p], 'w', 0)
        if numerator == '':
            # previously an empty numerator in front of a denominator produced the
            # unparseable string ')/(...)' because the opening bracket got stripped
            numerator = '0'
        if has_denominator:
            denominator = _weighted_sum(mdlix_[mdlix_>=self.p], 'v', self.p)
            eq = '(' + numerator + ')/(' + denominator + ')'
        else:
            eq = numerator
        return sy.parsing.sympy_parser.parse_expr(eq)
    
    def gettop(self,nam=None,maxeqs=7,print_console=False,print_plot=False,print_latex=False,Xnew=None,ynow=None):
        """Collect equations and fitted weights of the leading models of a listing.

        Parameters
        ----------
        nam : str or None
            Name of the top model listing; anything but a string selects
            ``self.active_top``.
        maxeqs : int
            Maximum number of models taken from the top of the listing.
        print_console : bool
            Display the equations via IPython ``display(Math(...))``.
        print_plot : bool
            Pass the LaTeX equations to ``plot_latex``.
        print_latex : bool
            Pass the LaTeX equations and weights to ``export_latex``.
        Xnew, ynow
            Accepted but not used by this method.

        Returns
        -------
        eqlist : ndarray of object
            Entry 0 is ``self.showmdl()`` (the exact model, if known), followed
            by one equation per selected model.
        westis : ndarray of object
            Entry 0 holds the non-zero entries of ``self.exw``, followed by the
            result of ``self.ols`` per model (NaN-filled if ``self.ols`` raised).
        """
        if not isinstance(nam,str): # if no top model list given, choose active one
            nam = self.active_top
        mdlixs = self.top[nam].toarray()[:maxeqs,:] # boolean rows flagging the terms of each model
        n_entries = mdlixs.shape[0]+1 # one extra entry for the exact model
        eqlist = np.empty(n_entries,dtype=object)
        eqlist[0] = self.showmdl() # exact model as first entry
        westis = np.empty(n_entries,dtype=object)
        westis[0] = self.exw[self.exw!=0]
        if print_console:
            print("\nExact model (if known):")
            display(Math(sy.latex(eqlist[0]))) # does not work properly in darkmode
            print("\nTop models ({0:s}):".format(nam))
        for i,ix in enumerate(mdlixs): # get equations and weights for top models
            eqlist[i+1] = self.showmdl(np.where(ix)[0])
            try:
                westis[i+1] = self.ols(ix,Know=None,ynow=None)
            except Exception:
                # a failed fit is reported as NaN weights; KeyboardInterrupt/SystemExit
                # are no longer swallowed here
                westis[i+1] = np.full(ix.sum(),np.nan)
            if print_console: # does not work properly in darkmode
                display(Math(sy.latex(eqlist[i+1])))
        if print_plot or print_latex: # generate latex equations only if needed
            eqlist_latex = np.empty(n_entries,dtype=object)
            westis_latex = np.empty(n_entries,dtype=object)
            # labels run one step ahead of the equations: iteration ix appends the label of
            # top model row ix (equation ix+1), so the last appended label is surplus
            eqlabel = ['exact']
            for ix,eq in enumerate(eqlist):
                foundtext = " (is ground truth!)" if ix in self.exfound_ix[nam] else ""
                eqlist_latex[ix] = '$' + sy.latex(eq) + '$'
                if not np.isnan(westis[ix]).any():
                    westis_latex[ix] = "".join(['$w_' + str(i+1) + '=' + str(round(w*1e5)*1e-5)[:6] + '$' + ', ' for i,w in enumerate(westis[ix])])[:-2]
                else: # temporary solution when OLS returns NaN for weights
                    westis_latex[ix] = "".join(['$w_' + str(i+1) + '=' + '$' + ', ' for i,w in enumerate(westis[ix])])[:-2]
                eqlabel.append(nam + ' rank '+str(ix+1)+foundtext)
        if print_plot:
            plot_latex(eqlabel[:-1],eqlist_latex)
        if print_latex:
            export_latex(eqlabel[:-1],eqlist_latex,westis_latex,nam=nam)
        return eqlist, westis
    
    def plot_featsel(self):
        """Plot all feature rankings in ``self.rank`` as an image.

        Each ranking (row) with a positive maximum is normalised by that
        maximum, zeros are shown as missing values. Vertical lines are drawn at
        the positions ``self.exix[k][i] + k*self.p`` for every ``k`` below
        ``self.fac_p``. The figure is shown with ``pp.show()``.
        """
        # thinner marker lines the more features have to fit on the axis
        if self.p < 75:
            lw = 1.7
        elif self.p < 150:
            lw = 1.1
        else:
            lw = 0.7
        # pp.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib no longer provides
        cmap = pp.get_cmap('PuBuGn_r' if self.dark else 'terrain_r')
        fitresmat_im = self.rank.copy()
        maxs = fitresmat_im.max(1)
        used = maxs>0 # rankings in which at least one feature has been selected
        fitresmat_im[used,:] = fitresmat_im[used,:] / maxs[used,None]
        fitresmat_im[fitresmat_im==0] = np.nan
        fig, ax = pp.subplots()
        fig.set_facecolor(self.bgcol)
        ax.set_facecolor(self.bgcol)
        ax.set_yticks(np.arange(len(self.ranknams)))
        ax.set_yticklabels(self.ranknams)
        ax.set_title('feature ranking')
        im = ax.imshow(fitresmat_im,cmap=cmap,aspect='auto')
        cbar = fig.colorbar(im)
        ticks = np.arange(11,step=2)/10
        cbar.set_ticks(ticks)
        cbar.set_ticklabels(ticks)
        for k in range(self.fac_p):
            for exi in self.exix[k]:
                ax.axvline(exi+k*self.p,color=[177/255,89/255,40/255],linewidth=lw)
        pp.show()
        
    def plot_modsel(self,nam=None,nt=np.inf):
        """Plot the models of a top model listing together with their criterion.

        Parameters
        ----------
        nam : str or None
            Name of the top model listing; anything but a string selects
            ``self.active_top``.
        nt : number
            Number of leading models to show, cut at the number of available
            models (default: all).

        The first two image columns show ``self.exfeat``, followed by one column
        per model. Horizontal lines mark the features flagged in ``self.exfeat``,
        vertical lines the models listed in ``self.exfound_ix[nam]``. If the
        finite criteria differ, they are drawn as a curve with a secondary axis
        on the right; otherwise the common value is put into the title.
        The figure is shown with ``pp.show()``.
        """
        if not isinstance(nam,str):
            nam = self.active_top
        nt = min(self.top[nam].shape[0] , nt ) # cut at number of available models
        # pp.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib no longer provides
        cmap = pp.get_cmap('seismic')
        cmap.set_bad(self.bgcol)
        fitresmat_im = np.vstack( ( 2*self.exfeat , 2*self.exfeat , self.top[nam][:nt,:].toarray() ) )
        fig, ax = pp.subplots()
        fig.set_facecolor(self.bgcol)
        ax.set_facecolor(self.bgcol)
        if self.dark:
            ax.imshow(np.ma.masked_values(fitresmat_im.T,0),cmap=cmap)
        else:
            ax.matshow(np.ma.masked_values(fitresmat_im.T,0),cmap=cmap)
        ax.set_ylabel('feature index $n$')
        # work on a copy: the slice is a view, and writing NaN into it would alter the
        # stored criteria that other methods sort by
        crit = self.topcrit[nam][:nt].copy()
        crit[np.logical_not(np.isfinite(crit))] = np.nan
        for ix in np.where(self.exfeat)[0]:
            if self.dark:
                ax.axhline(ix,color=[177/255,89/255,40/255],linewidth=0.6,markersize=2)
            else:
                ax.axhline(ix,color=[177/255,89/255,40/255],linewidth=1.7,markersize=7)
        ax.axes.xaxis.set_ticks([])
        ax.axes.xaxis.set_ticklabels([])
        ax.set_xlabel('model ranking ('+nam+')')
        for found in self.exfound_ix[nam]:
            # offset 2 accounts for the two leading columns showing self.exfeat
            if self.dark:
                ax.axvline(2+found,color=[177/255,89/255,40/255],linewidth=0.7)
            else:
                ax.axvline(2+found,color=[177/255,11/255,11/255],linewidth=1.4)
        if np.allclose(np.diff(crit[~np.isnan(crit)]),0):
            ax.set_title("{0:s} $\\equiv$ {1:.7f}".format(self.critunit[nam],crit[0]),fontsize=9)
        else:
            yl = np.array([self.fac_p*self.p,0])-0.5
            # linear maps between criterion values and the feature-index axis
            mod2Rsq = lambda x : yl[0] - (x - np.nanmin(crit)) / np.abs(np.nanmax(crit) - np.nanmin(crit)) * np.abs(yl[1]-yl[0])
            if self.dark:
                ax.plot(np.arange(2,nt+2),mod2Rsq(crit),'.-',color=[211/255,121/255,221/255,0.7],linewidth=0.4,markersize=2)
            else:
                ax.plot(np.arange(2,nt+2),mod2Rsq(crit),'.-',color=[11/255,101/255,22/255,0.7],linewidth=1.7,markersize=7)
            Rsq2mod = lambda x : np.nanmin(crit) - (x - yl[0]) * np.abs(np.nanmax(crit) - np.nanmin(crit)) / np.abs(yl[1]-yl[0])
            secax = ax.secondary_yaxis('right', functions=(Rsq2mod, mod2Rsq))
            secax.set_ylabel(self.critunit[nam])
            ax.set_ylim(yl)
        pp.show()
        
    def plot_modprop(self, nam=None):
        """Plot the number of features of each model against its criterion.

        ``nam`` names the top model listing; anything but a string selects
        ``self.active_top``. If ``self.plotmode[nam]`` is ``'1minus'``, one minus
        the criterion is shown on a logarithmic x-axis. Vertical lines mark the
        criteria of the models listed in ``self.exfound_ix[nam]``. The figure is
        shown with ``pp.show()``.
        """
        if not isinstance(nam, str):
            nam = self.active_top
        n_feats = self.top[nam].sum(axis=1)  # number of selected features per model
        lowest, highest = n_feats.min(), n_feats.max()
        one_minus = self.plotmode[nam] == '1minus'
        fig, ax = pp.subplots()
        fig.set_facecolor(self.bgcol)
        ax.set_facecolor(self.bgcol)
        if one_minus:
            x_vals = 1 - self.topcrit[nam]
            x_label = '$1-' + self.critunit[nam] + '$'
        else:
            x_vals = self.topcrit[nam]
            x_label = self.critunit[nam]
        ax.plot(x_vals, n_feats, '-o', color=[211/255, 121/255, 221/255])
        ax.set_xlabel(x_label)
        ax.set_ylabel('number of features')
        ax.set_yticks(np.arange(lowest, highest + 1))
        if one_minus:
            ax.set_xscale('log')
        for found in self.exfound_ix[nam]:
            ax.axvline(self.topcrit[nam][found], color=[177/255, 89/255, 40/255], linewidth=1.1)
        pp.show()
        
    def plot_comat(self):
        """Show ``self.comat`` as an image using the settings in ``self.comat_props``.

        ``self.comat_props`` supplies the title, the lower colour bar limit and
        the keyword arguments for ``imshow``. The figure is shown with
        ``pp.show()``.
        """
        title, lower, imshow_kw = self.comat_props
        fig, ax = pp.subplots()
        for artist in (fig, ax):
            artist.set_facecolor(self.bgcol)
        image = ax.imshow(self.comat, **imshow_kw)
        cbar = fig.colorbar(image)
        cbar.set_ticks(np.linspace(lower, 1.0, 9))
        cbar.set_ticklabels(np.linspace(lower, 1.0, 9))
        cbar.set_label('$\\rho$', rotation=0, fontsize=14)
        ax.set_title(title)
        near = 0.5
        far = self.fac_p * self.p - 0.5
        ax.set_xlim([near, far])
        ax.set_ylim([far, near])
        pp.show()
        

