# Computes the size of the Rashomon set (Rset) for several datasets and saves the results to a file
import numpy as np
import pandas as pd
import time
import pathlib
from treefarms import TREEFARMS
import pickle

def run_tree_farms(df, rashomon_parameter, depth):
    """Fit a TREEFARMS model on ``df`` and return its reported tree count.

    Args:
        df: DataFrame whose last column is used as the target and whose
            remaining columns are used as the features.
        rashomon_parameter: Value passed to TREEFARMS as
            ``rashomon_bound_adder``.
        depth: Value passed to TREEFARMS as ``depth_budget``.

    Returns:
        Whatever ``model.get_tree_count()`` returns after fitting.
    """
    # Last column is the label; every column before it is a feature.
    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    # train TREEFARMS model
    config = {
        "regularization": 0.01,
        "rashomon_bound_adder": rashomon_parameter,
        # NOTE(review): multiplier is fixed at 0, presumably so that only the
        # additive bound applies -- confirm against the TREEFARMS docs.
        "rashomon_bound_multiplier": 0,
        "depth_budget": depth,
        "verbose": False,
    }

    model = TREEFARMS(config)
    model.fit(X, y)
    return model.get_tree_count()


# CSV datasets for which the Rset size is computed, in processing order
files = [
    '../datasets/car_evaluation.csv',
    '../datasets/monks2.csv',
    '../datasets/monks1.csv',
    '../datasets/monks3.csv',
    '../datasets/bar7.csv',
    '../datasets/compas.csv',
    '../datasets/SPECT.csv',
    '../datasets/fico.csv',
    '../datasets/bcw_bin.csv',
    '../datasets/restaurant_20_50.csv',
    '../datasets/carryout_takeaway.csv',
    '../datasets/restaurant_20.csv',
    '../datasets/bar.csv',
    '../datasets/coffee_house.csv',
]

# Maps each dataset path to a list of counts returned by run_tree_farms,
# one per depth budget (index 0 corresponds to depth 1).
result = {}
theta = 0.05  # passed to run_tree_farms as its rashomon_parameter argument

# For every dataset, run TREEFARMS at depth budgets 1 through 7 and record
# the count obtained at each depth.
for dataset_path in files:
    df = pd.read_csv(dataset_path)
    result[dataset_path] = [
        run_tree_farms(df, theta, depth) for depth in range(1, 8)
    ]
    

# Serialize the collected results with pickle. The context manager closes
# the file even if pickle.dump raises.
with open('tree_farms_sets', 'wb') as out_file:
    pickle.dump(result, out_file)