#!/usr/bin/env python

"""Create a histogram showing the age distribution of the participants.

The freedman_diaconis function is adapted from
http://www.jtrive.com/determining-histogram-bin-width-using-the-freedman-diaconis-rule.html
"""

import os
import pandas as pd
import sqlite3
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats

def freedman_diaconis_bins(data) -> int:
    """
    Compute the number of histogram bins using the Freedman-Diaconis rule.

    The bin width is ``2 * IQR / cbrt(N)``; the number of bins is the data
    range divided by that width, plus one, truncated to an integer.

    The rule breaks down when the interquartile range is zero (the bin width
    becomes zero), so two fallbacks are used in that situation:

    * if every value is identical, a single bin is returned;
    * otherwise the number of bins is taken from Sturges' rule,
      ``ceil(log2(N)) + 1``.

    Parameters
    ----------
    data: One-dimensional array-like of numeric values.

    Returns
    -------
    bins: Number of bins computed.

    Raises
    ------
    ValueError: If ``data`` is empty.
    """
    values = np.asarray(data)
    count = values.size
    if count == 0:
        raise ValueError("Cannot compute histogram bins for empty data.")

    spread = values.max() - values.min()
    if spread == 0:
        # All observations are equal: one bin holds everything.
        return 1

    iqr = stats.iqr(values, rng=(25, 75), scale="raw")
    if iqr == 0:
        # A zero IQR would give a zero bin width and a division by zero
        # below, so fall back to Sturges' rule instead.
        return int(np.ceil(np.log2(count))) + 1

    bin_width = (2 * iqr) / np.cbrt(count)
    return int((spread / bin_width) + 1)

# Open the per-user participant database. The path is built from the USER
# environment variable.
db_connection = sqlite3.connect(f"/space/{os.getenv('USER')}/tomcat/tomcat.db")

# Using a sqlite3 connection as a context manager only commits or rolls back
# the transaction; it does not close the connection. Close it explicitly so
# the database handle is released as soon as the rows have been read.
try:
    rows = db_connection.execute(
        """
        SELECT id, age from participant
        """
    ).fetchall()
finally:
    db_connection.close()

# Keep only participants with a positive id (presumably non-positive ids are
# placeholder entries -- confirm against the schema). A missing age is
# recorded with the "Not reported" marker.
ages = [
    age if age is not None else "Not reported"
    for participant_id, age in rows
    if participant_id > 0
]

print(f"{ages.count('Not reported')} out of {len(ages)} participants did not report their age.")

# Only the reported ages are plotted.
valid_ages = [age for age in ages if age != "Not reported"]

plt.style.use("ggplot")
fig, axes = plt.subplots(figsize=(2,2))
bins = freedman_diaconis_bins(valid_ages)
axes.hist(valid_ages, bins=bins)
plt.tight_layout()
# NOTE: the output path is relative to the current working directory.
fig.savefig("../../../images/ages.pdf")
