from pathlib import Path
import polars as pl
import best3 as b3

import numpy as np

# Number of times the finished dataset is saved (one numbered directory each).
repeat = 3
# Directory containing the input spreadsheets.
data_dir = Path("data")

# Alternative metadata: the sheet is read without a header row and its single
# column is named "text" (presumably one joke per row -- confirm against the file).
mdf = pl.read_excel(data_dir / "Dataset4JokeSet.xlsx", has_header=False)
mdf.columns = ["text"]

# Add two row-number columns. Each `with_row_index` call puts its column first,
# so the final column order is: name, alternative, text. "name" is the same
# row number as "alternative" at this point, cast to a string.
mdf = (
    mdf.with_row_index("alternative")
    .with_row_index("name")
    .cast({"name": pl.String()})
)

# Long-format rating records. The first entry is the column header; every later
# entry is one (customer, alternative, rating) observation.
data = [("customer", "alternative", "rating")]

# Both rating spreadsheets are read without a header row and turned into arrays.
ds1 = pl.read_excel(data_dir / "FINAL jester 2006-15.xls", has_header=False).to_numpy()
ds2 = pl.read_excel(
    data_dir / "[final] April 2015 to Nov 30 2019 - Transformed Jester Data - .xlsx",
    has_header=False,
).to_numpy()

# Cell value that is treated as "no rating" below.
missing_rating = 99

# Right-pad ds1 with the sentinel so it has as many columns as ds2 and the two
# can be stacked row-wise. Assumes ds1 is not wider than ds2 (np.full rejects a
# negative width).
extra_columns = ds2.shape[1] - ds1.shape[1]
ds1_padded = np.concatenate(
    [ds1, np.full((len(ds1), extra_columns), missing_rating)], axis=1
)

for customer, record in enumerate(np.concatenate([ds1_padded, ds2])):
    # Column 0 is compared against the number of usable ratings found in the
    # remaining columns of the same row.
    declared_count = record[0]

    # Keep every cell that is neither the sentinel nor NaN/inf. The column
    # offset (0-based, after dropping column 0) becomes the alternative id.
    row = [
        (customer, alt, float(rating))
        for alt, rating in enumerate(record[1:])
        if rating != missing_rating and np.isfinite(rating)
    ]

    if len(row) != declared_count:
        # The row disagrees with its own declared count: report it and leave
        # it out. (Previously the message was printed but the row was still
        # added to `data`.)
        print("skipping", customer)
        continue
    data.extend(row)

# Pivot the row tuples into one list per column, keyed by the header names.
header, *records = data
df = pl.DataFrame(
    {column: [entry[i] for entry in records] for i, column in enumerate(header)}
)
# Call `b3.save_dataset` `repeat` times with the same inputs, each time
# targeting a different numbered subdirectory of datasets/jokes/default.
output_root = Path("datasets", "jokes", "default")
for copy_index in range(repeat):
    target_dir = output_root / str(copy_index)
    # A fresh identity matrix per call, with one row/column per row of `mdf`.
    identity = np.eye(len(mdf))
    b3.save_dataset(target_dir, mdf, df, identity)
