import sys, os, time, socket, yaml, logging, numpy as np
import logging.config
import scipy.sparse as sp
from tqdm import tqdm
import transformers
import scipy.sparse as sp

def to_scipy_matrix(lol_inds, num_cols):
  """Build a CSR indicator matrix from a list of per-row column indices.

  Args:
    lol_inds: Sequence whose i-th element is a sequence of column indices
      that should be set for row i. Elements may be empty.
    num_cols: Number of columns of the resulting matrix.

  Returns:
    A float64 `scipy.sparse` CSR matrix of shape `(len(lol_inds), num_cols)`
    holding 1.0 at every listed (row, column) position. An index repeated
    within one row is summed by the COO -> CSR conversion, so its entry
    ends up greater than 1.0.
  """
  num_rows = len(lol_inds)
  row_lengths = [len(inds) for inds in lol_inds]
  nnz = sum(row_lengths)

  # np.concatenate refuses an empty sequence, so handle "no rows" (and the
  # trivially equivalent "no entries at all") without calling it.
  if nnz == 0:
    return sp.csr_matrix((num_rows, num_cols), dtype=np.float64)

  # Explicit integer dtype: an empty row would otherwise turn the
  # concatenated index array into float64.
  cols = np.concatenate([np.asarray(inds, dtype=np.int64) for inds in lol_inds])
  # Row i is repeated once per index it owns.
  rows = np.repeat(np.arange(num_rows, dtype=np.int64), row_lengths)
  data = np.ones(nnz, dtype=np.float64)
  return sp.coo_matrix((data, (rows, cols)), shape=(num_rows, num_cols)).tocsr()

import pandas, gzip
def _read_json_lines_gz(path):
  """Load a gzip-compressed JSON-lines file into a pandas DataFrame."""
  # The context manager closes the gzip handle once pandas has consumed it.
  with gzip.open(path) as handle:
    return pandas.read_json(handle, lines=True)


def _write_stripped_titles(path, df):
  """Write the whitespace-stripped `title` column of `df` to `path`, one per line."""
  titles = [title.strip() for title in df.title]
  # UTF-8 is pinned so the output does not depend on the platform's locale;
  # the `with` block guarantees the file is flushed and closed.
  with open(path, 'w', encoding='utf-8') as out:
    out.write('\n'.join(titles))
    out.write('\n')


def process_lf_amazon_datasets(dataset):
  """Convert the raw files of one dataset into label matrices and text files.

  Reads `trn.json.gz`, `tst.json.gz` and `lbl.json.gz` (JSON-lines) from
  `datasets/<dataset>/` and writes, into the same directory:

    * `Y.trn.npz` / `Y.tst.npz`: float32 sparse label matrices built from the
      `target_ind` column of the train / test frames, with one column per
      row of the label frame.
    * `X.trn.txt` / `X.tst.txt` / `Y.txt`: the stripped `title` column of the
      train / test / label frames, one title per line.

  Args:
    dataset: Name of the sub-directory of `datasets/` to process.
  """
  base = f'datasets/{dataset}'

  print('Reading raw dataset files...')
  trn_df = _read_json_lines_gz(f'{base}/trn.json.gz')
  tst_df = _read_json_lines_gz(f'{base}/tst.json.gz')
  lbl_df = _read_json_lines_gz(f'{base}/lbl.json.gz')

  print('Processing Y (label) files...')
  num_labels = lbl_df.shape[0]
  trn_X_Y = to_scipy_matrix(trn_df.target_ind.values, num_labels).astype(np.float32)
  tst_X_Y = to_scipy_matrix(tst_df.target_ind.values, num_labels).astype(np.float32)

  sp.save_npz(f'{base}/Y.trn.npz', trn_X_Y)
  sp.save_npz(f'{base}/Y.tst.npz', tst_X_Y)

  print('Processing X (input) files...')
  _write_stripped_titles(f'{base}/X.trn.txt', trn_df)
  _write_stripped_titles(f'{base}/X.tst.txt', tst_df)
  _write_stripped_titles(f'{base}/Y.txt', lbl_df)

# Script entry point: process the dataset named by the first command-line argument.
# NOTE(review): there is no `if __name__ == '__main__':` guard, so this call
# also runs whenever the module is imported.
process_lf_amazon_datasets(sys.argv[1])
