import numpy as np
import tiktoken

# CUDA device string selecting GPU 0. It is not referenced by the
# statements visible in this script.
device = "cuda:0"

# Memory-map the OpenWebText training file read-only, interpreting its
# contents as unsigned 16-bit integers, so slices are read from disk on
# demand instead of loading the whole file up front.
print("Load the openwebtext datasets.....")
train_data = np.memmap(
    "data/openwebtext/train.bin",
    mode="r",
    dtype=np.uint16,
)

# Tokenizer: GPT-2 byte-pair encoding provided by tiktoken.
enc = tiktoken.get_encoding("gpt2")


def str2tok(s: str) -> list:
    """Encode the string *s* into a list of GPT-2 token ids.

    ``<|endoftext|>`` is explicitly allowed, so an occurrence of that
    marker in *s* is encoded as the special token.
    """
    return enc.encode(s, allowed_special={"<|endoftext|>"})


def tok2str(l) -> str:
    """Decode an iterable of token ids *l* back into text.

    The ids are first converted to a list of built-in ints, so callers
    may pass a list, a 1-D NumPy array or a memmap slice alike.
    NOTE(review): some tiktoken builds appear to reject non-list
    sequences such as ndarrays; confirm against the installed version.
    """
    return enc.decode([int(tok) for tok in l])

# Load the spectrum: an array whose rows are indexed below as
# (seq[0], seq[1]) = (position of the last token, number of tokens
# preceding it) into train_data.
spec = np.load('spec_RNA.npy')

# Print the token window described by each row, followed by a separator.
for seq in spec:
    # Work with Python ints so the subtraction cannot wrap around when
    # spec has an unsigned dtype.
    last = int(seq[0])
    first = last - int(seq[1])
    # A window reaching before the start of the data would otherwise be
    # interpreted as a "from the end" slice and print the wrong (usually
    # empty) text; truncate it at the first token instead.
    if first < 0 <= last:
        first = 0
    # Hand the decoder a plain list of ints rather than a memmap slice.
    print(tok2str(train_data[first:last + 1].tolist()))
    print('=====')
