import os
import numpy as np

from typing import Any, List, Union, Optional, Iterator


def best_fitting_int_dtype(
    max_int_to_represent,
) -> Union[np.uint16, np.uint32, np.int64]:
    """Choose a NumPy integer type for values up to ``max_int_to_represent``.

    Adapted from Fairseq.

    Args:
        max_int_to_represent: Largest integer that has to be stored, or
            ``None`` when that bound is not known.

    Returns:
        The NumPy scalar type itself (not an instance): ``np.uint32`` when
        the bound is ``None``, ``np.uint16`` for bounds below 65500,
        ``np.uint32`` for bounds below 4294967295, and ``np.int64``
        otherwise. The bound is only compared against these upper limits;
        no check is made for negative values.
    """
    if max_int_to_represent is None:
        # Unknown bound: fall back to the 32-bit unsigned type.
        return np.uint32

    # Exclusive upper limits, ordered from the narrowest type to the widest;
    # the first limit the bound fits under decides the result.
    candidates = (
        (65500, np.uint16),
        (4294967295, np.uint32),
    )
    for exclusive_limit, dtype in candidates:
        if max_int_to_represent < exclusive_limit:
            return dtype
    return np.int64


def idx_file_path(prefix_path) -> str:
    """Return ``prefix_path`` with the ``.idx`` suffix appended.

    Args:
        prefix_path: Path prefix, given as a ``str`` or as any
            ``os.PathLike`` object whose file-system form is a ``str``
            (for example ``pathlib.Path``).

    Returns:
        The prefix as a string followed by ``".idx"``.
    """
    # os.fspath() returns a str unchanged and unwraps path-like objects, so
    # string callers see the same result as plain concatenation.
    return os.fspath(prefix_path) + ".idx"


def bin_file_path(prefix_path) -> str:
    """Return ``prefix_path`` with the ``.bin`` suffix appended.

    Args:
        prefix_path: Path prefix, given as a ``str`` or as any
            ``os.PathLike`` object whose file-system form is a ``str``
            (for example ``pathlib.Path``).

    Returns:
        The prefix as a string followed by ``".bin"``.
    """
    # os.fspath() returns a str unchanged and unwraps path-like objects, so
    # string callers see the same result as plain concatenation.
    return os.fspath(prefix_path) + ".bin"


def _safe_readline(fd, max_backtrack: int = 3) -> str:
    """Read one line from ``fd``, tolerating a start inside a character.

    Adapted from Fairseq. When ``fd`` has been positioned in the middle of a
    multi-byte character, ``readline()`` raises ``UnicodeDecodeError``; this
    helper then moves the position back one step at a time and retries.

    Args:
        fd: Seekable text-mode file object.
        max_backtrack: Maximum number of one-step retreats to attempt. The
            default of 3 is enough for UTF-8, where a character occupies at
            most four bytes.

    Returns:
        The line returned by the first ``readline()`` call that succeeds.

    Raises:
        UnicodeDecodeError: If the data still cannot be decoded after
            ``max_backtrack`` retreats, or once the position has reached the
            start of the file. The data is then genuinely undecodable rather
            than merely misaligned.
    """
    # NOTE(review): the arithmetic below treats tell() as a byte offset, which
    # assumes fd was just positioned with an absolute offset - confirm for
    # any caller that reads before calling this helper.
    pos = fd.tell()
    steps_back = 0
    while True:
        try:
            return fd.readline()
        except UnicodeDecodeError:
            # Give up instead of seeking to a negative position or walking
            # back through the whole file; surface the real decoding error.
            if steps_back >= max_backtrack or pos <= 0:
                raise
            steps_back += 1
            pos -= 1
            fd.seek(pos)  # search where this character begins


def find_offsets(filename: str, num_chunks: int) -> List[int]:
    """Compute offsets that split a UTF-8 text file into line-aligned chunks.

    The file size is divided into ``num_chunks`` equal nominal parts. Each
    interior boundary is then pushed forward to the end of the line it falls
    in, so that chunks can be read around full lines.

    Args:
        filename: Path of the text file to split.
        num_chunks: Number of chunks to produce; must be at least 1.

    Returns:
        A list of ``num_chunks + 1`` offsets. The first entry is 0, the last
        is the file size in bytes, and every entry in between is the position
        reported by ``tell()`` after reading to the end of the line that
        contains the nominal boundary.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError(
            f"num_chunks must be a positive integer, got {num_chunks}"
        )

    with open(filename, "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0] * (num_chunks + 1)
        for i in range(1, num_chunks):
            # Jump to the nominal boundary, then consume the remainder of the
            # current line so the recorded offset sits on a line start.
            f.seek(chunk_size * i)
            _safe_readline(f)
            offsets[i] = f.tell()
        offsets[-1] = size
        return offsets


def load_files_from_folder(data_dir: str, prefix_name: str) -> List[str]:
    """List the entries of ``data_dir`` whose names start with ``prefix_name``.

    Args:
        data_dir: Folder to scan (not recursive).
        prefix_name: Required beginning of the entry name.

    Returns:
        The matching entries, each joined with ``data_dir``, sorted by name.
        Matching is done on the name only; no check is made that an entry is
        a regular file.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    if not os.path.exists(data_dir):
        raise FileNotFoundError(f'Not found data folder: {data_dir}')

    # os.listdir() returns names in arbitrary order; sorting keeps the result
    # identical across runs and platforms.
    return [
        os.path.join(data_dir, entry)
        for entry in sorted(os.listdir(data_dir))
        if entry.startswith(prefix_name)
    ]


def get_chunk_iterator(filename: str, start_offset: int, end_offset: int) -> Iterator:
    """Yield the lines of one chunk of a UTF-8 text file.

    Args:
        filename: Path of the text file to read.
        start_offset: Position to seek to before reading the first line.
        end_offset: Position at which the chunk ends. A value of 0 or less
            disables the limit, so reading continues to the end of the file.

    Yields:
        Each line, as returned by ``readline()``, whose end position does not
        exceed ``end_offset``.
    """
    with open(filename, 'r', encoding='utf-8') as reader:
        reader.seek(start_offset)
        # The helper retries from an earlier position if the start offset
        # lands inside a multi-byte character.
        current = _safe_readline(reader)

        while True:
            if not current:
                # readline() returns an empty string at end of file.
                return

            position = reader.tell()
            # Stop once the line just read ends past the chunk. Positions at
            # or beyond end_offset + 2**32 do not stop the iteration.
            # NOTE(review): presumably a guard against text-mode tell()
            # reporting very large opaque values - confirm.
            if end_offset > 0 and end_offset < position < end_offset + 2**32:
                return

            yield current
            current = reader.readline()


def get_file_offsets(
    files: List[str],
    chunk_load: Optional[bool] = False,
    chunk_num: Optional[int] = 1
) -> List:
    """Build the list of ``(filename, (start, end))`` spans for ``files``.

    Args:
        files: Paths of the text files to index.
        chunk_load: When falsy, every file is covered by a single span no
            matter what ``chunk_num`` says.
        chunk_num: Number of spans per file when ``chunk_load`` is truthy.
            ``None`` is treated as 1.

    Returns:
        One ``(filename, (start_offset, end_offset))`` tuple per span, in the
        order of ``files`` and, within a file, in increasing span order.
    """
    # If chunk loading is disabled, or no chunk count was given, fall back to
    # one span per file.
    if not chunk_load or chunk_num is None:
        chunk_num = 1

    file_offsets = []
    for filename in files:
        boundaries = find_offsets(filename, chunk_num)
        # Pair each boundary with the next one to form (start, end) spans.
        file_offsets.extend(
            (filename, span) for span in zip(boundaries[:-1], boundaries[1:])
        )

    return file_offsets
