# Import Python packages.
import os
from typing import Any, Mapping, Optional, Sequence, Tuple

# Import external packages.
import boto3  # type: ignore[import-untyped]
import pandas as pd

# Import PyTest packages.
import pytest

# Import PyTest external packages.
from moto import mock_aws
from py._path.local import LocalPath

# Import developing library.
import fin_tech_py_toolkit as lib

from ...data.test_tabular import synthesize as synthesize_dataframe

# Import testing library.
from ...utils import eq_dataframe
from ..utils import template_test_dataset


# Runtime constants.
# Identifier of the dataset class under test (`DatasetTabularSimple`).
# It is handed to `template_test_dataset` by the tests below.
IDENTIFIER = lib.datasets.DatasetTabularSimple._IDENTIFIER


def synthesize(
    root: str, relpath: str, /, *, s3: bool, cache_read: bool  # noqa: W504
) -> Tuple[
    Sequence[pd.DataFrame], Sequence[Tuple[str, Optional[str]]], Sequence[Any], Mapping[str, Any]
]:
    r"""
    Synthesize test I/O.

    The same synthetic table is written into four CSV files of different layouts (regular, without
    header, wrapped by garbage lines, and with extra columns), and is returned together with
    reading arguments defined for each layout.

    Args
    ----
    - root
        Root directory.
    - relpath
        Relative directory to root to store synthetic data.
        If `s3` is True, it is also used as the name of the created bucket.
    - s3
        If True, mocking data storage on S3 bucket.
    - cache_read
        If True, non-local data (e.g., from URLs) will be cached into class cache directory on
        local file system.

    Returns
    -------
    - sources
        Data sources corresponding to loaded dataset memory.
    - addresses
        Addresses corresponding to those data sources.
    - args
        Positional arguments for dataset memory loading.
    - kwargs
        Keyword arguments for dataset memory loading.
    """
    # Generate source data files.
    directory = os.path.join(root, relpath)
    lib.io.mkdirs(directory)
    source = synthesize_dataframe(irregular=False)
    columns = list(source.columns)
    sources = []
    addresses = []

    # Common format is a regular table.
    # Writing does not modify the frame, so no defensive copy is required.
    path1 = os.path.join(directory, "source1.csv")
    source.to_csv(path1, index=False)
    sources.append(source)
    addresses.append(("source1", path1))

    # Other format can be the table without header.
    path2 = os.path.join(directory, "source2.csv")
    source.to_csv(path2, index=False, header=False)
    sources.append(source)
    addresses.append(("source2", path2))

    # Other format can be the table with unrelated information at beginning and ending.
    path3 = os.path.join(directory, "source3.csv")
    text = source.to_csv(None, index=False)
    text = "\n".join(["head-garbage", text.strip(), "tail-garbage"])
    # Explicit UTF-8 keeps this hand-written file consistent with files written by pandas, instead
    # of depending on the locale encoding.
    with open(path3, "w", encoding="utf-8") as file:
        # Save modified data text.
        file.write(text)
    sources.append(source)
    addresses.append(("source3", path3))

    # Other format can be the table with unnecessary columns.
    # A copy is required here since an extra column is added.
    path4 = os.path.join(directory, "source4.csv")
    source4 = source.copy()
    source4["garbage"] = ["garbage"] * len(source4)
    # NOTE(review): The index is written as well (no `index=False`); presumably it acts as another
    # unnecessary column to be filtered by `usecols` --- confirm.
    source4.to_csv(path4)
    sources.append(source)
    addresses.append(("source4", path4))

    # Cache relative paths are only defined for S3 storage.
    cache_relpaths: Mapping[str, str] = {}

    # Mock S3 bucket.
    if s3:
        # Upload source files and replace local paths by S3 URLs.
        s3client = boto3.client("s3")
        s3client.create_bucket(Bucket=relpath)
        uploads = [(name, address, os.path.basename(address)) for name, address in addresses]
        for _, address, key in uploads:
            # Upload to s3 bucket, using file name as object key.
            s3client.upload_file(address, relpath, key)
        cache_relpaths = {name: key for name, _, key in uploads}
        addresses = [(name, f"s3://{relpath:s}/{key:s}") for name, _, key in uploads]

    # Construct arguments based on synthesis process.
    # "source1" is a regular table, thus it has no specific reading arguments.
    return (
        sources,
        addresses,
        [],
        {
            "cache_read": cache_read,
            "cache_relpaths": cache_relpaths,
            "read_args": [(["source2", "source3", "source4"], [])],
            "read_kwargs": [
                (["source2"], {"header": None, "names": columns}),
                (["source3"], {"skiprows": 1, "skipfooter": 1, "engine": "python"}),
                (["source4"], {"usecols": columns}),
            ],
            "sorts": ("identity", "identity"),
        },
    )


@pytest.mark.parametrize(
    ("link", "s3", "cache_read"),
    [
        (True, False, False),
        (False, False, False),
        (False, True, True),
        (False, True, False),
    ],
    ids=["link", "copy", "s3-offline", "s3-online"],
)
@mock_aws
def test_default(*, tmpdir: LocalPath, link: bool, s3: bool, cache_read: bool) -> None:
    r"""
    Test dataset loading transformation on synthetic sources.

    Args
    ----
    - tmpdir
        Temporary directory for this test.
        It is a PyTest fixture, so its value should not be explicitly defined.
    - link
        If True, the dataset is a link to original data, and saving it only stores addresses of
        original data.
        If False, the dataset saves its own copy of original data under its own format.
    - s3
        If True, synthetic sources are stored on a mocked S3 bucket.
    - cache_read
        If True, non-local data (e.g., from URLs) will be cached into class cache directory on
        local file system.

    Returns
    -------
    """
    # Prepare working directory and transformation factory.
    workdir = str(tmpdir)
    transform_factory = lib.transforms.FactoryTransform()

    # Synthesize sources under "src" together with their loading arguments.
    synthesized = synthesize(workdir, "src", s3=s3, cache_read=cache_read)
    sources, addresses, load_args, load_kwargs = synthesized

    # Delegate all checks to shared dataset test template.
    template_test_dataset(
        workdir,
        IDENTIFIER,
        transform_factory,
        sources,
        addresses,
        eq_dataframe,
        link=link,
        memorize_args=load_args,
        memorize_kwargs=load_kwargs,
    )


@pytest.mark.parametrize(
    ("read_args", "read_kwargs", "sorts"),
    [
        pytest.param(
            [],
            [(["source"], {})],
            ("identity", "identity"),
            id="args",
            marks=pytest.mark.xfail(raises=AssertionError),
        ),
        pytest.param(
            [(["source"], [])],
            [],
            ("identity", "identity"),
            id="kwargs",
            marks=pytest.mark.xfail(raises=AssertionError),
        ),
        pytest.param(
            [(["source"], [])],
            [(["source"], {})],
            None,
            id="sorts",
            marks=pytest.mark.xfail(raises=AssertionError),
        ),
    ],
)
def test_read_missing_loud(
    *,
    tmpdir: LocalPath,
    read_args: Sequence[Tuple[Sequence[str], Sequence[Any]]],
    read_kwargs: Sequence[Tuple[Sequence[str], Mapping[str, Any]]],
    sorts: Optional[Tuple[str, str]],
) -> None:
    r"""
    Test loud error raising when reading arguments are missing.

    Every case leaves out exactly one of reading positional arguments, reading keyword arguments
    or sorting algorithms, and is expected to fail by an assertion error.

    Args
    ----
    - tmpdir
        Temporary directory for this test.
        It is a PyTest fixture, so its value should not be explicitly defined.
    - read_args
        A collection of positional arguments for reading data from named addresses.
        Each item of the collection is a pair of name groups and sharing positional arguments.
        Name groups of different items should not overlap.
    - read_kwargs
        A collection of keyword arguments for reading data from named addresses.
        Each item of the collection is a pair of name groups and sharing keyword arguments.
        Name groups of different items should not overlap.
    - sorts
        Column and row sorting algorithms for disambiguation on homogeneous tabular data from
        each named address.

    Returns
    -------
    """
    # Prepare working directory and transformation factory.
    workdir = str(tmpdir)
    transform_factory = lib.transforms.FactoryTransform()

    # Write a single regular source table under "src".
    src_directory = os.path.join(workdir, "src")
    lib.io.mkdirs(src_directory)
    csv_path = os.path.join(src_directory, "source.csv")
    table = synthesize_dataframe(irregular=False)
    table.to_csv(csv_path, index=False)

    # Silent defaults are disabled, so missing reading arguments are not filled quietly.
    addresses: Sequence[Tuple[str, Optional[str]]] = [("source", csv_path)]
    load_args: Sequence[Any] = []
    load_kwargs = {
        "read_args": read_args,
        "read_kwargs": read_kwargs,
        "read_silent_default": False,
        "sorts": sorts,
    }

    # Delegate all checks to shared dataset test template.
    template_test_dataset(
        workdir,
        IDENTIFIER,
        transform_factory,
        [table],
        addresses,
        eq_dataframe,
        link=True,
        memorize_args=load_args,
        memorize_kwargs=load_kwargs,
    )
