import json
from pathlib import Path
from textwrap import dedent
import datetime
import functools
import types
import typing

import pydantic
import mlcroissant as mlc  # type: ignore

from . import validate

# Files that make up the dataset.  "repo" is the single root FileObject; every
# FileSet below is a glob evaluated inside it.  Each metadata glob is registered
# twice -- once as JSON and once as plain text (the "-raw" sets).
# NOTE(review): ``sha256`` carries the string "main" rather than a digest --
# presumably a branch name for the git+https source; confirm.
_distribution: list[mlc.FileObject | mlc.FileSet] = [
    mlc.FileObject(
        id="repo",
        name="repo",
        content_url="https://example.com",
        encoding_format="git+https",
        sha256="main",
    ),
    *(
        mlc.FileSet(
            id=set_id,
            name=set_id,
            contained_in=["repo"],
            includes=[pattern],
            encoding_format=mime,
        )
        for set_id, pattern, mime in (
            (
                "system-metadata-files",
                "systems/*/system.json",
                "application/json",
            ),
            (
                "system-metadata-files-raw",
                "systems/*/system.json",
                "text/plain",
            ),
            (
                "corpus-metadata-files",
                "systems/*/data/*/metadata.json",
                "application/json",
            ),
            (
                "corpus-metadata-files-raw",
                "systems/*/data/*/metadata.json",
                "text/plain",
            ),
        )
    ),
]

# Dataset-level Croissant metadata.  It is created with no record sets; the
# ``insert_*`` functions in this module append to ``record_sets`` (and
# ``insert_corpora`` also appends to ``distribution``).
_metadata = mlc.Metadata(
    # Identity and descriptive text.
    name="ELCC",
    version="0.1.0",
    url="https://example.com",
    description="ELCC is a collection of emergent language corpora with accompanying metadata and analyses.",
    keywords=["emergent communication", "emergent language", "corpus"],
    license=["https://creativecommons.org/licenses/by/4.0/"],
    # Placeholder: the literal below contains only indentation.
    cite_as=dedent(
        """\
        """
    ),
    # Evaluated once, when this module is imported.
    date_published=datetime.datetime.now(datetime.UTC),
    # Structure.
    distribution=_distribution,
    record_sets=[],
)


def insert_corpora(metadata: mlc.Metadata) -> None:
    """Register every ``systems/<system>/data/<variant>/corpus.json`` file.

    The glob is resolved relative to the current working directory.  For each
    match, in sorted path order, this appends to ``metadata``:

    * a ``FileObject`` (contained in ``"repo"``) whose id, name and content URL
      are the file's relative path, and
    * a ``RecordSet`` named ``"<system>/<variant>"`` with a single repeated
      integer field ``line`` extracted with the JSONPath ``$[*]``.

    Args:
        metadata: Croissant metadata object to extend in place.
    """
    for path in sorted(Path("systems").glob("*/data/*/corpus.json")):
        # Use path components and a POSIX rendering instead of splitting
        # ``str(path)`` on "/", so ids and names do not depend on the
        # platform's path separator.
        system, variant = path.parts[-4], path.parts[-2]
        name = f"{system}/{variant}"
        file_id = path.as_posix()

        metadata.distribution.append(
            mlc.FileObject(
                id=file_id,
                name=file_id,
                content_url=file_id,
                encoding_format="application/json",
                contained_in=["repo"],
            )
        )
        metadata.record_sets.append(
            mlc.RecordSet(
                id=name,
                name=name,
                fields=[
                    mlc.Field(
                        id=f"{name}/line",
                        name="line",
                        data_types=mlc.DataType.INTEGER,
                        repeated=True,
                        source=mlc.Source(
                            file_object=file_id,
                            extract=mlc.Extract(json_path="$[*]"),
                        ),
                    ),
                ],
            )
        )


def make_system_field(base_name: str, typ: type | None) -> mlc.Field:
    """Build the ``system-metadata`` field for one key of ``system.json``.

    The annotation is reduced to a plain Python class and then mapped to a
    Croissant data type:

    * ``X | None`` is unwrapped to ``X``;
    * ``"system.data_source"`` is always exported as text;
    * ``Literal[...]`` is replaced by the type of its first value;
    * subclasses of ``bool`` / ``int`` / ``float`` / ``str`` map to
      ``BOOL`` / ``INTEGER`` / ``FLOAT`` / ``TEXT``;
    * anything else, including ``None`` and non-class annotations, falls back
      to ``TEXT``.

    Args:
        base_name: Dotted key (e.g. ``"system.name"``).  Used as the field
            name and, prefixed with ``$.``, as the JSONPath to extract.
        typ: Annotation of the corresponding model field.

    Returns:
        A field with id ``system-metadata/<base_name>`` sourced from the
        ``system-metadata-files`` file set.

    Raises:
        ValueError: If ``typ`` is a union other than ``X | None``.
    """
    name = f"system-metadata/{base_name}"
    jp = f"$.{base_name}"

    # ``bool`` must be tested before ``int``: ``issubclass(bool, int)`` is
    # true, so with ``int`` first the BOOL row could never be reached.
    type_map = [
        (bool, mlc.DataType.BOOL),
        (int, mlc.DataType.INTEGER),
        (float, mlc.DataType.FLOAT),
        (str, mlc.DataType.TEXT),
    ]

    # Normalise the annotation once, before any lookup.
    if isinstance(typ, types.UnionType):
        members = typing.get_args(typ)
        not_none = [m for m in members if m is not types.NoneType]
        if len(members) > 2 or len(not_none) != 1:
            raise ValueError(f"Cannot handle {typ}")
        typ = not_none[0]
    if base_name == "system.data_source":
        typ = str
    elif typing.get_origin(typ) is typing.Literal:
        typ = type(typing.get_args(typ)[0])

    mlc_typ = mlc.DataType.TEXT
    # ``issubclass`` raises TypeError for non-classes, so guard it and let
    # those annotations keep the TEXT default.
    if isinstance(typ, type):
        for py_type, data_type in type_map:
            if issubclass(typ, py_type):
                mlc_typ = data_type
                break

    return mlc.Field(
        id=name,
        name=base_name,
        data_types=[mlc_typ],
        source=mlc.Source(
            file_set="system-metadata-files",
            extract=mlc.Extract(json_path=jp),
        ),
    )


def insert_system_md(metadata: mlc.Metadata) -> None:
    """Append the ``system-metadata-raw`` and ``system-metadata`` record sets.

    ``system-metadata-raw`` exposes each file's full path and raw content.
    ``system-metadata`` exposes the full path plus one field per entry of
    ``validate.SystemMetadata.model_fields``; entries whose annotation is a
    pydantic model are expanded one level deep as ``<outer>.<inner>``.  The
    top-level ``notes`` key and nested ``variants`` keys are skipped.

    Args:
        metadata: Croissant metadata object to extend in place.
    """
    raw_files = "system-metadata-files-raw"
    json_files = "system-metadata-files"

    # NOTE: the raw path is not linked to "system-metadata/path" through
    # ``references`` because references always seem to cause conflicting read
    # method errors.
    raw_path = mlc.Field(
        id="system-metadata-raw/path",
        name="path",
        data_types=[mlc.DataType.TEXT],
        source=mlc.Source(
            file_set=raw_files,
            extract=mlc.Extract(file_property=mlc.FileProperty.fullpath),
        ),
    )
    raw_content = mlc.Field(
        id="system-metadata-raw/raw",
        name="raw",
        data_types=[mlc.DataType.TEXT],
        source=mlc.Source(
            file_set=raw_files,
            extract=mlc.Extract(file_property=mlc.FileProperty.content),
        ),
    )
    metadata.record_sets.append(
        mlc.RecordSet(
            id="system-metadata-raw",
            name="system-metadata-raw",
            fields=[raw_path, raw_content],
        )
    )

    parsed_fields = [
        mlc.Field(
            id="system-metadata/path",
            name="path",
            data_types=[mlc.DataType.TEXT],
            source=mlc.Source(
                file_set=json_files,
                extract=mlc.Extract(file_property=mlc.FileProperty.fullpath),
            ),
        ),
    ]
    for outer_key, outer_info in validate.SystemMetadata.model_fields.items():
        annotation = outer_info.annotation
        assert annotation is not None
        if isinstance(annotation, type) and issubclass(
            annotation, pydantic.BaseModel
        ):
            # Only one level of nesting is expanded for now.
            parsed_fields.extend(
                make_system_field(f"{outer_key}.{inner_key}", inner_info.annotation)
                for inner_key, inner_info in annotation.model_fields.items()
                if inner_key != "variants"
            )
        elif outer_key != "notes":
            parsed_fields.append(make_system_field(outer_key, annotation))

    metadata.record_sets.append(
        mlc.RecordSet(
            id="system-metadata",
            name="system-metadata",
            fields=parsed_fields,
        )
    )


def insert_corpus_md(
    metadata: mlc.Metadata,
    exemplar_path: str | Path = (
        "systems/nav-to-center/data/temperature_10/metadata.json"
    ),
) -> None:
    """Append the ``corpus-metadata-raw`` and ``corpus-metadata`` record sets.

    ``corpus-metadata-raw`` exposes each file's full path and raw content.
    ``corpus-metadata`` exposes the full path plus one field per key found
    under ``metrics.analysis`` in the exemplar file.  Field names are
    ``metrics.analysis.<key>`` lower-cased with spaces replaced by
    underscores; the JSONPath uses the original key.  The data type is chosen
    from the exemplar's value: ``bool`` -> BOOL, ``int`` -> INTEGER, anything
    else -> FLOAT.

    Args:
        metadata: Croissant metadata object to extend in place.
        exemplar_path: A corpus ``metadata.json`` whose ``metrics.analysis``
            keys define the fields.  Relative paths are resolved against the
            current working directory.

    Raises:
        OSError: If the exemplar file cannot be opened.
        json.JSONDecodeError: If the exemplar file is not valid JSON.
        KeyError: If the exemplar lacks ``metrics`` / ``analysis``.
    """
    # Read the exemplar before touching ``metadata`` so that a missing or
    # malformed file does not leave it half-populated.
    with open(exemplar_path, encoding="utf-8") as fo:
        exemplar_data = json.load(fo)
    analysis = exemplar_data["metrics"]["analysis"]

    # NOTE: the raw path is not linked to "corpus-metadata/path" through
    # ``references`` because references always seem to cause conflicting read
    # method errors.
    metadata.record_sets.append(
        mlc.RecordSet(
            id="corpus-metadata-raw",
            name="corpus-metadata-raw",
            fields=[
                mlc.Field(
                    id="corpus-metadata-raw/path",
                    name="path",
                    data_types=[mlc.DataType.TEXT],
                    source=mlc.Source(
                        file_set="corpus-metadata-files-raw",
                        extract=mlc.Extract(file_property=mlc.FileProperty.fullpath),
                    ),
                ),
                mlc.Field(
                    id="corpus-metadata-raw/raw",
                    name="raw",
                    data_types=[mlc.DataType.TEXT],
                    source=mlc.Source(
                        file_set="corpus-metadata-files-raw",
                        extract=mlc.Extract(file_property=mlc.FileProperty.content),
                    ),
                ),
            ],
        )
    )

    fields = [
        mlc.Field(
            id="corpus-metadata/path",
            name="path",
            data_types=[mlc.DataType.TEXT],
            source=mlc.Source(
                file_set="corpus-metadata-files",
                extract=mlc.Extract(file_property=mlc.FileProperty.fullpath),
            ),
        ),
    ]
    for k, v in analysis.items():
        name = f"metrics.analysis.{k}".replace(" ", "_").lower()
        # ``bool`` is a subclass of ``int``, so it has to be tested first;
        # otherwise True/False values would be classified as INTEGER.
        if isinstance(v, bool):
            typ = mlc.DataType.BOOL
        elif isinstance(v, int):
            typ = mlc.DataType.INTEGER
        else:
            typ = mlc.DataType.FLOAT
        jp = f'$.metrics.analysis["{k}"]'
        fields.append(
            mlc.Field(
                id=f"corpus-metadata/{name}",
                name=name,
                data_types=[typ],
                source=mlc.Source(
                    file_set="corpus-metadata-files",
                    extract=mlc.Extract(json_path=jp),
                ),
            )
        )

    metadata.record_sets.append(
        mlc.RecordSet(
            id="corpus-metadata",
            name="corpus-metadata",
            fields=fields,
        )
    )


@functools.cache
def get_metadata() -> mlc.Metadata:
    """Return the module-level metadata with all record sets inserted.

    The system-metadata, corpus-metadata and per-corpus inserters run in that
    order against the shared ``_metadata`` object.  The result is cached, so
    after one successful call the inserters are not run again.
    """
    populate_steps = (insert_system_md, insert_corpus_md, insert_corpora)
    for populate in populate_steps:
        populate(_metadata)
    return _metadata


def save_metadata() -> None:
    """Write the Croissant description to ``croissant.json``.

    The file is created in the current working directory.  Before writing,
    whatever ``metadata.issues.report()`` returns is printed to stdout.
    """
    metadata = get_metadata()
    print(metadata.issues.report())

    # Serialise before opening the output file, so a failure here does not
    # leave a truncated croissant.json behind.
    d = metadata.to_json()
    published = d["datePublished"]
    # ``json.dump`` cannot encode date objects.  Emit ISO 8601 (with the "T"
    # separator) for them; any other value is stringified as before.
    if isinstance(published, datetime.date):
        d["datePublished"] = published.isoformat()
    else:
        d["datePublished"] = str(published)

    with open("croissant.json", "w", encoding="utf-8") as fo:
        json.dump(d, fo, indent=2)


def test() -> None:
    """Load ``croissant.json`` and print the first records of one record set.

    Manual smoke check: reads the file from the current working directory and
    prints ``index record`` pairs for indices 0 through 11 (fewer if the
    record set is shorter).
    """
    dataset = mlc.Dataset(jsonld="croissant.json")
    # Other record sets that have been inspected here by swapping the name:
    #   "babyai-sr/GoToObj", "system-metadata", "system-metadata-raw",
    #   "corpus-metadata"
    records = dataset.records(record_set="corpus-metadata-raw")

    last_index = 11
    for index, record in enumerate(records):
        print(index, record)
        if index >= last_index:
            break


if __name__ == "__main__":
    # Running this module as a script regenerates croissant.json.
    save_metadata()
    # Uncomment to print a few records from the generated file:
    # test()
