from math import prod
from pathlib import Path
from typing import Dict, List, Optional, Union

import metatensor.torch
import torch
from metatensor.torch import Labels, TensorBlock, TensorMap
from metatensor.torch.atomistic import (
    MetatensorAtomisticModel,
    ModelCapabilities,
    ModelMetadata,
    ModelOutput,
    NeighborListOptions,
    System,
)
import copy
import ase.units

from ...utils.additive import ZBL, CompositionModel
from ...utils.data import DatasetInfo, TargetInfo
from ...utils.data.target_info import get_energy_target_info
from ...utils.dtype import dtype_to_str
from ...utils.long_range import DummyLongRangeFeaturizer, LongRangeFeaturizer
from ...utils.metadata import append_metadata_references
from ...utils.scaler import Scaler
from .modules.encoder import Encoder, NodeEncoder
from .modules.nef import (
    edge_array_to_nef,
    get_corresponding_edges,
    get_nef_indices,
    nef_array_to_edges,
)
from .modules.radial_mask import get_radial_mask
from .modules.structures import concatenate_structures
from .modules.transformer import Transformer
from .modules.baseline import Baseline


class NanoPET(torch.nn.Module):
    """
    Re-implementation of the PET architecture (https://arxiv.org/pdf/2305.19302).

    The positions and atomic species are encoded into a high-dimensional space
    using a simple encoder. The resulting features (in NEF, or Node-Edge-Feature
    format*) are then processed by a series of transformer layers. This process is
    repeated for a number of message-passing layers, where features are exchanged
    between corresponding edges (ij and ji). The final representation is used to
    predict atomic properties through decoders named "heads".

    * NEF format: a three-dimensional tensor where the first dimension corresponds
    to the nodes, the second to the edges corresponding to the neighbors of the
    node (padded as different nodes might have different numbers of edges),
    and the third to the features.
    """

    __supported_devices__ = ["cuda", "cpu"]
    __supported_dtypes__ = [torch.float64, torch.float32]
    __default_metadata__ = ModelMetadata(
        references={"architecture": ["https://arxiv.org/abs/2305.19302v3"]}
    )

    component_labels: Dict[str, List[List[Labels]]]

    def __init__(self, model_hypers: Dict, dataset_info: DatasetInfo) -> None:
        super().__init__()
        # checks on targets inside the RotationalAugmenter class in the trainer

        self.hypers = model_hypers
        self.dataset_info = dataset_info
        self.new_outputs = list(dataset_info.targets.keys())
        self.atomic_types = dataset_info.atomic_types

        self.requested_nl = NeighborListOptions(
            cutoff=self.hypers["cutoff"],
            full_list=True,
            strict=True,
        )

        self.cutoff = float(self.hypers["cutoff"])
        self.cutoff_width = float(self.hypers["cutoff_width"])

        self.node_encoder = NodeEncoder(len(self.atomic_types), self.hypers["d_pet"])
        self.edge_encoder = Encoder(len(self.atomic_types), self.hypers["d_pet"])

        self.transformer = Transformer(
            self.hypers["d_pet"],
            4 * self.hypers["d_pet"],
            self.hypers["num_heads"],
            self.hypers["num_attention_layers"],
            0.0,  # MLP dropout rate
            0.0,  # attention dropout rate
        )
        # empirically, the model seems to perform better without dropout

        self.num_mp_layers = self.hypers["num_gnn_layers"] - 1
        gnn_contractions = []
        gnn_transformers = []
        for _ in range(self.num_mp_layers):
            gnn_contractions.append(
                torch.nn.Linear(
                    2 * self.hypers["d_pet"], self.hypers["d_pet"], bias=False
                )
            )
            gnn_transformers.append(
                Transformer(
                    self.hypers["d_pet"],
                    4 * self.hypers["d_pet"],
                    self.hypers["num_heads"],
                    self.hypers["num_attention_layers"],
                    0.0,  # MLP dropout rate
                    0.0,  # attention dropout rate
                )
            )
        self.gnn_contractions = torch.nn.ModuleList(gnn_contractions)
        self.gnn_transformers = torch.nn.ModuleList(gnn_transformers)

        self.last_layer_feature_size = self.hypers["d_pet"]

        self.outputs = {
            "features": ModelOutput(unit="", per_atom=True)
        }  # the model is always capable of outputting the internal features

        self.heads = torch.nn.ModuleDict()
        self.head_types = self.hypers["heads"]
        self.last_layers = torch.nn.ModuleDict()
        self.output_shapes: Dict[str, Dict[str, List[int]]] = {}
        self.key_labels: Dict[str, Labels] = {}
        self.component_labels: Dict[str, List[List[Labels]]] = {}
        self.property_labels: Dict[str, List[Labels]] = {}
        for target_name, target_info in dataset_info.targets.items():
            self._add_output(target_name, target_info)
            # self.outputs[target_name] = ModelOutput(
            #     quantity=target_info.quantity,
            #     unit=target_info.unit,
            #     per_atom=True,
            # )
        # self._add_output("mtt::delta_q", dataset_info.targets[[k for k in dataset_info.targets.keys() if "_q" in k][0]])
        # self._add_output("mtt::p", dataset_info.targets[[k for k in dataset_info.targets.keys() if "p_" in k][0]])

        self.register_buffer(
            "species_to_species_index",
            torch.full(
                (max(self.atomic_types) + 1,),
                -1,
            ),
        )
        for i, species in enumerate(self.atomic_types):
            self.species_to_species_index[species] = i

        # long-range module
        if self.hypers["long_range"]["enable"]:
            self.long_range = True
            self.long_range_featurizer = LongRangeFeaturizer(
                self.hypers["long_range"],
                feature_dim=self.hypers["d_pet"],
                neighbor_list_options=self.requested_nl,
            )
        else:
            self.long_range = False
            self.long_range_featurizer = DummyLongRangeFeaturizer()  # for torchscript

        # additive models: these are handled by the trainer at training
        # time, and they are added to the output at evaluation time
        composition_model = CompositionModel(
            model_hypers={},
            dataset_info=DatasetInfo(
                length_unit=dataset_info.length_unit,
                atomic_types=self.atomic_types,
                targets={
                    target_name: target_info
                    for target_name, target_info in dataset_info.targets.items()
                    if CompositionModel.is_valid_target(target_name, target_info)
                },
            ),
        )
        additive_models = [composition_model]
        if self.hypers["zbl"]:
            additive_models.append(
                ZBL(
                    {},
                    dataset_info=DatasetInfo(
                        length_unit=dataset_info.length_unit,
                        atomic_types=self.atomic_types,
                        targets={
                            target_name: target_info
                            for target_name, target_info in dataset_info.targets.items()
                            if ZBL.is_valid_target(target_name, target_info)
                        },
                    ),
                )
            )
        # additive_models.append(Baseline({}, dataset_info))
        self.additive_models = torch.nn.ModuleList(additive_models)

        # scaler: this is also handled by the trainer at training time
        dataset_info_for_scaler = copy.deepcopy(dataset_info)
        n_time_steps = int([tk for tk in dataset_info_for_scaler.targets.keys() if "mtt::delta_" in tk and "_q" in tk][0].split("_")[1])
        dataset_info_for_scaler.targets[f"mtt::energy_{n_time_steps}"] = get_energy_target_info({"unit": "eV", "per_atom": False})
        self.scaler = Scaler(model_hypers={}, dataset_info=dataset_info_for_scaler)

        self.single_label = Labels.single()

        self.node_readout = torch.nn.Sequential(
            torch.nn.Linear(self.hypers["d_pet"], 4 * self.hypers["d_pet"]),
            torch.nn.SiLU(),
            torch.nn.Linear(4 * self.hypers["d_pet"], self.hypers["d_pet"]),
            torch.nn.SiLU(),
        )

        self.edge_readout = torch.nn.Sequential(
            torch.nn.Linear(self.hypers["d_pet"], 4 * self.hypers["d_pet"]),
            torch.nn.SiLU(),
            torch.nn.Linear(4 * self.hypers["d_pet"], self.hypers["d_pet"]),
            torch.nn.SiLU(),
        )

        if self.hypers["base_time_step"] is None:
            raise ValueError("`base_time_step` must be set in the model hypers")
        self.register_buffer("base_time_step", torch.tensor(float(self.hypers["base_time_step"])))

    def restart(self, dataset_info: DatasetInfo) -> "NanoPET":
        # merge old and new dataset info
        merged_info = self.dataset_info.union(dataset_info)
        new_atomic_types = [
            at for at in merged_info.atomic_types if at not in self.atomic_types
        ]
        new_targets = {
            key: value
            for key, value in merged_info.targets.items()
            if key not in self.dataset_info.targets
        }
        self.has_new_targets = len(new_targets) > 0

        if len(new_atomic_types) > 0:
            raise ValueError(
                f"New atomic types found in the dataset: {new_atomic_types}. "
                "The nanoPET model does not support adding new atomic types."
            )

        # register new outputs as new last layers
        for target_name, target in new_targets.items():
            self._add_output(target_name, target)

        self.dataset_info = merged_info

        # restart the composition and scaler models
        self.additive_models[0].restart(
            dataset_info=DatasetInfo(
                length_unit=dataset_info.length_unit,
                atomic_types=self.atomic_types,
                targets={
                    target_name: target_info
                    for target_name, target_info in dataset_info.targets.items()
                    if CompositionModel.is_valid_target(target_name, target_info)
                },
            ),
        )
        self.scaler.restart(dataset_info)

        return self

    def forward(
        self,
        systems: List[System],
        outputs: Dict[str, ModelOutput],
        selected_atoms: Optional[Labels] = None,
    ) -> Dict[str, TensorMap]:
        # Checks on systems (species) and outputs are done in the
        # MetatensorAtomisticModel wrapper

        device = systems[0].device

        if self.single_label.values.device != device:
            self.single_label = self.single_label.to(device)
            self.key_labels = {
                output_name: label.to(device)
                for output_name, label in self.key_labels.items()
            }
            self.component_labels = {
                output_name: [
                    [labels.to(device) for labels in components_block]
                    for components_block in components_tmap
                ]
                for output_name, components_tmap in self.component_labels.items()
            }
            self.property_labels = {
                output_name: [labels.to(device) for labels in properties_tmap]
                for output_name, properties_tmap in self.property_labels.items()
            }

        system_indices = torch.concatenate(
            [
                torch.full(
                    (len(system),),
                    i_system,
                    device=device,
                )
                for i_system, system in enumerate(systems)
            ],
        )

        sample_values = torch.stack(
            [
                system_indices,
                torch.concatenate(
                    [
                        torch.arange(
                            len(system),
                            device=device,
                        )
                        for system in systems
                    ],
                ),
            ],
            dim=1,
        )
        sample_labels = Labels(
            names=["system", "atom"],
            values=sample_values,
        )

        (
            positions,
            centers,
            neighbors,
            species,
            cells,
            cell_shifts,
        ) = concatenate_structures(systems, self.requested_nl)

        # somehow the backward of this operation is very slow at evaluation,
        # where there is only one cell, therefore we simplify the calculation
        # for that case
        if len(cells) == 1:
            cell_contributions = cell_shifts.to(cells.dtype) @ cells[0]
        else:
            cell_contributions = torch.einsum(
                "ab, abc -> ac",
                cell_shifts.to(cells.dtype),
                cells[system_indices[centers]],
            )

        edge_vectors = positions[neighbors] - positions[centers] + cell_contributions

        bincount = torch.bincount(centers)
        if bincount.numel() == 0:  # no edges
            max_edges_per_node = 0
        else:
            max_edges_per_node = int(torch.max(bincount))

        # Convert to NEF (Node-Edge-Feature) format:
        nef_indices, nef_to_edges_neighbor, nef_mask = get_nef_indices(
            centers, len(positions), max_edges_per_node
        )

        # Get radial mask
        r = torch.sqrt(torch.sum(edge_vectors**2, dim=-1))
        radial_mask = get_radial_mask(r, self.cutoff, self.cutoff - self.cutoff_width)

        # Element indices
        element_indices_nodes = self.species_to_species_index[species]
        element_indices_centers = element_indices_nodes[centers]
        element_indices_neighbors = element_indices_nodes[neighbors]

        # Send everything to NEF:
        edge_vectors = edge_array_to_nef(edge_vectors, nef_indices)
        radial_mask = edge_array_to_nef(
            radial_mask, nef_indices, nef_mask, fill_value=0.0
        )
        element_indices_centers = edge_array_to_nef(
            element_indices_centers, nef_indices
        )
        element_indices_neighbors = edge_array_to_nef(
            element_indices_neighbors, nef_indices
        )

        if "momenta" in systems[0].known_data():
            momenta = torch.concatenate(
                [system.get_data("momenta").block().values.squeeze(-1) for system in systems]
            )
        else:
            raise ValueError("Didn't find momenta :(")
            momenta = [torch.zeros_like(system.positions) for system in systems]

        # if "time_lag" in systems[0].known_data():
        #     time_lag = torch.concatenate(
        #         [system.get_data("time_lag").block().values for system in systems]
        #     )
        #     assert torch.max(time_lag - time_lag[0]) < 1e-6  # all the same
        #     time_lag_int = int(time_lag[0])
        # else:
        #     raise ValueError("Didn't find time_lag :(")
        # time_lag_edge = time_lag[sample_labels.column("system")][neighbors]
        # time_lag_edge = edge_array_to_nef(time_lag_edge, nef_indices)

        # Encode
        node_features = self.node_encoder(element_indices_nodes, momenta)
        edge_features = self.edge_encoder(
            {
                "cartesian": torch.concatenate(
                    [
                        edge_vectors,
                        torch.sqrt(torch.sum(edge_vectors**2, keepdim=True, dim=-1)),
                    ],
                    dim=-1,
                ),
                "neighbor": element_indices_neighbors,
            }
        )
        features = torch.cat([node_features.unsqueeze(1), edge_features], dim=1)

        # Transformer
        radial_mask = torch.cat(
            [
                torch.ones(
                    (len(node_features), 1),
                    device=radial_mask.device,
                    dtype=radial_mask.dtype,
                ),
                radial_mask,
            ],
            dim=-1,
        )
        features = self.transformer(features, radial_mask)

        node_features, edge_features = features[:, 0], features[:, 1:]

        # GNN
        if self.num_mp_layers > 0:
            corresponding_edges = get_corresponding_edges(
                torch.concatenate(
                    [centers.unsqueeze(-1), neighbors.unsqueeze(-1), cell_shifts],
                    dim=-1,
                )
            )
            for contraction, transformer in zip(
                self.gnn_contractions, self.gnn_transformers
            ):
                new_edge_features = nef_array_to_edges(
                    edge_features, centers, nef_to_edges_neighbor
                )
                corresponding_new_edge_features = new_edge_features[corresponding_edges]
                new_edge_features = torch.concatenate(
                    [new_edge_features, corresponding_new_edge_features], dim=-1
                )
                new_edge_features = contraction(new_edge_features)
                new_edge_features = edge_array_to_nef(new_edge_features, nef_indices)
                new_features = torch.cat(
                    [node_features.unsqueeze(1), new_edge_features], dim=1
                )
                new_features = transformer(new_features, radial_mask)
                new_node_features, new_edge_features = (
                    new_features[:, 0],
                    new_features[:, 1:],
                )
                node_features = (node_features + new_node_features) * 0.5**0.5
                edge_features = (edge_features + new_edge_features) * 0.5**0.5

        edge_features = features * radial_mask[:, :, None]
        node_features = torch.sum(
            self.edge_readout(edge_features), dim=1
        ) + self.node_readout(node_features)

        return_dict: Dict[str, TensorMap] = {}

        # output the hidden features, if requested:
        if "features" in outputs:
            feature_tmap = TensorMap(
                keys=self.single_label,
                blocks=[
                    TensorBlock(
                        values=node_features,
                        samples=sample_labels,
                        components=[],
                        properties=Labels(
                            names=["properties"],
                            values=torch.arange(
                                node_features.shape[-1], device=node_features.device
                            ).reshape(-1, 1),
                        ),
                    )
                ],
            )
            features_options = outputs["features"]
            if features_options.per_atom:
                return_dict["features"] = feature_tmap
            else:
                return_dict["features"] = metatensor.torch.sum_over_samples(
                    feature_tmap, ["atom"]
                )

        atomic_features_dict: Dict[str, torch.Tensor] = {}
        for output_name, head in self.heads.items():
            atomic_features_dict[output_name] = head(node_features)

        # output the last-layer features for the outputs, if requested:
        for output_name in outputs.keys():
            if not (
                output_name.startswith("mtt::aux::")
                and output_name.endswith("_last_layer_features")
            ):
                continue
            base_name = output_name.replace("mtt::aux::", "").replace(
                "_last_layer_features", ""
            )
            # the corresponding output could be base_name or mtt::base_name
            if (
                f"mtt::{base_name}" not in atomic_features_dict
                and base_name not in atomic_features_dict
            ):
                raise ValueError(
                    f"Features {output_name} can only be requested "
                    f"if the corresponding output {base_name} is also requested."
                )
            if f"mtt::{base_name}" in atomic_features_dict:
                base_name = f"mtt::{base_name}"
            last_layer_feature_tmap = TensorMap(
                keys=self.single_label,
                blocks=[
                    TensorBlock(
                        values=atomic_features_dict[base_name],
                        samples=sample_labels,
                        components=[],
                        properties=Labels(
                            names=["properties"],
                            values=torch.arange(
                                atomic_features_dict[base_name].shape[-1],
                                device=atomic_features_dict[base_name].device,
                            ).reshape(-1, 1),
                        ),
                    )
                ],
            )
            last_layer_features_options = outputs[output_name]
            if last_layer_features_options.per_atom:
                return_dict[output_name] = last_layer_feature_tmap
            else:
                return_dict[output_name] = metatensor.torch.sum_over_samples(
                    last_layer_feature_tmap, ["atom"]
                )

        masses = torch.concatenate(
            [system.get_data("masses").block().values for system in systems]
        )  # [N, 1]

        atomic_properties_tmap_dict: Dict[str, TensorMap] = {}
        for output_name, last_layer in self.last_layers.items():
            if output_name in outputs:
                atomic_features = atomic_features_dict[output_name]
                atomic_properties_by_block = []
                atomic_uncertainties_by_block = []
                for last_layer_by_block in last_layer.values():
                    last_layer_features = last_layer_by_block(atomic_features)
                    atomic_properties_by_block.append(
                        last_layer_features[:, :-1]
                    )
                    atomic_uncertainties_by_block.append(last_layer_features[:, -1:])

                if output_name.startswith("mtt::p_"):  # conservation of momentum
                    scaled_momenta_new = atomic_properties_by_block[0].reshape(-1, 3)
                    scaled_momenta_new = scaled_momenta_new * self.scaler.get_scales_dict()[output_name]

                    split_sizes = [len(system) for system in systems]
                    scaled_momenta_new_by_structure = torch.split(scaled_momenta_new, split_sizes)
                    momenta_old_by_structure = torch.split(momenta.reshape(-1, 3), split_sizes)
                    masses_by_structure = torch.split(masses, split_sizes)

                    scaled_momenta_new_list = []
                    for scaled_momenta_new_s, momenta_old_s, masses_s in zip(
                        scaled_momenta_new_by_structure,
                        momenta_old_by_structure,
                        masses_by_structure,
                    ):
                        velocities_new_s = scaled_momenta_new_s / torch.sqrt(masses_s)
                        velocities_old_s = momenta_old_s / masses_s
                        M_s = torch.sum(masses_s)
                        velocities_com_new_s = torch.sum(masses_s * velocities_new_s, dim=0) / M_s
                        velocities_com_old_s = torch.sum(masses_s * velocities_old_s, dim=0) / M_s
                        velocities_new_s = velocities_new_s - velocities_com_new_s + velocities_com_old_s 
                        scaled_momenta_new_s = velocities_new_s * torch.sqrt(masses_s)
                        scaled_momenta_new_list.append(scaled_momenta_new_s)
                    scaled_momenta_new = torch.cat(scaled_momenta_new_list, dim=0)

                    scaled_momenta_new = scaled_momenta_new / self.scaler.get_scales_dict()[output_name]
                    atomic_properties_by_block[0] = scaled_momenta_new

                if output_name.startswith("mtt::delta_") and output_name.endswith("_q"):  # uniform linear motion
                    delta_t = int(output_name.split("_")[1]) * self.base_time_step * ase.units.fs
                    scaled_delta_q = atomic_properties_by_block[0].reshape(-1, 3)
                    scaled_delta_q = scaled_delta_q * self.scaler.get_scales_dict()[output_name]

                    split_sizes = [len(system) for system in systems]
                    scaled_delta_q_by_structure = torch.split(scaled_delta_q, split_sizes)
                    momenta_old_by_structure = torch.split(momenta.reshape(-1, 3), split_sizes)
                    masses_by_structure = torch.split(masses, split_sizes)

                    scaled_delta_q_list = []
                    for scaled_delta_q_s, momenta_old_s, masses_s in zip(
                        scaled_delta_q_by_structure,
                        momenta_old_by_structure,
                        masses_by_structure,
                    ):
                        velocities_old_s = momenta_old_s / masses_s
                        M_s = torch.sum(masses_s)
                        velocities_com_old_s = torch.sum(masses_s * velocities_old_s, dim=0) / M_s
                        delta_q_s = scaled_delta_q_s / torch.sqrt(masses_s)
                        delta_q_com_s = torch.sum(masses_s * delta_q_s, dim=0) / M_s
                        delta_q_s = delta_q_s - delta_q_com_s + velocities_com_old_s * delta_t
                        scaled_delta_q_s = delta_q_s * torch.sqrt(masses_s)
                        scaled_delta_q_list.append(scaled_delta_q_s)
                    scaled_delta_q = torch.cat(scaled_delta_q_list, dim=0)

                    scaled_delta_q = scaled_delta_q / self.scaler.get_scales_dict()[output_name]
                    atomic_properties_by_block[0] = scaled_delta_q

                blocks = [
                    TensorBlock(
                        values=atomic_property.reshape([-1] + shape),
                        samples=sample_labels,
                        components=components,
                        properties=properties,
                    )
                    for atomic_property, shape, components, properties in zip(
                        atomic_properties_by_block,
                        self.output_shapes[output_name].values(),
                        self.component_labels[output_name],
                        self.property_labels[output_name],
                    )
                ]
                atomic_properties_tmap_dict[output_name] = TensorMap(
                    keys=self.key_labels[output_name],
                    blocks=blocks,
                )

                atomic_uncertaintiy_blocks = [
                    TensorBlock(
                        values=torch.nn.functional.softplus(atomic_uncertainty).reshape([-1, 1]) + 1e-6,
                        samples=sample_labels,
                        components=[],
                        properties=self.single_label,
                    )
                    for atomic_uncertainty in atomic_uncertainties_by_block
                ]
                atomic_properties_tmap_dict[f"mtt::aleatoric_{output_name.split('mtt::')[1]}"] = TensorMap(
                    keys=self.key_labels[output_name],
                    blocks=atomic_uncertaintiy_blocks,
                )

        if selected_atoms is not None:
            for output_name, tmap in atomic_properties_tmap_dict.items():
                atomic_properties_tmap_dict[output_name] = metatensor.torch.slice(
                    tmap, axis="samples", selection=selected_atoms
                )

        for output_name, atomic_property in atomic_properties_tmap_dict.items():
            if output_name not in outputs:
                return_dict[output_name] = atomic_property
            else:
                if outputs[output_name].per_atom:
                    return_dict[output_name] = atomic_property
                else:
                    return_dict[output_name] = metatensor.torch.sum_over_samples(
                        atomic_property, ["atom"]
                    )

        if not self.training:
            # at evaluation, we also introduce the scaler and additive contributions
            return_dict = self.scaler(return_dict)
            for additive_model in self.additive_models:
                outputs_for_additive_model: Dict[str, ModelOutput] = {}
                for name, output in outputs.items():
                    if name in additive_model.outputs:
                        outputs_for_additive_model[name] = output
                additive_contributions = additive_model(
                    systems,
                    outputs_for_additive_model,
                    selected_atoms,
                )
                for name in additive_contributions:
                    return_dict[name] = metatensor.torch.add(
                        return_dict[name],
                        additive_contributions[name],
                    )

        return return_dict

    def requested_neighbor_lists(
        self,
    ) -> List[NeighborListOptions]:
        return [self.requested_nl]

    @classmethod
    def load_checkpoint(cls, path: Union[str, Path]) -> "NanoPET":
        # Load the checkpoint
        checkpoint = torch.load(path, weights_only=False, map_location="cpu")
        model_data = checkpoint["model_data"]
        model_state_dict = checkpoint["model_state_dict"]

        # Create the model
        model = cls(**model_data)
        state_dict_iter = iter(model_state_dict.values())
        next(state_dict_iter)  # skip `species_to_species_index` buffer (int)
        dtype = next(state_dict_iter).dtype
        model.to(dtype).load_state_dict(model_state_dict)

        return model

    def export(
        self, metadata: Optional[ModelMetadata] = None
    ) -> MetatensorAtomisticModel:
        dtype = next(self.parameters()).dtype
        if dtype not in self.__supported_dtypes__:
            raise ValueError(f"unsupported dtype {dtype} for NanoPET")

        # Make sure the model is all in the same dtype
        # For example, after training, the additive models could still be in
        # float64
        self.to(dtype)

        # Additionally, the composition model contains some `TensorMap`s that cannot
        # be registered correctly with Pytorch. This funciton moves them:
        self.additive_models[0]._move_weights_to_device_and_dtype(
            torch.device("cpu"), torch.float64
        )

        interaction_ranges = [self.hypers["num_gnn_layers"] * self.hypers["cutoff"]]
        for additive_model in self.additive_models:
            if hasattr(additive_model, "cutoff_radius"):
                interaction_ranges.append(additive_model.cutoff_radius)
            if self.long_range:
                interaction_ranges.append(torch.inf)
        interaction_range = max(interaction_ranges)

        capabilities = ModelCapabilities(
            outputs=self.outputs,
            atomic_types=self.atomic_types,
            interaction_range=interaction_range,
            length_unit=self.dataset_info.length_unit,
            supported_devices=self.__supported_devices__,
            dtype=dtype_to_str(dtype),
        )

        if metadata is None:
            metadata = ModelMetadata()

        append_metadata_references(metadata, self.__default_metadata__)

        return MetatensorAtomisticModel(self.eval(), metadata, capabilities)

    def _add_output(self, target_name: str, target_info: TargetInfo) -> None:
        # one output shape for each tensor block, grouped by target (i.e. tensormap)
        self.output_shapes[target_name] = {}
        for key, block in target_info.layout.items():
            dict_key = target_name
            for n, k in zip(key.names, key.values):
                dict_key += f"_{n}_{int(k)}"
            self.output_shapes[target_name][dict_key] = [
                len(comp.values) for comp in block.components
            ] + [len(block.properties.values)]

        self.outputs[target_name] = ModelOutput(
            quantity=target_info.quantity,
            unit=target_info.unit,
            per_atom=True,
        )
        if (
            target_name not in self.head_types  # default to MLP
            or self.head_types[target_name] == "mlp"
        ):
            self.heads[target_name] = torch.nn.Sequential(
                torch.nn.Linear(
                    self.hypers["d_pet"], 4 * self.hypers["d_pet"], bias=False
                ),
                torch.nn.SiLU(),
                torch.nn.Linear(
                    4 * self.hypers["d_pet"], self.hypers["d_pet"], bias=False
                ),
                torch.nn.SiLU(),
            )
        elif self.head_types[target_name] == "linear":
            self.heads[target_name] = torch.nn.Sequential()
        else:
            raise ValueError(
                f"Unsupported head type {self.head_types[target_name]} "
                f"for target {target_name}"
            )

        ll_features_name = (
            f"mtt::aux::{target_name.replace('mtt::', '')}_last_layer_features"
        )
        self.outputs[ll_features_name] = ModelOutput(per_atom=True)

        self.last_layers[target_name] = torch.nn.ModuleDict(
            {
                key: torch.nn.Linear(
                    self.hypers["d_pet"],
                    prod(shape) + 1,  # +1 for the variance
                    bias=False,
                )
                for key, shape in self.output_shapes[target_name].items()
            }
        )

        self.key_labels[target_name] = target_info.layout.keys
        self.component_labels[target_name] = [
            block.components for block in target_info.layout.blocks()
        ]
        self.property_labels[target_name] = [
            block.properties for block in target_info.layout.blocks()
        ]


def manual_prod(shape: List[int]) -> int:
    # prod from standard library not supported in torchscript
    result = 1
    for dim in shape:
        result *= dim
    return result
