

import torch
from megatron.core import parallel_state as mpu

from .sequence_parallel import pad_to_sequence_parallel

def compute_transformers_input_shapes(batches, meta_info):
    from flash_attn.bert_padding import unpad_input

    input_shapes = []
    for model_inputs in batches:
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs["attention_mask"]
        input_ids_rmpad = unpad_input(input_ids.unsqueeze(dim=-1), attention_mask)[0]
        if meta_info["sequence_parallel"]:
            input_ids_rmpad = pad_to_sequence_parallel(input_ids_rmpad)

            input_shapes.append(
                torch.Size(
                    [
                        input_ids_rmpad.shape[0] // mpu.get_tensor_model_parallel_world_size(),
                        1,
                        meta_info["hidden_size"],
                    ]
                )
            )
        else:

            input_shapes.append(torch.Size([input_ids_rmpad.shape[0], 1, meta_info["hidden_size"]]))
    return input_shapes

def make_batch_generator(batches, vpp_size):
    if vpp_size > 1:

        batch_generator = [batches] * vpp_size
        batch_generator = [iter(b) for b in batch_generator]
    else:

        batch_generator = iter(batches)
    return batch_generator
