import torch

def print_gpu_utilization():
    """Print and return the memory used on every device visible to ROCm SMI.

    ``pyrsmi`` is imported lazily so that importing this module does not
    require it to be installed.

    Returns:
        A list with one entry per device, holding the raw value reported by
        ``rocml.smi_get_device_memory_used`` (treated as bytes here: it is
        divided by ``1024**2`` only for the printed "MB" figure).
    """
    from pyrsmi import rocml

    rocml.smi_initialize()
    try:
        ndevices = rocml.smi_get_device_count()
        used = [rocml.smi_get_device_memory_used(i) for i in range(ndevices)]
    finally:
        # Always release the SMI library, even if a device query raised.
        rocml.smi_shutdown()

    # Each entry keeps its trailing space so the output format is unchanged.
    per_device = "".join(
        f"[{i}]:{amount//1024**2} " for i, amount in enumerate(used)
    )
    print("GPU memory used (MB): " + per_device)
    return used


def print_cuda_memory_utilization(rank=0):
    """Print the memory currently allocated by PyTorch on one CUDA device.

    Args:
        rank: Device passed straight to ``torch.cuda.memory_allocated``;
            defaults to device 0.
    """
    # The allocator figure is floor-divided by 1024**2 for the "MB" display.
    allocated_mb = torch.cuda.memory_allocated(rank) // 1024**2
    print(f"CUDA[{rank}] memory allocated: {allocated_mb} MB.")
