{"lxmert": {"language_embedding": ["Embedding-1", "Embedding-2", "Embedding-3", "LayerNorm-1"], "vision_object_encoder": ["Linear-1", "LayerNorm-2", "Linear-2", "LayerNorm-3"], "language_encoder": ["Linear-3", "Linear-4", "Linear-5", "LxmertAttention-1-1", "Linear-6", "LayerNorm-4", "Linear-7", "GELUActivation-1", "Linear-8", "LayerNorm-5", "Linear-9", "Linear-10", "Linear-11", "LxmertAttention-3-1", "Linear-12", "LayerNorm-6", "Linear-13", "GELUActivation-25", "Linear-14", "LayerNorm-7", "Linear-15", "Linear-16", "Linear-17", "LxmertAttention-5-1", "Linear-18", "LayerNorm-8", "Linear-19", "GELUActivation-49", "Linear-20", "LayerNorm-9", "Linear-21", "Linear-22", "Linear-23", "LxmertAttention-7-1", "Linear-24", "LayerNorm-10", "Linear-25", "GELUActivation-73", "Linear-26", "LayerNorm-11", "Linear-27", "Linear-28", "Linear-29", "LxmertAttention-9-1", "Linear-30", "LayerNorm-12", "Linear-31", "GELUActivation-97", "Linear-32", "LayerNorm-13", "Linear-33", "Linear-34", "Linear-35", "LxmertAttention-11-1", "Linear-36", "LayerNorm-14", "Linear-37", "GELUActivation-121", "Linear-38", "LayerNorm-15", "Linear-39", "Linear-40", "Linear-41", "LxmertAttention-13-1", "Linear-42", "LayerNorm-16", "Linear-43", "GELUActivation-145", "Linear-44", "LayerNorm-17", "Linear-45", "Linear-46", "Linear-47", "LxmertAttention-15-1", "Linear-48", "LayerNorm-18", "Linear-49", "GELUActivation-169", "Linear-50", "LayerNorm-19", "Linear-51", "Linear-52", "Linear-53", "LxmertAttention-17-1", "Linear-54", "LayerNorm-20", "Linear-55", "GELUActivation-193", "Linear-56", "LayerNorm-21"], "vision_encoder": ["Linear-57", "Linear-58", "Linear-59", "LxmertAttention-19-1", "Linear-60", "LayerNorm-22", "Linear-61", "GELUActivation-217", "Linear-62", "LayerNorm-23", "Linear-63", "Linear-64", "Linear-65", "LxmertAttention-21-1", "Linear-66", "LayerNorm-24", "Linear-67", "GELUActivation-241", "Linear-68", "LayerNorm-25", "Linear-69", "Linear-70", "Linear-71", "LxmertAttention-23-1", "Linear-72", 
"LayerNorm-26", "Linear-73", "GELUActivation-265", "Linear-74", "LayerNorm-27", "Linear-75", "Linear-76", "Linear-77", "LxmertAttention-25-1", "Linear-78", "LayerNorm-28", "Linear-79", "GELUActivation-289", "Linear-80", "LayerNorm-29", "Linear-81", "Linear-82", "Linear-83", "LxmertAttention-27-1", "Linear-84", "LayerNorm-30", "Linear-85", "GELUActivation-313", "Linear-86", "LayerNorm-31"], "lang_cross": ["Linear-87", "LxmertAttention-29-1", "Linear-90", "LayerNorm-32", "Linear-92", "Linear-93", "Linear-95", "Linear-96", "Linear-97", "LxmertAttention-33-1", "Linear-98", "LayerNorm-34", "Linear-103", "GELUActivation-337", "Linear-105", "LayerNorm-36", "Linear-107", "LxmertAttention-37-1", "Linear-110", "LayerNorm-38", "Linear-112", "Linear-113", "Linear-115", "Linear-116", "Linear-117", "LxmertAttention-41-1", "Linear-118", "LayerNorm-40", "Linear-123", "GELUActivation-385", "Linear-125", "LayerNorm-42", "Linear-127", "LxmertAttention-45-1", "Linear-130", "LayerNorm-44", "Linear-132", "Linear-133", "Linear-135", "Linear-136", "Linear-137", "LxmertAttention-49-1", "Linear-138", "LayerNorm-46", "Linear-143", "GELUActivation-433", "Linear-145", "LayerNorm-48", "Linear-147", "LxmertAttention-53-1", "Linear-150", "LayerNorm-50", "Linear-152", "Linear-153", "Linear-155", "Linear-156", "Linear-157", "LxmertAttention-57-1", "Linear-158", "LayerNorm-52", "Linear-163", "GELUActivation-481", "Linear-165", "LayerNorm-54", "Linear-167", "LxmertAttention-61-1", "Linear-170", "LayerNorm-56", "Linear-172", "Linear-173", "Linear-175", "Linear-176", "Linear-177", "LxmertAttention-65-1", "Linear-178", "LayerNorm-58", "Linear-183", "GELUActivation-529", "Linear-185", "LayerNorm-60"], "vis_cross": ["Linear-88", "Linear-89", "Linear-91", "LxmertAttention-31-1", "Linear-94", "LayerNorm-33", "Linear-99", "Linear-100", "Linear-101", "LxmertAttention-35-1", "Linear-102", "LayerNorm-35", "Linear-104", "GELUActivation-361", "Linear-106", "LayerNorm-37", "Linear-108", "Linear-109", "Linear-111", 
"LxmertAttention-39-1", "Linear-114", "LayerNorm-39", "Linear-119", "Linear-120", "Linear-121", "LxmertAttention-43-1", "Linear-122", "LayerNorm-41", "Linear-124", "GELUActivation-409", "Linear-126", "LayerNorm-43", "Linear-128", "Linear-129", "Linear-131", "LxmertAttention-47-1", "Linear-134", "LayerNorm-45", "Linear-139", "Linear-140", "Linear-141", "LxmertAttention-51-1", "Linear-142", "LayerNorm-47", "Linear-144", "GELUActivation-457", "Linear-146", "LayerNorm-49", "Linear-148", "Linear-149", "Linear-151", "LxmertAttention-55-1", "Linear-154", "LayerNorm-51", "Linear-159", "Linear-160", "Linear-161", "LxmertAttention-59-1", "Linear-162", "LayerNorm-53", "Linear-164", "GELUActivation-505", "Linear-166", "LayerNorm-55", "Linear-168", "Linear-169", "Linear-171", "LxmertAttention-63-1", "Linear-174", "LayerNorm-57", "Linear-179", "Linear-180", "Linear-181", "LxmertAttention-67-1", "Linear-182", "LayerNorm-59", "Linear-184", "GELUActivation-553", "Linear-186", "LayerNorm-61"], "output": ["Linear-187", "Tanh-1"]}, "visual_bert": {"output": ["Linear-74", "Tanh-1"], "encoder_1": ["Embedding-1", "Embedding-2", "Linear-1", "Embedding-4", "Embedding-5", "LayerNorm-1", "Linear-2", "Linear-3", "Linear-4", "VisualBertSelfAttention-1-1", "Linear-5", "LayerNorm-2", "Linear-6", "GELUActivation-1", "Linear-7", "LayerNorm-3", "Linear-8", "Linear-9", "Linear-10", "VisualBertSelfAttention-2-1", "Linear-11", "LayerNorm-4", "Linear-12", "GELUActivation-13", "Linear-13", "LayerNorm-5", "Linear-14", "Linear-15", "Linear-16", "VisualBertSelfAttention-3-1", "Linear-17", "LayerNorm-6"], "encoder_2": ["Linear-18", "GELUActivation-25", "Linear-19", "LayerNorm-7", "Linear-20", "Linear-21", "Linear-22", "VisualBertSelfAttention-4-1", "Linear-23", "LayerNorm-8", "Linear-24", "GELUActivation-37", "Linear-25", "LayerNorm-9", "Linear-26", "Linear-27", "Linear-28", "VisualBertSelfAttention-5-1", "Linear-29", "LayerNorm-10", "Linear-30", "GELUActivation-49", "Linear-31", "LayerNorm-11", 
"Linear-32", "Linear-33", "Linear-34", "VisualBertSelfAttention-6-1", "Linear-35", "LayerNorm-12", "Linear-36", "GELUActivation-61"], "encoder_3": ["Linear-37", "LayerNorm-13", "Linear-38", "Linear-39", "Linear-40", "VisualBertSelfAttention-7-1", "Linear-41", "LayerNorm-14", "Linear-42", "GELUActivation-73", "Linear-43", "LayerNorm-15", "Linear-44", "Linear-45", "Linear-46", "VisualBertSelfAttention-8-1", "Linear-47", "LayerNorm-16", "Linear-48", "GELUActivation-85", "Linear-49", "LayerNorm-17", "Linear-50", "Linear-51", "Linear-52", "VisualBertSelfAttention-9-1", "Linear-53", "LayerNorm-18", "Linear-54", "GELUActivation-97", "Linear-55"], "encoder_4": ["LayerNorm-19", "Linear-56", "Linear-57", "Linear-58", "VisualBertSelfAttention-10-1", "Linear-59", "LayerNorm-20", "Linear-60", "GELUActivation-109", "Linear-61", "LayerNorm-21", "Linear-62", "Linear-63", "Linear-64", "VisualBertSelfAttention-11-1", "Linear-65", "LayerNorm-22", "Linear-66", "GELUActivation-121", "Linear-67", "LayerNorm-23", "Linear-68", "Linear-69", "Linear-70", "VisualBertSelfAttention-12-1", "Linear-71", "LayerNorm-24", "Linear-72", "GELUActivation-133", "Linear-73", "LayerNorm-25"], "best_layer": ["LayerNorm-25"]}, "openai_clip": {"vision_output": ["LayerNorm-50", "VisionTransformer-1"], "vision_encoder_1": ["Conv2d-1", "LayerNorm-1", "LayerNorm-2", "MultiheadAttention-1-1", "LayerNorm-3", "Linear-1", "QuickGELU-1", "Linear-2", "ResidualAttentionBlock-1", "LayerNorm-4", "MultiheadAttention-2-1", "LayerNorm-5", "Linear-3", "QuickGELU-2", "Linear-4", "ResidualAttentionBlock-2", "LayerNorm-6", "MultiheadAttention-3-1", "LayerNorm-7", "Linear-5", "QuickGELU-3", "Linear-6", "ResidualAttentionBlock-3", "LayerNorm-8", "MultiheadAttention-4-1", "LayerNorm-9", "Linear-7", "QuickGELU-4", "Linear-8", "ResidualAttentionBlock-4"], "vision_encoder_2": ["LayerNorm-10", "MultiheadAttention-5-1", "LayerNorm-11", "Linear-9", "QuickGELU-5", "Linear-10", "ResidualAttentionBlock-5", "LayerNorm-12", 
"MultiheadAttention-6-1", "LayerNorm-13", "Linear-11", "QuickGELU-6", "Linear-12", "ResidualAttentionBlock-6", "LayerNorm-14", "MultiheadAttention-7-1", "LayerNorm-15", "Linear-13", "QuickGELU-7", "Linear-14", "ResidualAttentionBlock-7", "LayerNorm-16", "MultiheadAttention-8-1", "LayerNorm-17", "Linear-15", "QuickGELU-8", "Linear-16", "ResidualAttentionBlock-8"], "vision_encoder_3": ["LayerNorm-18", "MultiheadAttention-9-1", "LayerNorm-19", "Linear-17", "QuickGELU-9", "Linear-18", "ResidualAttentionBlock-9", "LayerNorm-20", "MultiheadAttention-10-1", "LayerNorm-21", "Linear-19", "QuickGELU-10", "Linear-20", "ResidualAttentionBlock-10", "LayerNorm-22", "MultiheadAttention-11-1", "LayerNorm-23", "Linear-21", "QuickGELU-11", "Linear-22", "ResidualAttentionBlock-11", "LayerNorm-24", "MultiheadAttention-12-1", "LayerNorm-25", "Linear-23", "QuickGELU-12", "Linear-24", "ResidualAttentionBlock-12"], "vision_encoder_4": ["LayerNorm-26", "MultiheadAttention-13-1", "LayerNorm-27", "Linear-25", "QuickGELU-13", "Linear-26", "ResidualAttentionBlock-13", "LayerNorm-28", "MultiheadAttention-14-1", "LayerNorm-29", "Linear-27", "QuickGELU-14", "Linear-28", "ResidualAttentionBlock-14", "LayerNorm-30", "MultiheadAttention-15-1", "LayerNorm-31", "Linear-29", "QuickGELU-15", "Linear-30", "ResidualAttentionBlock-15", "LayerNorm-32", "MultiheadAttention-16-1", "LayerNorm-33", "Linear-31", "QuickGELU-16", "Linear-32", "ResidualAttentionBlock-16"], "vision_encoder_5": ["LayerNorm-34", "MultiheadAttention-17-1", "LayerNorm-35", "Linear-33", "QuickGELU-17", "Linear-34", "ResidualAttentionBlock-17", "LayerNorm-36", "MultiheadAttention-18-1", "LayerNorm-37", "Linear-35", "QuickGELU-18", "Linear-36", "ResidualAttentionBlock-18", "LayerNorm-38", "MultiheadAttention-19-1", "LayerNorm-39", "Linear-37", "QuickGELU-19", "Linear-38", "ResidualAttentionBlock-19", "LayerNorm-40", "MultiheadAttention-20-1", "LayerNorm-41", "Linear-39", "QuickGELU-20", "Linear-40", "ResidualAttentionBlock-20"], 
"vision_encoder_6": ["LayerNorm-42", "MultiheadAttention-21-1", "LayerNorm-43", "Linear-41", "QuickGELU-21", "Linear-42", "ResidualAttentionBlock-21", "LayerNorm-44", "MultiheadAttention-22-1", "LayerNorm-45", "Linear-43", "QuickGELU-22", "Linear-44", "ResidualAttentionBlock-22", "LayerNorm-46", "MultiheadAttention-23-1", "LayerNorm-47", "Linear-45", "QuickGELU-23", "Linear-46", "ResidualAttentionBlock-23", "LayerNorm-48", "MultiheadAttention-24-1", "LayerNorm-49", "Linear-47", "QuickGELU-24", "Linear-48", "ResidualAttentionBlock-24"], "language_encoder_1": ["Embedding-1", "LayerNorm-51", "MultiheadAttention-25-1", "LayerNorm-52", "Linear-49", "QuickGELU-25", "Linear-50", "ResidualAttentionBlock-25", "LayerNorm-53", "MultiheadAttention-26-1", "LayerNorm-54", "Linear-51", "QuickGELU-26", "Linear-52", "ResidualAttentionBlock-26", "LayerNorm-55", "MultiheadAttention-27-1", "LayerNorm-56", "Linear-53", "QuickGELU-27", "Linear-54", "ResidualAttentionBlock-27", "LayerNorm-57", "MultiheadAttention-28-1", "LayerNorm-58", "Linear-55", "QuickGELU-28", "Linear-56", "ResidualAttentionBlock-28", "LayerNorm-59", "MultiheadAttention-29-1", "LayerNorm-60", "Linear-57", "QuickGELU-29", "Linear-58", "ResidualAttentionBlock-29", "LayerNorm-61", "MultiheadAttention-30-1", "LayerNorm-62", "Linear-59", "QuickGELU-30", "Linear-60", "ResidualAttentionBlock-30"], "language_encoder_2": ["LayerNorm-63", "MultiheadAttention-31-1", "LayerNorm-64", "Linear-61", "QuickGELU-31", "Linear-62", "ResidualAttentionBlock-31", "LayerNorm-65", "MultiheadAttention-32-1", "LayerNorm-66", "Linear-63", "QuickGELU-32", "Linear-64", "ResidualAttentionBlock-32", "LayerNorm-67", "MultiheadAttention-33-1", "LayerNorm-68", "Linear-65", "QuickGELU-33", "Linear-66", "ResidualAttentionBlock-33", "LayerNorm-69", "MultiheadAttention-34-1", "LayerNorm-70", "Linear-67", "QuickGELU-34", "Linear-68", "ResidualAttentionBlock-34", "LayerNorm-71", "MultiheadAttention-35-1", "LayerNorm-72", "Linear-69", "QuickGELU-35", 
"Linear-70", "ResidualAttentionBlock-35", "LayerNorm-73", "MultiheadAttention-36-1", "LayerNorm-74", "Linear-71", "QuickGELU-36", "Linear-72", "ResidualAttentionBlock-36", "LayerNorm-75"]}, "slip": {"vision_encoder_1": ["Conv2d-1", "Identity-1", "LayerNorm-1", "Linear-1", "Linear-2", "LayerNorm-2", "Linear-3", "GELU-1", "Linear-4", "Block-1", "LayerNorm-3", "Linear-5", "Linear-6", "LayerNorm-4", "Linear-7", "GELU-2", "Linear-8", "Block-2", "LayerNorm-5", "Linear-9", "Linear-10", "LayerNorm-6", "Linear-11", "GELU-3", "Linear-12", "Block-3", "LayerNorm-7"], "vision_encoder_2": ["Linear-13", "Linear-14", "LayerNorm-8", "Linear-15", "GELU-4", "Linear-16", "Block-4", "LayerNorm-9", "Linear-17", "Linear-18", "LayerNorm-10", "Linear-19", "GELU-5", "Linear-20", "Block-5", "LayerNorm-11", "Linear-21", "Linear-22", "LayerNorm-12", "Linear-23", "GELU-6", "Linear-24", "Block-6", "LayerNorm-13"], "vision_encoder_3": ["Linear-25", "Linear-26", "LayerNorm-14", "Linear-27", "GELU-7", "Linear-28", "Block-7", "LayerNorm-15", "Linear-29", "Linear-30", "LayerNorm-16", "Linear-31", "GELU-8", "Linear-32", "Block-8", "LayerNorm-17", "Linear-33", "Linear-34", "LayerNorm-18", "Linear-35", "GELU-9", "Linear-36", "Block-9", "LayerNorm-19"], "vision_encoder_4": ["Linear-37", "Linear-38", "LayerNorm-20", "Linear-39", "GELU-10", "Linear-40", "Block-10", "LayerNorm-21", "Linear-41", "Linear-42", "LayerNorm-22", "Linear-43", "GELU-11", "Linear-44", "Block-11", "LayerNorm-23", "Linear-45", "Linear-46", "LayerNorm-24", "Linear-47", "GELU-12", "Linear-48", "Block-12", "LayerNorm-25"], "vision_output": ["Identity-51"], "language_encoder_1": ["Embedding-1", "LayerNorm-26", "MultiheadAttention-1-1", "LayerNorm-27", "Linear-49", "QuickGELU-1", "Linear-50", "ResidualAttentionBlock-1", "LayerNorm-28", "MultiheadAttention-2-1", "LayerNorm-29", "Linear-51", "QuickGELU-2", "Linear-52", "ResidualAttentionBlock-2", "LayerNorm-30", "MultiheadAttention-3-1", "LayerNorm-31", "Linear-53", "QuickGELU-3", 
"Linear-54", "ResidualAttentionBlock-3", "LayerNorm-32", "MultiheadAttention-4-1", "LayerNorm-33", "Linear-55", "QuickGELU-4", "Linear-56", "ResidualAttentionBlock-4"], "language_encoder_2": ["LayerNorm-34", "MultiheadAttention-5-1", "LayerNorm-35", "Linear-57", "QuickGELU-5", "Linear-58", "ResidualAttentionBlock-5", "LayerNorm-36", "MultiheadAttention-6-1", "LayerNorm-37", "Linear-59", "QuickGELU-6", "Linear-60", "ResidualAttentionBlock-6", "LayerNorm-38", "MultiheadAttention-7-1", "LayerNorm-39", "Linear-61", "QuickGELU-7", "Linear-62", "ResidualAttentionBlock-7", "LayerNorm-40", "MultiheadAttention-8-1", "LayerNorm-41", "Linear-63", "QuickGELU-8", "Linear-64", "ResidualAttentionBlock-8"], "language_encoder_3": ["LayerNorm-42", "MultiheadAttention-9-1", "LayerNorm-43", "Linear-65", "QuickGELU-9", "Linear-66", "ResidualAttentionBlock-9", "LayerNorm-44", "MultiheadAttention-10-1", "LayerNorm-45", "Linear-67", "QuickGELU-10", "Linear-68", "ResidualAttentionBlock-10", "LayerNorm-46", "MultiheadAttention-11-1", "LayerNorm-47", "Linear-69", "QuickGELU-11", "Linear-70", "ResidualAttentionBlock-11", "LayerNorm-48", "MultiheadAttention-12-1", "LayerNorm-49", "Linear-71", "QuickGELU-12", "Linear-72", "ResidualAttentionBlock-12", "LayerNorm-50"], "best_lang_layer": ["LayerNorm-50"], "best_vis_layer": ["Identity-51"]}, "clip": {"vision_encoder_1": ["Conv2d-1", "Identity-1", "LayerNorm-1", "Linear-1", "Linear-2", "LayerNorm-2", "Linear-3", "GELU-1", "Linear-4", "Block-1", "LayerNorm-3", "Linear-5", "Linear-6", "LayerNorm-4", "Linear-7", "GELU-2", "Linear-8", "Block-2", "LayerNorm-5", "Linear-9", "Linear-10", "LayerNorm-6", "Linear-11", "GELU-3", "Linear-12", "Block-3", "LayerNorm-7"], "vision_encoder_2": ["Linear-13", "Linear-14", "LayerNorm-8", "Linear-15", "GELU-4", "Linear-16", "Block-4", "LayerNorm-9", "Linear-17", "Linear-18", "LayerNorm-10", "Linear-19", "GELU-5", "Linear-20", "Block-5", "LayerNorm-11", "Linear-21", "Linear-22", "LayerNorm-12", "Linear-23", "GELU-6", 
"Linear-24", "Block-6", "LayerNorm-13"], "vision_encoder_3": ["Linear-25", "Linear-26", "LayerNorm-14", "Linear-27", "GELU-7", "Linear-28", "Block-7", "LayerNorm-15", "Linear-29", "Linear-30", "LayerNorm-16", "Linear-31", "GELU-8", "Linear-32", "Block-8", "LayerNorm-17", "Linear-33", "Linear-34", "LayerNorm-18", "Linear-35", "GELU-9", "Linear-36", "Block-9", "LayerNorm-19"], "vision_encoder_4": ["Linear-37", "Linear-38", "LayerNorm-20", "Linear-39", "GELU-10", "Linear-40", "Block-10", "LayerNorm-21", "Linear-41", "Linear-42", "LayerNorm-22", "Linear-43", "GELU-11", "Linear-44", "Block-11", "LayerNorm-23", "Linear-45", "Linear-46", "LayerNorm-24", "Linear-47", "GELU-12", "Linear-48", "Block-12", "LayerNorm-25"], "vision_output": ["Identity-51"], "language_encoder_1": ["Embedding-1", "LayerNorm-26", "MultiheadAttention-1-1", "LayerNorm-27", "Linear-49", "QuickGELU-1", "Linear-50", "ResidualAttentionBlock-1", "LayerNorm-28", "MultiheadAttention-2-1", "LayerNorm-29", "Linear-51", "QuickGELU-2", "Linear-52", "ResidualAttentionBlock-2", "LayerNorm-30", "MultiheadAttention-3-1", "LayerNorm-31", "Linear-53", "QuickGELU-3", "Linear-54", "ResidualAttentionBlock-3", "LayerNorm-32", "MultiheadAttention-4-1", "LayerNorm-33", "Linear-55", "QuickGELU-4", "Linear-56", "ResidualAttentionBlock-4"], "language_encoder_2": ["LayerNorm-34", "MultiheadAttention-5-1", "LayerNorm-35", "Linear-57", "QuickGELU-5", "Linear-58", "ResidualAttentionBlock-5", "LayerNorm-36", "MultiheadAttention-6-1", "LayerNorm-37", "Linear-59", "QuickGELU-6", "Linear-60", "ResidualAttentionBlock-6", "LayerNorm-38", "MultiheadAttention-7-1", "LayerNorm-39", "Linear-61", "QuickGELU-7", "Linear-62", "ResidualAttentionBlock-7", "LayerNorm-40", "MultiheadAttention-8-1", "LayerNorm-41", "Linear-63", "QuickGELU-8", "Linear-64", "ResidualAttentionBlock-8"], "language_encoder_3": ["LayerNorm-42", "MultiheadAttention-9-1", "LayerNorm-43", "Linear-65", "QuickGELU-9", "Linear-66", "ResidualAttentionBlock-9", "LayerNorm-44", 
"MultiheadAttention-10-1", "LayerNorm-45", "Linear-67", "QuickGELU-10", "Linear-68", "ResidualAttentionBlock-10", "LayerNorm-46", "MultiheadAttention-11-1", "LayerNorm-47", "Linear-69", "QuickGELU-11", "Linear-70", "ResidualAttentionBlock-11", "LayerNorm-48", "MultiheadAttention-12-1", "LayerNorm-49", "Linear-71", "QuickGELU-12", "Linear-72", "ResidualAttentionBlock-12", "LayerNorm-50"], "best_lang_layer": ["LayerNorm-50"], "best_vis_layer": ["Identity-51"]}, "simclr": {"vision_encoder_1": ["Conv2d-1", "Identity-1", "LayerNorm-1", "Linear-1", "Linear-2", "LayerNorm-2", "Linear-3", "GELU-1", "Linear-4", "Block-1", "LayerNorm-3", "Linear-5", "Linear-6", "LayerNorm-4", "Linear-7", "GELU-2", "Linear-8", "Block-2", "LayerNorm-5", "Linear-9", "Linear-10", "LayerNorm-6", "Linear-11", "GELU-3", "Linear-12", "Block-3", "LayerNorm-7"], "vision_encoder_2": ["Linear-13", "Linear-14", "LayerNorm-8", "Linear-15", "GELU-4", "Linear-16", "Block-4", "LayerNorm-9", "Linear-17", "Linear-18", "LayerNorm-10", "Linear-19", "GELU-5", "Linear-20", "Block-5", "LayerNorm-11", "Linear-21", "Linear-22", "LayerNorm-12", "Linear-23", "GELU-6", "Linear-24", "Block-6", "LayerNorm-13"], "vision_encoder_3": ["Linear-25", "Linear-26", "LayerNorm-14", "Linear-27", "GELU-7", "Linear-28", "Block-7", "LayerNorm-15", "Linear-29", "Linear-30", "LayerNorm-16", "Linear-31", "GELU-8", "Linear-32", "Block-8", "LayerNorm-17", "Linear-33", "Linear-34", "LayerNorm-18", "Linear-35", "GELU-9", "Linear-36", "Block-9", "LayerNorm-19"], "vision_encoder_4": ["Linear-37", "Linear-38", "LayerNorm-20", "Linear-39", "GELU-10", "Linear-40", "Block-10", "LayerNorm-21", "Linear-41", "Linear-42", "LayerNorm-22", "Linear-43", "GELU-11", "Linear-44", "Block-11", "LayerNorm-23", "Linear-45", "Linear-46", "LayerNorm-24", "Linear-47", "GELU-12", "Linear-48", "Block-12", "LayerNorm-25"], "vision_output": ["Identity-51"], "best_layer": ["Identity-51"]}, "sbert": {"language_encoder_1": ["Embedding-1", "Embedding-2", "LayerNorm-1", 
"Linear-1", "Linear-2", "Linear-3", "Linear-4", "LayerNorm-2", "Linear-5", "GELUActivation-1", "Linear-6", "LayerNorm-3", "Linear-7", "Linear-8", "Linear-9", "Linear-10", "LayerNorm-4", "Linear-11", "GELUActivation-13", "Linear-12", "LayerNorm-5", "Linear-13", "Linear-14", "Linear-15", "Linear-16", "LayerNorm-6", "Linear-17", "GELUActivation-25"], "language_encoder_2": ["Linear-18", "LayerNorm-7", "Linear-19", "Linear-20", "Linear-21", "Linear-22", "LayerNorm-8", "Linear-23", "GELUActivation-37", "Linear-24", "LayerNorm-9", "Linear-25", "Linear-26", "Linear-27", "Linear-28", "LayerNorm-10", "Linear-29", "GELUActivation-49", "Linear-30", "LayerNorm-11", "Linear-31", "Linear-32", "Linear-33", "Linear-34", "LayerNorm-12", "Linear-35", "GELUActivation-61", "Linear-36"], "language_encoder_3": ["LayerNorm-13", "Linear-37", "Linear-38", "Linear-39", "Linear-40", "LayerNorm-14", "Linear-41", "GELUActivation-73", "Linear-42", "LayerNorm-15", "Linear-43", "Linear-44", "Linear-45", "Linear-46", "LayerNorm-16", "Linear-47", "GELUActivation-85", "Linear-48", "LayerNorm-17", "Linear-49", "Linear-50", "Linear-51", "Linear-52", "LayerNorm-18", "Linear-53", "GELUActivation-97", "Linear-54"], "language_encoder_4": ["LayerNorm-19", "Linear-55", "Linear-56", "Linear-57", "Linear-58", "LayerNorm-20", "Linear-59", "GELUActivation-109", "Linear-60", "LayerNorm-21", "Linear-61", "Linear-62", "Linear-63", "Linear-64", "LayerNorm-22", "Linear-65", "GELUActivation-121", "Linear-66", "LayerNorm-23", "Linear-67", "Linear-68", "Linear-69", "Linear-70", "LayerNorm-24", "Linear-71", "GELUActivation-133", "Linear-72"], "output": ["LayerNorm-25", "Linear-73", "Tanh-1"], "best_layer": ["Tanh-1"]}, "gpt2": {"language_output_1": ["Embedding-1", "LayerNorm-1", "Conv1D-1", "Conv1D-2", "LayerNorm-2", "Conv1D-3", "NewGELUActivation-1", "Conv1D-4", "GPT2Block-1-1", "LayerNorm-3", "Conv1D-5", "Conv1D-6", "LayerNorm-4", "Conv1D-7", "NewGELUActivation-49", "Conv1D-8", "GPT2Block-2-1", "LayerNorm-5", 
"Conv1D-9", "Conv1D-10", "LayerNorm-6", "Conv1D-11", "NewGELUActivation-97", "Conv1D-12", "GPT2Block-3-1", "LayerNorm-7", "Conv1D-13", "Conv1D-14", "LayerNorm-8", "Conv1D-15", "NewGELUActivation-145", "Conv1D-16", "GPT2Block-4-1", "LayerNorm-9", "Conv1D-17", "Conv1D-18", "LayerNorm-10", "Conv1D-19", "NewGELUActivation-193", "Conv1D-20", "GPT2Block-5-1", "LayerNorm-11", "Conv1D-21", "Conv1D-22", "LayerNorm-12", "Conv1D-23", "NewGELUActivation-241", "Conv1D-24", "GPT2Block-6-1", "LayerNorm-13", "Conv1D-25", "Conv1D-26", "LayerNorm-14", "Conv1D-27", "NewGELUActivation-289", "Conv1D-28", "GPT2Block-7-1", "LayerNorm-15", "Conv1D-29", "Conv1D-30", "LayerNorm-16", "Conv1D-31", "NewGELUActivation-337", "Conv1D-32", "GPT2Block-8-1", "LayerNorm-17", "Conv1D-33", "Conv1D-34", "LayerNorm-18", "Conv1D-35", "NewGELUActivation-385", "Conv1D-36", "GPT2Block-9-1", "LayerNorm-19", "Conv1D-37", "Conv1D-38", "LayerNorm-20", "Conv1D-39"], "language_output_2": ["NewGELUActivation-433", "Conv1D-40", "GPT2Block-10-1", "LayerNorm-21", "Conv1D-41", "Conv1D-42", "LayerNorm-22", "Conv1D-43", "NewGELUActivation-481", "Conv1D-44", "GPT2Block-11-1", "LayerNorm-23", "Conv1D-45", "Conv1D-46", "LayerNorm-24", "Conv1D-47", "NewGELUActivation-529", "Conv1D-48", "GPT2Block-12-1", "LayerNorm-25", "Conv1D-49", "Conv1D-50", "LayerNorm-26", "Conv1D-51", "NewGELUActivation-577", "Conv1D-52", "GPT2Block-13-1", "LayerNorm-27", "Conv1D-53", "Conv1D-54", "LayerNorm-28", "Conv1D-55", "NewGELUActivation-625", "Conv1D-56", "GPT2Block-14-1", "LayerNorm-29", "Conv1D-57", "Conv1D-58", "LayerNorm-30", "Conv1D-59", "NewGELUActivation-673", "Conv1D-60", "GPT2Block-15-1", "LayerNorm-31", "Conv1D-61", "Conv1D-62", "LayerNorm-32", "Conv1D-63", "NewGELUActivation-721", "Conv1D-64", "GPT2Block-16-1", "LayerNorm-33", "Conv1D-65", "Conv1D-66", "LayerNorm-34", "Conv1D-67", "NewGELUActivation-769", "Conv1D-68", "GPT2Block-17-1", "LayerNorm-35", "Conv1D-69", "Conv1D-70", "LayerNorm-36", "Conv1D-71", "NewGELUActivation-817", 
"Conv1D-72", "GPT2Block-18-1", "LayerNorm-37", "Conv1D-73", "Conv1D-74", "LayerNorm-38", "Conv1D-75", "NewGELUActivation-865", "Conv1D-76", "GPT2Block-19-1", "LayerNorm-39", "Conv1D-77"], "language_output_3": ["Conv1D-78", "LayerNorm-40", "Conv1D-79", "NewGELUActivation-913", "Conv1D-80", "GPT2Block-20-1", "LayerNorm-41", "Conv1D-81", "Conv1D-82", "LayerNorm-42", "Conv1D-83", "NewGELUActivation-961", "Conv1D-84", "GPT2Block-21-1", "LayerNorm-43", "Conv1D-85", "Conv1D-86", "LayerNorm-44", "Conv1D-87", "NewGELUActivation-1009", "Conv1D-88", "GPT2Block-22-1", "LayerNorm-45", "Conv1D-89", "Conv1D-90", "LayerNorm-46", "Conv1D-91", "NewGELUActivation-1057", "Conv1D-92", "GPT2Block-23-1", "LayerNorm-47", "Conv1D-93", "Conv1D-94", "LayerNorm-48", "Conv1D-95", "NewGELUActivation-1105", "Conv1D-96", "GPT2Block-24-1", "LayerNorm-49", "Conv1D-97", "Conv1D-98", "LayerNorm-50", "Conv1D-99", "NewGELUActivation-1153", "Conv1D-100", "GPT2Block-25-1", "LayerNorm-51", "Conv1D-101", "Conv1D-102", "LayerNorm-52", "Conv1D-103", "NewGELUActivation-1201", "Conv1D-104", "GPT2Block-26-1", "LayerNorm-53", "Conv1D-105", "Conv1D-106", "LayerNorm-54", "Conv1D-107", "NewGELUActivation-1249", "Conv1D-108", "GPT2Block-27-1", "LayerNorm-55", "Conv1D-109", "Conv1D-110", "LayerNorm-56", "Conv1D-111", "NewGELUActivation-1297", "Conv1D-112", "GPT2Block-28-1", "LayerNorm-57", "Conv1D-113", "Conv1D-114", "LayerNorm-58", "Conv1D-115", "NewGELUActivation-1345", "Conv1D-116"], "language_output_4": ["GPT2Block-29-1", "LayerNorm-59", "Conv1D-117", "Conv1D-118", "LayerNorm-60", "Conv1D-119", "NewGELUActivation-1393", "Conv1D-120", "GPT2Block-30-1", "LayerNorm-61", "Conv1D-121", "Conv1D-122", "LayerNorm-62", "Conv1D-123", "NewGELUActivation-1441", "Conv1D-124", "GPT2Block-31-1", "LayerNorm-63", "Conv1D-125", "Conv1D-126", "LayerNorm-64", "Conv1D-127", "NewGELUActivation-1489", "Conv1D-128", "GPT2Block-32-1", "LayerNorm-65", "Conv1D-129", "Conv1D-130", "LayerNorm-66", "Conv1D-131", "NewGELUActivation-1537", 
"Conv1D-132", "GPT2Block-33-1", "LayerNorm-67", "Conv1D-133", "Conv1D-134", "LayerNorm-68", "Conv1D-135", "NewGELUActivation-1585", "Conv1D-136", "GPT2Block-34-1", "LayerNorm-69", "Conv1D-137", "Conv1D-138", "LayerNorm-70", "Conv1D-139", "NewGELUActivation-1633", "Conv1D-140", "GPT2Block-35-1", "LayerNorm-71", "Conv1D-141", "Conv1D-142", "LayerNorm-72", "Conv1D-143", "NewGELUActivation-1681", "Conv1D-144", "GPT2Block-36-1", "LayerNorm-73", "Conv1D-145", "Conv1D-146", "LayerNorm-74", "Conv1D-147", "NewGELUActivation-1729", "Conv1D-148", "GPT2Block-37-1", "LayerNorm-75", "Conv1D-149", "Conv1D-150", "LayerNorm-76", "Conv1D-151", "NewGELUActivation-1777", "Conv1D-152", "GPT2Block-38-1", "LayerNorm-77", "Conv1D-153", "Conv1D-154", "LayerNorm-78"], "language_output_5": ["Conv1D-155", "NewGELUActivation-1825", "Conv1D-156", "GPT2Block-39-1", "LayerNorm-79", "Conv1D-157", "Conv1D-158", "LayerNorm-80", "Conv1D-159", "NewGELUActivation-1873", "Conv1D-160", "GPT2Block-40-1", "LayerNorm-81", "Conv1D-161", "Conv1D-162", "LayerNorm-82", "Conv1D-163", "NewGELUActivation-1921", "Conv1D-164", "GPT2Block-41-1", "LayerNorm-83", "Conv1D-165", "Conv1D-166", "LayerNorm-84", "Conv1D-167", "NewGELUActivation-1969", "Conv1D-168", "GPT2Block-42-1", "LayerNorm-85", "Conv1D-169", "Conv1D-170", "LayerNorm-86", "Conv1D-171", "NewGELUActivation-2017", "Conv1D-172", "GPT2Block-43-1", "LayerNorm-87", "Conv1D-173", "Conv1D-174", "LayerNorm-88", "Conv1D-175", "NewGELUActivation-2065", "Conv1D-176", "GPT2Block-44-1", "LayerNorm-89", "Conv1D-177", "Conv1D-178", "LayerNorm-90", "Conv1D-179", "NewGELUActivation-2113", "Conv1D-180", "GPT2Block-45-1", "LayerNorm-91", "Conv1D-181", "Conv1D-182", "LayerNorm-92", "Conv1D-183", "NewGELUActivation-2161", "Conv1D-184", "GPT2Block-46-1", "LayerNorm-93", "Conv1D-185", "Conv1D-186", "LayerNorm-94", "Conv1D-187", "NewGELUActivation-2209", "Conv1D-188", "GPT2Block-47-1", "LayerNorm-95", "Conv1D-189", "Conv1D-190", "LayerNorm-96", "Conv1D-191", 
"NewGELUActivation-2257", "Conv1D-192", "GPT2Block-48-1", "LayerNorm-97"]}, "funnel": {"language_encoder_1": ["Embedding-1", "LayerNorm-1", "Linear-1", "Linear-2", "Linear-3", "Linear-4", "LayerNorm-2", "Linear-5", "NewGELUActivation-1", "Linear-6", "LayerNorm-3", "Linear-7", "Linear-8", "Linear-9", "Linear-10", "LayerNorm-4", "Linear-11", "NewGELUActivation-33", "Linear-12", "LayerNorm-5", "Linear-13", "Linear-14", "Linear-15", "Linear-16", "LayerNorm-6", "Linear-17", "NewGELUActivation-65", "Linear-18", "LayerNorm-7", "Linear-19", "Linear-20", "Linear-21", "Linear-22", "LayerNorm-8", "Linear-23", "NewGELUActivation-97", "Linear-24", "LayerNorm-9", "Linear-25", "Linear-26", "Linear-27", "Linear-28", "LayerNorm-10", "Linear-29", "NewGELUActivation-129", "Linear-30", "LayerNorm-11", "Linear-31", "Linear-32", "Linear-33", "Linear-34", "LayerNorm-12", "Linear-35", "NewGELUActivation-161", "Linear-36", "LayerNorm-13", "Linear-37", "Linear-38", "Linear-39", "Linear-40", "LayerNorm-14", "Linear-41", "NewGELUActivation-193", "Linear-42", "LayerNorm-15", "Linear-43", "Linear-44", "Linear-45", "Linear-46", "LayerNorm-16", "Linear-47", "NewGELUActivation-225", "Linear-48", "LayerNorm-17", "Linear-49", "Linear-50", "Linear-51", "Linear-52", "LayerNorm-18", "Linear-53", "NewGELUActivation-257", "Linear-54", "LayerNorm-19", "Linear-55", "Linear-56", "Linear-57", "Linear-58", "LayerNorm-20", "Linear-59", "NewGELUActivation-289", "Linear-60", "LayerNorm-21", "Linear-62", "Linear-63", "Linear-181", "Linear-182", "Linear-183", "Linear-184", "LayerNorm-62", "Linear-185", "NewGELUActivation-961", "Linear-186", "LayerNorm-63", "Linear-187", "Linear-188", "Linear-189", "Linear-190", "LayerNorm-64", "Linear-191", "NewGELUActivation-993", "Linear-192", "LayerNorm-65"], "language_encoder_2": ["Linear-61", "Linear-64", "LayerNorm-22", "Linear-65", "NewGELUActivation-321", "Linear-66", "LayerNorm-23", "Linear-67", "Linear-68", "Linear-69", "Linear-70", "LayerNorm-24", "Linear-71", 
"NewGELUActivation-353", "Linear-72", "LayerNorm-25", "Linear-73", "Linear-74", "Linear-75", "Linear-76", "LayerNorm-26", "Linear-77", "NewGELUActivation-385", "Linear-78", "LayerNorm-27", "Linear-79", "Linear-80", "Linear-81", "Linear-82", "LayerNorm-28", "Linear-83", "NewGELUActivation-417", "Linear-84", "LayerNorm-29", "Linear-85", "Linear-86", "Linear-87", "Linear-88", "LayerNorm-30", "Linear-89", "NewGELUActivation-449", "Linear-90", "LayerNorm-31", "Linear-91", "Linear-92", "Linear-93", "Linear-94", "LayerNorm-32", "Linear-95", "NewGELUActivation-481", "Linear-96", "LayerNorm-33", "Linear-97", "Linear-98", "Linear-99", "Linear-100", "LayerNorm-34", "Linear-101", "NewGELUActivation-513", "Linear-102", "LayerNorm-35", "Linear-103", "Linear-104", "Linear-105", "Linear-106", "LayerNorm-36", "Linear-107", "NewGELUActivation-545", "Linear-108", "LayerNorm-37", "Linear-109", "Linear-110", "Linear-111", "Linear-112", "LayerNorm-38", "Linear-113", "NewGELUActivation-577", "Linear-114", "LayerNorm-39", "Linear-115", "Linear-116", "Linear-117", "Linear-118", "LayerNorm-40", "Linear-119", "NewGELUActivation-609", "Linear-120", "LayerNorm-41", "Linear-121", "Linear-122", "Linear-123", "Linear-124", "LayerNorm-42", "Linear-125", "NewGELUActivation-641", "Linear-126", "LayerNorm-43", "Linear-127", "Linear-128", "Linear-129", "Linear-130", "LayerNorm-44", "Linear-131", "NewGELUActivation-673", "Linear-132", "LayerNorm-45", "Linear-133", "Linear-134", "Linear-135", "Linear-136", "LayerNorm-46", "Linear-137", "NewGELUActivation-705", "Linear-138", "LayerNorm-47", "Linear-139", "Linear-140", "Linear-141", "Linear-142", "LayerNorm-48", "Linear-143", "NewGELUActivation-737", "Linear-144", "LayerNorm-49", "Linear-145", "Linear-146", "Linear-147", "Linear-148", "LayerNorm-50", "Linear-149", "NewGELUActivation-769", "Linear-150", "LayerNorm-51", "Linear-151", "Linear-152", "Linear-153", "Linear-154", "LayerNorm-52", "Linear-155", "NewGELUActivation-801", "Linear-156", 
"LayerNorm-53", "Linear-157", "Linear-158", "Linear-159", "Linear-160", "LayerNorm-54", "Linear-161", "NewGELUActivation-833", "Linear-162", "LayerNorm-55", "Linear-163", "Linear-164", "Linear-165", "Linear-166", "LayerNorm-56", "Linear-167", "NewGELUActivation-865", "Linear-168", "LayerNorm-57", "Linear-169", "Linear-170", "Linear-171", "Linear-172", "LayerNorm-58", "Linear-173", "NewGELUActivation-897", "Linear-174", "LayerNorm-59", "Linear-175", "Linear-176", "Linear-177", "Linear-178", "LayerNorm-60", "Linear-179", "NewGELUActivation-929", "Linear-180", "LayerNorm-61"]}, "beit": {"vision_encoder_1": ["Conv2d-1", "Identity-1", "LayerNorm-1", "Linear-1", "Identity-2", "LayerNorm-2", "Linear-2", "GELU-1", "Linear-3", "Identity-3", "Block-1", "LayerNorm-3", "Linear-4", "Identity-4", "LayerNorm-4", "Linear-5", "GELU-2", "Linear-6", "Identity-5", "Block-2", "LayerNorm-5", "Linear-7", "Identity-6", "LayerNorm-6", "Linear-8", "GELU-3", "Linear-9", "Identity-7"], "vision_encoder_2": ["Block-3", "LayerNorm-7", "Linear-10", "Identity-8", "LayerNorm-8", "Linear-11", "GELU-4", "Linear-12", "Identity-9", "Block-4", "LayerNorm-9", "Linear-13", "Identity-10", "LayerNorm-10", "Linear-14", "GELU-5", "Linear-15", "Identity-11", "Block-5", "LayerNorm-11", "Linear-16", "Identity-12", "LayerNorm-12", "Linear-17", "GELU-6", "Linear-18", "Identity-13", "Block-6"], "vision_encoder_3": ["LayerNorm-13", "Linear-19", "Identity-14", "LayerNorm-14", "Linear-20", "GELU-7", "Linear-21", "Identity-15", "Block-7", "LayerNorm-15", "Linear-22", "Identity-16", "LayerNorm-16", "Linear-23", "GELU-8", "Linear-24", "Identity-17", "Block-8", "LayerNorm-17", "Linear-25", "Identity-18", "LayerNorm-18", "Linear-26", "GELU-9", "Linear-27", "Identity-19", "Block-9"], "vision_encoder_4": ["LayerNorm-19", "Linear-28", "Identity-20", "LayerNorm-20", "Linear-29", "GELU-10", "Linear-30", "Identity-21", "Block-10", "LayerNorm-21", "Linear-31", "Identity-22", "LayerNorm-22", "Linear-32", "GELU-11", "Linear-33", 
"Identity-23", "Block-11", "LayerNorm-23", "Linear-34", "Identity-24", "LayerNorm-24", "Linear-35", "GELU-12", "Linear-36", "Identity-25", "Block-12"], "vision_output": ["LayerNorm-25"], "best_layer": ["LayerNorm-25"]}, "simcse": {"language_encoder_1": ["Embedding-1", "Embedding-2", "LayerNorm-1", "Linear-1", "Linear-2", "Linear-3", "BertSelfAttention-1-1", "Linear-4", "LayerNorm-2", "Linear-5", "GELUActivation-1", "Linear-6", "LayerNorm-3", "Linear-7", "Linear-8", "Linear-9", "BertSelfAttention-2-1", "Linear-10", "LayerNorm-4", "Linear-11", "GELUActivation-2", "Linear-12", "LayerNorm-5", "Linear-13", "Linear-14", "Linear-15", "BertSelfAttention-3-1", "Linear-16", "LayerNorm-6", "Linear-17"], "language_encoder_2": ["GELUActivation-3", "Linear-18", "LayerNorm-7", "Linear-19", "Linear-20", "Linear-21", "BertSelfAttention-4-1", "Linear-22", "LayerNorm-8", "Linear-23", "GELUActivation-4", "Linear-24", "LayerNorm-9", "Linear-25", "Linear-26", "Linear-27", "BertSelfAttention-5-1", "Linear-28", "LayerNorm-10", "Linear-29", "GELUActivation-5", "Linear-30", "LayerNorm-11", "Linear-31", "Linear-32", "Linear-33", "BertSelfAttention-6-1", "Linear-34", "LayerNorm-12", "Linear-35"], "language_encoder_3": ["GELUActivation-6", "Linear-36", "LayerNorm-13", "Linear-37", "Linear-38", "Linear-39", "BertSelfAttention-7-1", "Linear-40", "LayerNorm-14", "Linear-41", "GELUActivation-7", "Linear-42", "LayerNorm-15", "Linear-43", "Linear-44", "Linear-45", "BertSelfAttention-8-1", "Linear-46", "LayerNorm-16", "Linear-47", "GELUActivation-8", "Linear-48", "LayerNorm-17", "Linear-49", "Linear-50", "Linear-51", "BertSelfAttention-9-1", "Linear-52", "LayerNorm-18", "Linear-53"], "language_encoder_4": ["GELUActivation-9", "Linear-54", "LayerNorm-19", "Linear-55", "Linear-56", "Linear-57", "BertSelfAttention-10-1", "Linear-58", "LayerNorm-20", "Linear-59", "GELUActivation-10", "Linear-60", "LayerNorm-21", "Linear-61", "Linear-62", "Linear-63", "BertSelfAttention-11-1", "Linear-64", "LayerNorm-22", 
"Linear-65", "GELUActivation-11", "Linear-66", "LayerNorm-23", "Linear-67", "Linear-68", "Linear-69", "BertSelfAttention-12-1", "Linear-70", "LayerNorm-24", "Linear-71", "GELUActivation-12", "Linear-72", "LayerNorm-25"], "output": ["Linear-73", "Tanh-1"], "best_layer": ["Tanh-1"]}, "convnext": {"vision_encoder_1": ["Conv2d-1", "LayerNorm2d-1", "Conv2d-2", "LayerNorm-2", "Linear-1", "GELU-1", "Linear-2", "ConvNeXtBlock-1", "Conv2d-3", "LayerNorm-3", "Linear-3", "GELU-2", "Linear-4", "ConvNeXtBlock-2", "Conv2d-4", "LayerNorm-4", "Linear-5", "GELU-3", "Linear-6", "ConvNeXtBlock-3", "LayerNorm2d-2", "Conv2d-5", "Conv2d-6", "LayerNorm-6", "Linear-7", "GELU-4", "Linear-8", "ConvNeXtBlock-4"], "vision_encoder_2": ["Conv2d-7", "LayerNorm-7", "Linear-9", "GELU-5", "Linear-10", "ConvNeXtBlock-5", "Conv2d-8", "LayerNorm-8", "Linear-11", "GELU-6", "Linear-12", "ConvNeXtBlock-6", "LayerNorm2d-3", "Conv2d-9", "Conv2d-10", "LayerNorm-10", "Linear-13", "GELU-7", "Linear-14", "ConvNeXtBlock-7", "Conv2d-11", "LayerNorm-11", "Linear-15", "GELU-8", "Linear-16", "ConvNeXtBlock-8", "Conv2d-12", "LayerNorm-12"], "vision_encoder_3": ["Linear-17", "GELU-9", "Linear-18", "ConvNeXtBlock-9", "Conv2d-13", "LayerNorm-13", "Linear-19", "GELU-10", "Linear-20", "ConvNeXtBlock-10", "Conv2d-14", "LayerNorm-14", "Linear-21", "GELU-11", "Linear-22", "ConvNeXtBlock-11", "Conv2d-15", "LayerNorm-15", "Linear-23", "GELU-12", "Linear-24", "ConvNeXtBlock-12", "Conv2d-16", "LayerNorm-16", "Linear-25", "GELU-13", "Linear-26", "ConvNeXtBlock-13"], "vision_encoder_4": ["Conv2d-17", "LayerNorm-17", "Linear-27", "GELU-14", "Linear-28", "ConvNeXtBlock-14", "Conv2d-18", "LayerNorm-18", "Linear-29", "GELU-15", "Linear-30", "ConvNeXtBlock-15", "Conv2d-19", "LayerNorm-19", "Linear-31", "GELU-16", "Linear-32", "ConvNeXtBlock-16", "Conv2d-20", "LayerNorm-20", "Linear-33", "GELU-17", "Linear-34", "ConvNeXtBlock-17", "Conv2d-21", "LayerNorm-21", "Linear-35", "GELU-18"], "vision_encoder_5": ["Linear-36", 
"ConvNeXtBlock-18", "Conv2d-22", "LayerNorm-22", "Linear-37", "GELU-19", "Linear-38", "ConvNeXtBlock-19", "Conv2d-23", "LayerNorm-23", "Linear-39", "GELU-20", "Linear-40", "ConvNeXtBlock-20", "Conv2d-24", "LayerNorm-24", "Linear-41", "GELU-21", "Linear-42", "ConvNeXtBlock-21", "Conv2d-25", "LayerNorm-25", "Linear-43", "GELU-22", "Linear-44", "ConvNeXtBlock-22", "Conv2d-26", "LayerNorm-26"], "vision_encoder_6": ["Linear-45", "GELU-23", "Linear-46", "ConvNeXtBlock-23", "Conv2d-27", "LayerNorm-27", "Linear-47", "GELU-24", "Linear-48", "ConvNeXtBlock-24", "Conv2d-28", "LayerNorm-28", "Linear-49", "GELU-25", "Linear-50", "ConvNeXtBlock-25", "Conv2d-29", "LayerNorm-29", "Linear-51", "GELU-26", "Linear-52", "ConvNeXtBlock-26", "Conv2d-30", "LayerNorm-30", "Linear-53", "GELU-27", "Linear-54", "ConvNeXtBlock-27"], "vision_encoder_7": ["Conv2d-31", "LayerNorm-31", "Linear-55", "GELU-28", "Linear-56", "ConvNeXtBlock-28", "Conv2d-32", "LayerNorm-32", "Linear-57", "GELU-29", "Linear-58", "ConvNeXtBlock-29", "Conv2d-33", "LayerNorm-33", "Linear-59", "GELU-30", "Linear-60", "ConvNeXtBlock-30", "Conv2d-34", "LayerNorm-34", "Linear-61", "GELU-31", "Linear-62", "ConvNeXtBlock-31", "Conv2d-35", "LayerNorm-35", "Linear-63", "GELU-32"], "vision_encoder_8": ["Linear-64", "ConvNeXtBlock-32", "Conv2d-36", "LayerNorm-36", "Linear-65", "GELU-33", "Linear-66", "ConvNeXtBlock-33", "LayerNorm2d-4", "Conv2d-37", "Conv2d-38", "LayerNorm-38", "Linear-67", "GELU-34", "Linear-68", "ConvNeXtBlock-34", "Conv2d-39", "LayerNorm-39", "Linear-69", "GELU-35", "Linear-70", "ConvNeXtBlock-35", "Conv2d-40", "LayerNorm-40", "Linear-71", "GELU-36", "Linear-72", "ConvNeXtBlock-36"], "output": ["AdaptiveAvgPool2d-1", "LayerNorm2d-5"], "best_layer": ["LayerNorm2d-5"]}, "flava": {"encoder_1": ["Conv2d-1", "PatchEmbeddings-1", "LayerNorm-1", "Linear-1", "Linear-2", "Linear-3", "FlavaSelfAttention-1-1", "Linear-4", "LayerNorm-2", "Linear-5", "GELUActivation-1", "Linear-6", "FlavaOutput-1", "LayerNorm-3", "Linear-7", 
"Linear-8", "Linear-9", "FlavaSelfAttention-2-1", "Linear-10", "LayerNorm-4", "Linear-11", "GELUActivation-2", "Linear-12", "FlavaOutput-2", "LayerNorm-5", "Linear-13", "Linear-14", "Linear-15", "FlavaSelfAttention-3-1", "Linear-16", "LayerNorm-6", "Linear-17", "GELUActivation-3", "Linear-18", "FlavaOutput-3", "LayerNorm-7", "Linear-19", "Linear-20", "Linear-21", "FlavaSelfAttention-4-1", "Linear-22", "LayerNorm-8", "Linear-23", "GELUActivation-4", "Linear-24", "FlavaOutput-4", "LayerNorm-9", "Linear-25", "Linear-26"], "encoder_2": ["Linear-27", "FlavaSelfAttention-5-1", "Linear-28", "LayerNorm-10", "Linear-29", "GELUActivation-5", "Linear-30", "FlavaOutput-5", "LayerNorm-11", "Linear-31", "Linear-32", "Linear-33", "FlavaSelfAttention-6-1", "Linear-34", "LayerNorm-12", "Linear-35", "GELUActivation-6", "Linear-36", "FlavaOutput-6", "LayerNorm-13", "Linear-37", "Linear-38", "Linear-39", "FlavaSelfAttention-7-1", "Linear-40", "LayerNorm-14", "Linear-41", "GELUActivation-7", "Linear-42", "FlavaOutput-7", "LayerNorm-15", "Linear-43", "Linear-44", "Linear-45", "FlavaSelfAttention-8-1", "Linear-46", "LayerNorm-16", "Linear-47", "GELUActivation-8", "Linear-48", "FlavaOutput-8", "LayerNorm-17", "Linear-49", "Linear-50", "Linear-51", "FlavaSelfAttention-9-1", "Linear-52", "LayerNorm-18", "Linear-53"], "encoder_3": ["GELUActivation-9", "Linear-54", "FlavaOutput-9", "LayerNorm-19", "Linear-55", "Linear-56", "Linear-57", "FlavaSelfAttention-10-1", "Linear-58", "LayerNorm-20", "Linear-59", "GELUActivation-10", "Linear-60", "FlavaOutput-10", "LayerNorm-21", "Linear-61", "Linear-62", "Linear-63", "FlavaSelfAttention-11-1", "Linear-64", "LayerNorm-22", "Linear-65", "GELUActivation-11", "Linear-66", "FlavaOutput-11", "LayerNorm-23", "Linear-67", "Linear-68", "Linear-69", "FlavaSelfAttention-12-1", "Linear-70", "LayerNorm-24", "Linear-71", "GELUActivation-12", "Linear-72", "FlavaOutput-12", "LayerNorm-25", "Linear-73", "Linear-74", "Embedding-3", "Embedding-4", "Embedding-5", 
"LayerNorm-26", "LayerNorm-27", "Linear-75", "Linear-76", "Linear-77", "FlavaSelfAttention-13-1", "Linear-78"], "encoder_4": ["LayerNorm-28", "Linear-79", "GELUActivation-13", "Linear-80", "FlavaOutput-13", "LayerNorm-29", "Linear-81", "Linear-82", "Linear-83", "FlavaSelfAttention-14-1", "Linear-84", "LayerNorm-30", "Linear-85", "GELUActivation-14", "Linear-86", "FlavaOutput-14", "LayerNorm-31", "Linear-87", "Linear-88", "Linear-89", "FlavaSelfAttention-15-1", "Linear-90", "LayerNorm-32", "Linear-91", "GELUActivation-15", "Linear-92", "FlavaOutput-15", "LayerNorm-33", "Linear-93", "Linear-94", "Linear-95", "FlavaSelfAttention-16-1", "Linear-96", "LayerNorm-34", "Linear-97", "GELUActivation-16", "Linear-98", "FlavaOutput-16", "LayerNorm-35", "Linear-99", "Linear-100", "Linear-101", "FlavaSelfAttention-17-1", "Linear-102", "LayerNorm-36", "Linear-103", "GELUActivation-17", "Linear-104", "FlavaOutput-17"], "encoder_5": ["LayerNorm-37", "Linear-105", "Linear-106", "Linear-107", "FlavaSelfAttention-18-1", "Linear-108", "LayerNorm-38", "Linear-109", "GELUActivation-18", "Linear-110", "FlavaOutput-18", "LayerNorm-39", "Linear-111", "Linear-112", "Linear-113", "FlavaSelfAttention-19-1", "Linear-114", "LayerNorm-40", "Linear-115", "GELUActivation-19", "Linear-116", "FlavaOutput-19", "LayerNorm-41", "Linear-117", "Linear-118", "Linear-119", "FlavaSelfAttention-20-1", "Linear-120", "LayerNorm-42", "Linear-121", "GELUActivation-20", "Linear-122", "FlavaOutput-20", "LayerNorm-43", "Linear-123", "Linear-124", "Linear-125", "FlavaSelfAttention-21-1", "Linear-126", "LayerNorm-44", "Linear-127", "GELUActivation-21", "Linear-128", "FlavaOutput-21", "LayerNorm-45", "Linear-129", "Linear-130", "Linear-131", "FlavaSelfAttention-22-1"], "encoder_6": ["Linear-132", "LayerNorm-46", "Linear-133", "GELUActivation-22", "Linear-134", "FlavaOutput-22", "LayerNorm-47", "Linear-135", "Linear-136", "Linear-137", "FlavaSelfAttention-23-1", "Linear-138", "LayerNorm-48", "Linear-139", 
"GELUActivation-23", "Linear-140", "FlavaOutput-23", "LayerNorm-49", "Linear-141", "Linear-142", "Linear-143", "FlavaSelfAttention-24-1", "Linear-144", "LayerNorm-50", "Linear-145", "GELUActivation-24", "Linear-146", "FlavaOutput-24", "LayerNorm-51", "Linear-147", "Linear-148", "LayerNorm-52", "Linear-149", "Linear-150", "Linear-151", "FlavaSelfAttention-25-1", "Linear-152", "LayerNorm-53", "Linear-153", "GELUActivation-25", "Linear-154", "FlavaOutput-25", "LayerNorm-54", "Linear-155", "Linear-156", "Linear-157", "FlavaSelfAttention-26-1", "Linear-158", "LayerNorm-55"], "encoder_7": ["Linear-159", "GELUActivation-26", "Linear-160", "FlavaOutput-26", "LayerNorm-56", "Linear-161", "Linear-162", "Linear-163", "FlavaSelfAttention-27-1", "Linear-164", "LayerNorm-57", "Linear-165", "GELUActivation-27", "Linear-166", "FlavaOutput-27", "LayerNorm-58", "Linear-167", "Linear-168", "Linear-169", "FlavaSelfAttention-28-1", "Linear-170", "LayerNorm-59", "Linear-171", "GELUActivation-28", "Linear-172", "FlavaOutput-28", "LayerNorm-60", "Linear-173", "Linear-174", "Linear-175", "FlavaSelfAttention-29-1", "Linear-176", "LayerNorm-61", "Linear-177", "GELUActivation-29", "Linear-178", "FlavaOutput-29", "LayerNorm-62", "Linear-179", "Linear-180", "Linear-181", "FlavaSelfAttention-30-1", "Linear-182", "LayerNorm-63", "Linear-183", "GELUActivation-30", "Linear-184", "FlavaOutput-30", "LayerNorm-64"], "output": ["Linear-185"], "best_layer": ["Linear-185"]}, "albef": {"encoder_1": ["Conv2d-1", "Identity-1", "LayerNorm-1", "Linear-1", "Linear-2", "LayerNorm-2", "Linear-3", "GELU-1", "Linear-4", "Block-1", "LayerNorm-3", "Linear-5", "Linear-6", "LayerNorm-4", "Linear-7", "GELU-2", "Linear-8", "Block-2", "LayerNorm-5", "Linear-9", "Linear-10", "LayerNorm-6", "Linear-11", "GELU-3", "Linear-12", "Block-3", "LayerNorm-7", "Linear-13", "Linear-14", "LayerNorm-8", "Linear-15", "GELU-4", "Linear-16", "Block-4", "LayerNorm-9", "Linear-17", "Linear-18"], "encoder_2": ["LayerNorm-10", "Linear-19", 
"GELU-5", "Linear-20", "Block-5", "LayerNorm-11", "Linear-21", "Linear-22", "LayerNorm-12", "Linear-23", "GELU-6", "Linear-24", "Block-6", "LayerNorm-13", "Linear-25", "Linear-26", "LayerNorm-14", "Linear-27", "GELU-7", "Linear-28", "Block-7", "LayerNorm-15", "Linear-29", "Linear-30", "LayerNorm-16", "Linear-31", "GELU-8", "Linear-32", "Block-8", "LayerNorm-17", "Linear-33", "Linear-34", "LayerNorm-18", "Linear-35", "GELU-9", "Linear-36", "Block-9"], "encoder_3": ["LayerNorm-19", "Linear-37", "Linear-38", "LayerNorm-20", "Linear-39", "GELU-10", "Linear-40", "Block-10", "LayerNorm-21", "Linear-41", "Linear-42", "LayerNorm-22", "Linear-43", "GELU-11", "Linear-44", "Block-11", "LayerNorm-23", "Linear-45", "Linear-46", "LayerNorm-24", "Linear-47", "GELU-12", "Linear-48", "Block-12", "LayerNorm-25", "Linear-49", "Embedding-1", "LayerNorm-26", "Linear-50", "Linear-51", "Linear-52", "BertSelfAttention-1-1", "Linear-53", "LayerNorm-27", "Linear-54", "GELUActivation-1", "Linear-55"], "encoder_4": ["LayerNorm-28", "Linear-56", "Linear-57", "Linear-58", "BertSelfAttention-2-1", "Linear-59", "LayerNorm-29", "Linear-60", "GELUActivation-2", "Linear-61", "LayerNorm-30", "Linear-62", "Linear-63", "Linear-64", "BertSelfAttention-3-1", "Linear-65", "LayerNorm-31", "Linear-66", "GELUActivation-3", "Linear-67", "LayerNorm-32", "Linear-68", "Linear-69", "Linear-70", "BertSelfAttention-4-1", "Linear-71", "LayerNorm-33", "Linear-72", "GELUActivation-4", "Linear-73", "LayerNorm-34", "Linear-74", "Linear-75", "Linear-76", "BertSelfAttention-5-1", "Linear-77", "LayerNorm-35"], "encoder_5": ["Linear-78", "GELUActivation-5", "Linear-79", "LayerNorm-36", "Linear-80", "Linear-81", "Linear-82", "BertSelfAttention-6-1", "Linear-83", "LayerNorm-37", "Linear-84", "GELUActivation-6", "Linear-85", "LayerNorm-38", "Linear-86", "Linear-87", "Linear-88", "Linear-89", "BertSelfAttention-7-1", "Linear-90", "LayerNorm-39", "Linear-91", "Linear-92", "Linear-93", "BertSelfAttention-8-1", "Linear-94", 
"LayerNorm-40", "Linear-95", "GELUActivation-7", "Linear-96", "LayerNorm-41", "Linear-97", "Linear-98", "Linear-99", "BertSelfAttention-9-1", "Linear-100", "LayerNorm-42"], "encoder_6": ["Linear-101", "Linear-102", "Linear-103", "BertSelfAttention-10-1", "Linear-104", "LayerNorm-43", "Linear-105", "GELUActivation-8", "Linear-106", "LayerNorm-44", "Linear-107", "Linear-108", "Linear-109", "BertSelfAttention-11-1", "Linear-110", "LayerNorm-45", "Linear-111", "Linear-112", "Linear-113", "BertSelfAttention-12-1", "Linear-114", "LayerNorm-46", "Linear-115", "GELUActivation-9", "Linear-116", "LayerNorm-47", "Linear-117", "Linear-118", "Linear-119", "BertSelfAttention-13-1", "Linear-120", "LayerNorm-48", "Linear-121", "Linear-122", "Linear-123", "BertSelfAttention-14-1", "Linear-124"], "encoder_7": ["LayerNorm-49", "Linear-125", "GELUActivation-10", "Linear-126", "LayerNorm-50", "Linear-127", "Linear-128", "Linear-129", "BertSelfAttention-15-1", "Linear-130", "LayerNorm-51", "Linear-131", "Linear-132", "Linear-133", "BertSelfAttention-16-1", "Linear-134", "LayerNorm-52", "Linear-135", "GELUActivation-11", "Linear-136", "LayerNorm-53", "Linear-137", "Linear-138", "Linear-139", "BertSelfAttention-17-1", "Linear-140", "LayerNorm-54", "Linear-141", "Linear-142", "Linear-143", "BertSelfAttention-18-1", "Linear-144", "LayerNorm-55", "Linear-145", "GELUActivation-12", "Linear-146", "LayerNorm-56"], "best_layer": ["LayerNorm-56"]}, "blip": {"encoder_1": ["Conv2d-1", "Identity-1", "LayerNorm-1", "Linear-1", "Linear-2", "LayerNorm-2", "Linear-3", "GELU-1", "Linear-4", "Block-1", "LayerNorm-3", "Linear-5", "Linear-6", "LayerNorm-4", "Linear-7", "GELU-2", "Linear-8", "Block-2", "LayerNorm-5", "Linear-9", "Linear-10", "LayerNorm-6", "Linear-11", "GELU-3", "Linear-12", "Block-3", "LayerNorm-7", "Linear-13", "Linear-14", "LayerNorm-8", "Linear-15", "GELU-4", "Linear-16", "Block-4", "LayerNorm-9", "Linear-17", "Linear-18", "LayerNorm-10", "Linear-19", "GELU-5", "Linear-20", "Block-5"], 
"encoder_2": ["LayerNorm-11", "Linear-21", "Linear-22", "LayerNorm-12", "Linear-23", "GELU-6", "Linear-24", "Block-6", "LayerNorm-13", "Linear-25", "Linear-26", "LayerNorm-14", "Linear-27", "GELU-7", "Linear-28", "Block-7", "LayerNorm-15", "Linear-29", "Linear-30", "LayerNorm-16", "Linear-31", "GELU-8", "Linear-32", "Block-8", "LayerNorm-17", "Linear-33", "Linear-34", "LayerNorm-18", "Linear-35", "GELU-9", "Linear-36", "Block-9", "LayerNorm-19", "Linear-37", "Linear-38", "LayerNorm-20", "Linear-39", "GELU-10", "Linear-40", "Block-10", "LayerNorm-21", "Linear-41"], "encoder_3": ["Linear-42", "LayerNorm-22", "Linear-43", "GELU-11", "Linear-44", "Block-11", "LayerNorm-23", "Linear-45", "Linear-46", "LayerNorm-24", "Linear-47", "GELU-12", "Linear-48", "Block-12", "LayerNorm-25", "Embedding-1", "LayerNorm-26", "Linear-49", "Linear-50", "Linear-51", "BertSelfAttention-1-1", "Linear-52", "LayerNorm-27", "Linear-53", "Linear-54", "Linear-55", "BertSelfAttention-2-1", "Linear-56", "LayerNorm-28", "Linear-57", "GELUActivation-1", "Linear-58", "LayerNorm-29", "Linear-59", "Linear-60", "Linear-61", "BertSelfAttention-3-1", "Linear-62", "LayerNorm-30", "Linear-63", "Linear-64", "Linear-65"], "encoder_4": ["BertSelfAttention-4-1", "Linear-66", "LayerNorm-31", "Linear-67", "GELUActivation-2", "Linear-68", "LayerNorm-32", "Linear-69", "Linear-70", "Linear-71", "BertSelfAttention-5-1", "Linear-72", "LayerNorm-33", "Linear-73", "Linear-74", "Linear-75", "BertSelfAttention-6-1", "Linear-76", "LayerNorm-34", "Linear-77", "GELUActivation-3", "Linear-78", "LayerNorm-35", "Linear-79", "Linear-80", "Linear-81", "BertSelfAttention-7-1", "Linear-82", "LayerNorm-36", "Linear-83", "Linear-84", "Linear-85", "BertSelfAttention-8-1", "Linear-86", "LayerNorm-37", "Linear-87", "GELUActivation-4", "Linear-88", "LayerNorm-38", "Linear-89", "Linear-90", "Linear-91"], "encoder_5": ["BertSelfAttention-9-1", "Linear-92", "LayerNorm-39", "Linear-93", "Linear-94", "Linear-95", "BertSelfAttention-10-1", 
"Linear-96", "LayerNorm-40", "Linear-97", "GELUActivation-5", "Linear-98", "LayerNorm-41", "Linear-99", "Linear-100", "Linear-101", "BertSelfAttention-11-1", "Linear-102", "LayerNorm-42", "Linear-103", "Linear-104", "Linear-105", "BertSelfAttention-12-1", "Linear-106", "LayerNorm-43", "Linear-107", "GELUActivation-6", "Linear-108", "LayerNorm-44", "Linear-109", "Linear-110", "Linear-111", "BertSelfAttention-13-1", "Linear-112", "LayerNorm-45", "Linear-113", "Linear-114", "Linear-115", "BertSelfAttention-14-1", "Linear-116", "LayerNorm-46", "Linear-117"], "encoder_6": ["GELUActivation-7", "Linear-118", "LayerNorm-47", "Linear-119", "Linear-120", "Linear-121", "BertSelfAttention-15-1", "Linear-122", "LayerNorm-48", "Linear-123", "Linear-124", "Linear-125", "BertSelfAttention-16-1", "Linear-126", "LayerNorm-49", "Linear-127", "GELUActivation-8", "Linear-128", "LayerNorm-50", "Linear-129", "Linear-130", "Linear-131", "BertSelfAttention-17-1", "Linear-132", "LayerNorm-51", "Linear-133", "Linear-134", "Linear-135", "BertSelfAttention-18-1", "Linear-136", "LayerNorm-52", "Linear-137", "GELUActivation-9", "Linear-138", "LayerNorm-53", "Linear-139", "Linear-140", "Linear-141", "BertSelfAttention-19-1", "Linear-142", "LayerNorm-54", "Linear-143"], "encoder_7": ["Linear-144", "Linear-145", "BertSelfAttention-20-1", "Linear-146", "LayerNorm-55", "Linear-147", "GELUActivation-10", "Linear-148", "LayerNorm-56", "Linear-149", "Linear-150", "Linear-151", "BertSelfAttention-21-1", "Linear-152", "LayerNorm-57", "Linear-153", "Linear-154", "Linear-155", "BertSelfAttention-22-1", "Linear-156", "LayerNorm-58", "Linear-157", "GELUActivation-11", "Linear-158", "LayerNorm-59", "Linear-159", "Linear-160", "Linear-161", "BertSelfAttention-23-1", "Linear-162", "LayerNorm-60", "Linear-163", "Linear-164", "Linear-165", "BertSelfAttention-24-1", "Linear-166", "LayerNorm-61", "Linear-167", "GELUActivation-12", "Linear-168", "LayerNorm-62"], "best_layer": ["LayerNorm-62"]}}