key = 'roberta.embeddings.word_embeddings.weight':
value.shape = torch.Size([50265, 768]).
key = 'roberta.embeddings.position_embeddings.weight':
value.shape = torch.Size([514, 768]).
key = 'roberta.embeddings.token_type_embeddings.weight':
value.shape = torch.Size([1, 768]).
key = 'roberta.embeddings.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.embeddings.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.0.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.0.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.0.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.0.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.0.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.0.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.0.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.0.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.1.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.1.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.1.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.1.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.1.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.1.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.1.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.1.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.2.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.2.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.2.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.2.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.2.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.2.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.2.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.2.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.3.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.3.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.3.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.3.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.3.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.3.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.3.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.3.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.4.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.4.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.4.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.4.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.4.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.4.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.4.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.4.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.5.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.5.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.5.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.5.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.5.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.5.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.5.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.5.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.6.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.6.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.6.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.6.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.6.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.6.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.6.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.6.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.7.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.7.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.7.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.7.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.7.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.7.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.7.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.7.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.8.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.8.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.8.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.8.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.8.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.8.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.8.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.8.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.9.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.9.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.9.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.9.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.9.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.9.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.9.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.9.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.10.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.10.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.10.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.10.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.10.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.10.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.10.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.10.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.attention.self.query.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.11.attention.self.query.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.attention.self.key.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.11.attention.self.key.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.attention.self.value.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.11.attention.self.value.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.attention.output.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'roberta.encoder.layer.11.attention.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.attention.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.attention.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.intermediate.dense.weight':
value.shape = torch.Size([3072, 768]).
key = 'roberta.encoder.layer.11.intermediate.dense.bias':
value.shape = torch.Size([3072]).
key = 'roberta.encoder.layer.11.output.dense.weight':
value.shape = torch.Size([768, 3072]).
key = 'roberta.encoder.layer.11.output.dense.bias':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.output.LayerNorm.weight':
value.shape = torch.Size([768]).
key = 'roberta.encoder.layer.11.output.LayerNorm.bias':
value.shape = torch.Size([768]).
key = 'lm_head.bias':
value.shape = torch.Size([50265]).
key = 'lm_head.dense.weight':
value.shape = torch.Size([768, 768]).
key = 'lm_head.dense.bias':
value.shape = torch.Size([768]).
key = 'lm_head.layer_norm.weight':
value.shape = torch.Size([768]).
key = 'lm_head.layer_norm.bias':
value.shape = torch.Size([768]).
key = 'lm_head.decoder.weight':
value.shape = torch.Size([50265, 768]).
key = 'lm_head.decoder.bias':
value.shape = torch.Size([50265]).