[ ('config_version', 5),
  # This file is one flat list of (name, value) pairs. The short `#` labels
  # below only group related entries for the reader; they carry no data.
  # NOTE(review): many entries are set to -1 / -1.0 / '' (e.g. num_params,
  # output_embedding_size, cell_clip, early_stopping_worst_xe_target). These
  # look like "unset / disabled" sentinels -- confirm against the consumer.
  # NOTE(review): the long-decimal values (dropouts, l2_penalty,
  # learning_rate) look machine-generated, e.g. by a hyperparameter search --
  # confirm before hand-editing or rounding them.
  # data
  ('file_encoding', 'utf-8'),
  ('word_based', True),
  ('episodic', False),
  # model
  ('num_params', -1),
  ('share_input_and_output_embeddings', True),
  ('input_embedding_size', 256),
  ('output_embedding_size', -1),
  ('input_embedding_ratio', 1.0),
  ('output_embedding_ratio', -1.0),
  ('mos_num_components', 0),
  ('token_dropout', 0.0),
  ('embedding_dropout', 0.0),
  ('input_dropout', 0.42476247624762475),
  ('output_dropout', 0.45676567656765676),
  ('downprojected_output_dropout', -1.0),
  ('shared_mask_dropout', False),
  ('embed_once', True),
  # cell
  ('model', 'lstm'),
  ('num_layers', 1),
  ('residual_connections', False),
  ('lstm_skip_connection', False),
  ('feature_mask_rounds', 0),
  ('feature_mask_rank', 0),
  ('feature_mask', False),
  ('sparsity_ratio', -1.0),
  ('overlay_rank', -1),
  # hidden_size is a one-element list while num_layers is 1; presumably one
  # entry per layer -- TODO confirm before changing either value.
  ('hidden_size', [256]),
  ('hidden_size_multiplier', 1.0),
  ('layer_norm', False),
  # The activation is given as a string, not a callable; how the name is
  # resolved is not visible here.
  ('activation_fn', 'tf.tanh'),
  ('tie_forget_and_input_gates', False),
  ('cap_input_gate', False),
  ('trainable_initial_state', False),
  ('inter_layer_dropout', 0.735993599359936),
  ('state_dropout', 0.19633963396339635),
  ('state_dropout_flip_rate', 0.0),
  ('update_dropout', 0.0),
  ('cell_clip', -1.0),
  # objective
  ('model_average', 'arithmetic'),
  ('num_training_samples', 1),
  ('l2_penalty', 0.0003314731473147315),
  ('l1_penalty', 0.0),
  ('activation_norm_penalty', 0.0),
  ('drop_state_probability', 0.0),
  # initialization
  ('embedding_init_factor', 1.0),
  ('scale_input_embeddings', False),
  ('cell_init_factor', 1.0),
  ('forget_bias', 1.0),
  ('output_init_factor', 1.0),
  # schedule
  # steps_per_turn and print_training_stats_every_num_steps are both 200 in
  # this configuration.
  ('steps_per_turn', 200),
  ('print_training_stats_every_num_steps', 200),
  ('turns', 1000),
  # optimization
  # optimizer_type is 'adam'; the rmsprop_* entries are listed as well.
  # NOTE(review): presumably they are ignored for this optimizer -- confirm.
  ('optimizer_type', 'adam'),
  ('rmsprop_beta2', 0.999),
  ('rmsprop_epsilon', 1e-08),
  ('adam_beta1', 0.9),
  ('adam_beta2', 0.999),
  ('adam_epsilon', 1e-08),
  ('batch_size', 64),
  ('accum_batch_size', -1),
  ('max_grad_norm', 4.0),
  ('max_time_steps', 70),
  ('trigger_averaging_turns', 50),
  ('trigger_averaging_at_the_latest', 800),
  # learning rate
  # With learning_rate_decay and drop_learning_rate_multiplier both at 1.0,
  # these entries look like a constant learning rate -- TODO confirm.
  ('learning_rate', 0.0006843582835180154),
  ('learning_rate_decay', 1.0),
  ('learning_rate_decay_burn_in_steps', 0),
  ('drop_learning_rate_turns', -1),
  ('drop_learning_rate_multiplier', 1.0),
  ('drop_learning_rate_at_the_latest', -1),
  # early stopping
  ('early_stopping_turns', -1),
  ('early_stopping_rampup_turns', 0),
  ('early_stopping_worst_xe_target', ''),
  ('early_stopping_slowest_rate', 0.0),
  # cross-validation
  # crossvalidate is False; the folds/rounds entries are still listed.
  ('crossvalidate', False),
  ('crossvalidation_folds', 10),
  ('crossvalidation_rounds', 1),
  # evaluation
  ('max_training_eval_batches', 20),
  ('max_eval_eval_batches', -1),
  ('max_test_eval_batches', -1),
  ('min_non_episodic_eval_examples_per_stripe', 100),
  ('eval_on_test', False),
  ('eval_method', 'deterministic'),
  ('num_eval_samples', 0),
  # NOTE(review): the temperature is negative (-0.8). Presumably the sign is
  # special-cased by the consumer (an *_estimation_num_tokens entry follows)
  # -- confirm; do not "fix" the sign without checking.
  ('eval_softmax_temperature', -0.8),
  ('eval_softmax_temperature_estimation_num_tokens', 50000),
  ('eval_power_mean_power', 1.0),
  ('eval_dropout_multiplier', 1.0),
  ('validation_prediction_file', ''),
  # dyneval is False; the dyneval_* entries below are still listed.
  ('dyneval', False),
  ('dyneval_learning_rate', 0.001),
  ('dyneval_decay_rate', 0.02),
  ('dyneval_epsilon', 1e-05),
  # experiments
  # (this group has no entries in this file)
  # checkpoints
  ('save_checkpoints', True),
  # misc
  ('seed', 1),
  ('swap_memory', False),
  ('log_device_placement', False),
  ('summary_flush_secs', 120),
]
