| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						model: Emotion2vec | 
					
					
						
						| 
							 | 
						model_conf: | 
					
					
						
						| 
							 | 
						  _name: data2vec_multi | 
					
					
						
						| 
							 | 
						  activation_dropout: 0.0 | 
					
					
						
						| 
							 | 
						  adversarial_hidden_dim: 128 | 
					
					
						
						| 
							 | 
						  adversarial_training: false | 
					
					
						
						| 
							 | 
						  adversarial_weight: 0.1 | 
					
					
						
						| 
							 | 
						  attention_dropout: 0.1 | 
					
					
						
						| 
							 | 
						  average_top_k_layers: 16 | 
					
					
						
						| 
							 | 
						  batch_norm_target_layer: false | 
					
					
						
						| 
							 | 
						  clone_batch: 12 | 
					
					
						
						| 
							 | 
						  cls_loss: 1.0 | 
					
					
						
						| 
							 | 
						  cls_type: chunk | 
					
					
						
						| 
							 | 
						  d2v_loss: 1.0 | 
					
					
						
						| 
							 | 
						  decoder_group: false | 
					
					
						
						| 
							 | 
						  depth: 8 | 
					
					
						
						| 
							 | 
						  dropout_input: 0.0 | 
					
					
						
						| 
							 | 
						  ema_anneal_end_step: 20000 | 
					
					
						
						| 
							 | 
						  ema_decay: 0.9997 | 
					
					
						
						| 
							 | 
						  ema_encoder_only: false | 
					
					
						
						| 
							 | 
						  ema_end_decay: 1.0 | 
					
					
						
						| 
							 | 
						  ema_same_dtype: true | 
					
					
						
						| 
							 | 
						  embed_dim: 1024 | 
					
					
						
						| 
							 | 
						  encoder_dropout: 0.1 | 
					
					
						
						| 
							 | 
						  end_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						  end_of_block_targets: false | 
					
					
						
						| 
							 | 
						  instance_norm_target_layer: true | 
					
					
						
						| 
							 | 
						  instance_norm_targets: false | 
					
					
						
						| 
							 | 
						  layer_norm_first: false | 
					
					
						
						| 
							 | 
						  layer_norm_target_layer: false | 
					
					
						
						| 
							 | 
						  layer_norm_targets: false | 
					
					
						
						| 
							 | 
						  layerdrop: 0.0 | 
					
					
						
						| 
							 | 
						  log_norms: true | 
					
					
						
						| 
							 | 
						  loss_beta: 0.0 | 
					
					
						
						| 
							 | 
						  loss_scale: null | 
					
					
						
						| 
							 | 
						  mae_init: false | 
					
					
						
						| 
							 | 
						  max_update: 100000 | 
					
					
						
						| 
							 | 
						  min_pred_var: 0.01 | 
					
					
						
						| 
							 | 
						  min_target_var: 0.1 | 
					
					
						
						| 
							 | 
						  mlp_ratio: 4.0 | 
					
					
						
						| 
							 | 
						  normalize: true | 
					
					
						
						| 
							 | 
						  modalities: | 
					
					
						
						| 
							 | 
						    _name: null | 
					
					
						
						| 
							 | 
						    audio: | 
					
					
						
						| 
							 | 
						      add_masks: false | 
					
					
						
						| 
							 | 
						      alibi_max_pos: null | 
					
					
						
						| 
							 | 
						      alibi_scale: 1.0 | 
					
					
						
						| 
							 | 
						      conv_pos_depth: 5 | 
					
					
						
						| 
							 | 
						      conv_pos_groups: 16 | 
					
					
						
						| 
							 | 
						      conv_pos_pre_ln: false | 
					
					
						
						| 
							 | 
						      conv_pos_width: 95 | 
					
					
						
						| 
							 | 
						      decoder: | 
					
					
						
						| 
							 | 
						        add_positions_all: false | 
					
					
						
						| 
							 | 
						        add_positions_masked: false | 
					
					
						
						| 
							 | 
						        decoder_dim: 768 | 
					
					
						
						| 
							 | 
						        decoder_groups: 16 | 
					
					
						
						| 
							 | 
						        decoder_kernel: 7 | 
					
					
						
						| 
							 | 
						        decoder_layers: 4 | 
					
					
						
						| 
							 | 
						        decoder_residual: true | 
					
					
						
						| 
							 | 
						        input_dropout: 0.1 | 
					
					
						
						| 
							 | 
						        projection_layers: 1 | 
					
					
						
						| 
							 | 
						        projection_ratio: 2.0 | 
					
					
						
						| 
							 | 
						      ema_local_encoder: false | 
					
					
						
						| 
							 | 
						      encoder_zero_mask: true | 
					
					
						
						| 
							 | 
						      end_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						      extractor_mode: layer_norm | 
					
					
						
						| 
							 | 
						      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' | 
					
					
						
						| 
							 | 
						      init_extra_token_zero: true | 
					
					
						
						| 
							 | 
						      inverse_mask: false | 
					
					
						
						| 
							 | 
						      keep_masked_pct: 0.0 | 
					
					
						
						| 
							 | 
						      learned_alibi: false | 
					
					
						
						| 
							 | 
						      learned_alibi_scale: true | 
					
					
						
						| 
							 | 
						      learned_alibi_scale_per_head: true | 
					
					
						
						| 
							 | 
						      learned_alibi_scale_per_layer: false | 
					
					
						
						| 
							 | 
						      local_grad_mult: 1.0 | 
					
					
						
						| 
							 | 
						      mask_channel_length: 64 | 
					
					
						
						| 
							 | 
						      mask_channel_prob: 0.0 | 
					
					
						
						| 
							 | 
						      mask_dropout: 0.0 | 
					
					
						
						| 
							 | 
						      mask_length: 5 | 
					
					
						
						| 
							 | 
						      mask_noise_std: 0.01 | 
					
					
						
						| 
							 | 
						      mask_prob: 0.55 | 
					
					
						
						| 
							 | 
						      mask_prob_adjust: 0.1 | 
					
					
						
						| 
							 | 
						      mask_prob_min: null | 
					
					
						
						| 
							 | 
						      model_depth: 8 | 
					
					
						
						| 
							 | 
						      num_alibi_heads: 16 | 
					
					
						
						| 
							 | 
						      num_extra_tokens: 10 | 
					
					
						
						| 
							 | 
						      prenet_depth: 4 | 
					
					
						
						| 
							 | 
						      prenet_dropout: 0.1 | 
					
					
						
						| 
							 | 
						      prenet_layerdrop: 0.0 | 
					
					
						
						| 
							 | 
						      remove_masks: false | 
					
					
						
						| 
							 | 
						      start_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						      type: AUDIO | 
					
					
						
						| 
							 | 
						      use_alibi_encoder: true | 
					
					
						
						| 
							 | 
						    image: | 
					
					
						
						| 
							 | 
						      add_masks: false | 
					
					
						
						| 
							 | 
						      alibi_dims: 2 | 
					
					
						
						| 
							 | 
						      alibi_distance: manhattan | 
					
					
						
						| 
							 | 
						      alibi_max_pos: null | 
					
					
						
						| 
							 | 
						      alibi_scale: 1.0 | 
					
					
						
						| 
							 | 
						      decoder: | 
					
					
						
						| 
							 | 
						        add_positions_all: false | 
					
					
						
						| 
							 | 
						        add_positions_masked: false | 
					
					
						
						| 
							 | 
						        decoder_dim: 384 | 
					
					
						
						| 
							 | 
						        decoder_groups: 16 | 
					
					
						
						| 
							 | 
						        decoder_kernel: 5 | 
					
					
						
						| 
							 | 
						        decoder_layers: 5 | 
					
					
						
						| 
							 | 
						        decoder_residual: true | 
					
					
						
						| 
							 | 
						        input_dropout: 0.1 | 
					
					
						
						| 
							 | 
						        projection_layers: 1 | 
					
					
						
						| 
							 | 
						        projection_ratio: 2.0 | 
					
					
						
						| 
							 | 
						      ema_local_encoder: false | 
					
					
						
						| 
							 | 
						      embed_dim: 768 | 
					
					
						
						| 
							 | 
						      enc_dec_transformer: false | 
					
					
						
						| 
							 | 
						      encoder_zero_mask: true | 
					
					
						
						| 
							 | 
						      end_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						      fixed_positions: true | 
					
					
						
						| 
							 | 
						      in_chans: 3 | 
					
					
						
						| 
							 | 
						      init_extra_token_zero: true | 
					
					
						
						| 
							 | 
						      input_size: 224 | 
					
					
						
						| 
							 | 
						      inverse_mask: false | 
					
					
						
						| 
							 | 
						      keep_masked_pct: 0.0 | 
					
					
						
						| 
							 | 
						      learned_alibi: false | 
					
					
						
						| 
							 | 
						      learned_alibi_scale: false | 
					
					
						
						| 
							 | 
						      learned_alibi_scale_per_head: false | 
					
					
						
						| 
							 | 
						      learned_alibi_scale_per_layer: false | 
					
					
						
						| 
							 | 
						      local_grad_mult: 1.0 | 
					
					
						
						| 
							 | 
						      mask_channel_length: 64 | 
					
					
						
						| 
							 | 
						      mask_channel_prob: 0.0 | 
					
					
						
						| 
							 | 
						      mask_dropout: 0.0 | 
					
					
						
						| 
							 | 
						      mask_length: 5 | 
					
					
						
						| 
							 | 
						      mask_noise_std: 0.01 | 
					
					
						
						| 
							 | 
						      mask_prob: 0.7 | 
					
					
						
						| 
							 | 
						      mask_prob_adjust: 0.0 | 
					
					
						
						| 
							 | 
						      mask_prob_min: null | 
					
					
						
						| 
							 | 
						      model_depth: 8 | 
					
					
						
						| 
							 | 
						      num_alibi_heads: 16 | 
					
					
						
						| 
							 | 
						      num_extra_tokens: 0 | 
					
					
						
						| 
							 | 
						      patch_size: 16 | 
					
					
						
						| 
							 | 
						      prenet_depth: 4 | 
					
					
						
						| 
							 | 
						      prenet_dropout: 0.0 | 
					
					
						
						| 
							 | 
						      prenet_layerdrop: 0.0 | 
					
					
						
						| 
							 | 
						      remove_masks: false | 
					
					
						
						| 
							 | 
						      start_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						      transformer_decoder: false | 
					
					
						
						| 
							 | 
						      type: IMAGE | 
					
					
						
						| 
							 | 
						      use_alibi_encoder: false | 
					
					
						
						| 
							 | 
						    text: | 
					
					
						
						| 
							 | 
						      add_masks: false | 
					
					
						
						| 
							 | 
						      alibi_max_pos: null | 
					
					
						
						| 
							 | 
						      alibi_scale: 1.0 | 
					
					
						
						| 
							 | 
						      decoder: | 
					
					
						
						| 
							 | 
						        add_positions_all: false | 
					
					
						
						| 
							 | 
						        add_positions_masked: false | 
					
					
						
						| 
							 | 
						        decoder_dim: 384 | 
					
					
						
						| 
							 | 
						        decoder_groups: 16 | 
					
					
						
						| 
							 | 
						        decoder_kernel: 5 | 
					
					
						
						| 
							 | 
						        decoder_layers: 5 | 
					
					
						
						| 
							 | 
						        decoder_residual: true | 
					
					
						
						| 
							 | 
						        input_dropout: 0.1 | 
					
					
						
						| 
							 | 
						        projection_layers: 1 | 
					
					
						
						| 
							 | 
						        projection_ratio: 2.0 | 
					
					
						
						| 
							 | 
						      dropout: 0.1 | 
					
					
						
						| 
							 | 
						      ema_local_encoder: false | 
					
					
						
						| 
							 | 
						      encoder_zero_mask: true | 
					
					
						
						| 
							 | 
						      end_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						      init_extra_token_zero: true | 
					
					
						
						| 
							 | 
						      inverse_mask: false | 
					
					
						
						| 
							 | 
						      keep_masked_pct: 0.0 | 
					
					
						
						| 
							 | 
						      layernorm_embedding: true | 
					
					
						
						| 
							 | 
						      learned_alibi: false | 
					
					
						
						| 
							 | 
						      learned_alibi_scale: false | 
					
					
						
						| 
							 | 
						      learned_alibi_scale_per_head: false | 
					
					
						
						| 
							 | 
						      learned_alibi_scale_per_layer: false | 
					
					
						
						| 
							 | 
						      learned_pos: true | 
					
					
						
						| 
							 | 
						      local_grad_mult: 1.0 | 
					
					
						
						| 
							 | 
						      mask_channel_length: 64 | 
					
					
						
						| 
							 | 
						      mask_channel_prob: 0.0 | 
					
					
						
						| 
							 | 
						      mask_dropout: 0.0 | 
					
					
						
						| 
							 | 
						      mask_length: 5 | 
					
					
						
						| 
							 | 
						      mask_noise_std: 0.01 | 
					
					
						
						| 
							 | 
						      mask_prob: 0.7 | 
					
					
						
						| 
							 | 
						      mask_prob_adjust: 0.0 | 
					
					
						
						| 
							 | 
						      mask_prob_min: null | 
					
					
						
						| 
							 | 
						      max_source_positions: 512 | 
					
					
						
						| 
							 | 
						      model_depth: 8 | 
					
					
						
						| 
							 | 
						      no_scale_embedding: true | 
					
					
						
						| 
							 | 
						      no_token_positional_embeddings: false | 
					
					
						
						| 
							 | 
						      num_alibi_heads: 16 | 
					
					
						
						| 
							 | 
						      num_extra_tokens: 0 | 
					
					
						
						| 
							 | 
						      prenet_depth: 4 | 
					
					
						
						| 
							 | 
						      prenet_dropout: 0.0 | 
					
					
						
						| 
							 | 
						      prenet_layerdrop: 0.0 | 
					
					
						
						| 
							 | 
						      remove_masks: false | 
					
					
						
						| 
							 | 
						      start_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						      type: TEXT | 
					
					
						
						| 
							 | 
						      use_alibi_encoder: false | 
					
					
						
						| 
							 | 
						  norm_affine: true | 
					
					
						
						| 
							 | 
						  norm_eps: 1.0e-05 | 
					
					
						
						| 
							 | 
						  num_heads: 16 | 
					
					
						
						| 
							 | 
						  post_mlp_drop: 0.1 | 
					
					
						
						| 
							 | 
						  recon_loss: 0.0 | 
					
					
						
						| 
							 | 
						  seed: 1 | 
					
					
						
						| 
							 | 
						  shared_decoder: null | 
					
					
						
						| 
							 | 
						  skip_ema: false | 
					
					
						
						| 
							 | 
						  start_drop_path_rate: 0.0 | 
					
					
						
						| 
							 | 
						  supported_modality: AUDIO | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						tokenizer: CharTokenizer | 
					
					
						
						| 
							 | 
						tokenizer_conf: | 
					
					
						
						| 
							 | 
						  unk_symbol: <unk> | 
					
					
						
						| 
							 | 
						  split_with_space: true | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						scope_map: | 
					
					
						
						| 
							 | 
						  - 'd2v_model.' | 
					
					
						
						| 
							 | 
						  - none | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 |