Upload 8 files

Browse files

Files changed (8) hide show

MDX23v24/models/MDX23C-8KFFT-InstVoc_HQ.ckpt +3 -0
MDX23v24/models/config_vocals_segm_models.yaml +48 -0
MDX23v24/models/model.safetensors +3 -0
MDX23v24/models/model_2_stem_061321.yaml +36 -0
MDX23v24/models/model_2_stem_full_band_8k.yaml +43 -0
MDX23v24/models/model_bs_roformer_ep_317_sdr_12.9755.ckpt +3 -0
MDX23v24/models/model_bs_roformer_ep_317_sdr_12.9755.yaml +133 -0
MDX23v24/models/model_vocals_segm_models_sdr_9.77.ckpt +3 -0

MDX23v24/models/MDX23C-8KFFT-InstVoc_HQ.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:49d51472769e34a2501cd1da782346a3212555c3a5619fc2c53507445528d816
+size 448101203

MDX23v24/models/config_vocals_segm_models.yaml ADDED Viewed

	@@ -0,0 +1,48 @@

+audio:  # audio chunking and STFT settings for config_vocals_segm_models.yaml
+  chunk_size: 261632  # equals hop_length * (dim_t - 1): 512 * 511
+  dim_f: 4096  # equals n_fft / 2 (8192 / 2)
+  dim_t: 512  # inference.dim_t below carries the same value
+  hop_length: 512
+  n_fft: 8192
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.001
+model:  # network settings: encoder, decoder type, activation and sizes
+  encoder_name: tu-maxvit_large_tf_512 # available encoders are listed at https://github.com/qubvel/segmentation_models.pytorch#encoders-
+  decoder_type: unet # options: unet, fpn
+  act: gelu
+  num_channels: 128
+  num_subbands: 8
+training:  # batch, learning-rate, augmentation and optimizer settings
+  batch_size: 8
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  instruments:  # two stems, lowercase names
+  - vocals
+  - other
+  lr: 5.0e-05
+  patience: 2  # presumably LR-scheduler patience (paired with reduce_factor) - confirm
+  reduce_factor: 0.95
+  target_instrument: null
+  num_epochs: 1000
+  num_steps: 2000
+  augmentation: false # enables augmentations via audiomentations and pedalboard
+  augmentation_type: simple1
+  use_mp3_compress: false # deprecated
+  augmentation_mix: true # mix several stems of the same type with some probability
+  augmentation_loudness: true # randomly change the loudness of each stem
+  augmentation_loudness_type: 1 # type 1 or 2
+  augmentation_loudness_min: 0.5
+  augmentation_loudness_max: 1.5
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  optimizer: adamw
+  other_fix: true # needed for validation on the multisong dataset when the "other" stem is actually the instrumental
+inference:  # inference-time settings
+  batch_size: 1
+  dim_t: 512  # same value as audio.dim_t
+  num_overlap: 4

MDX23v24/models/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c44aad6e89377d68458a95c6356730f14a1c742a10b6426608c50199e86fb04
+size 850242572

MDX23v24/models/model_2_stem_061321.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+audio:  # audio chunking and STFT settings for model_2_stem_061321.yaml
+  chunk_size: 260096  # equals hop_length * 127 (2048 * 127); NOTE(review): hop_length * (dim_t - 1) would be 522240 - confirm this value is intended
+  dim_f: 4096  # equals n_fft / 3 (12288 / 3)
+  dim_t: 256  # inference.dim_t below carries the same value
+  hop_length: 2048
+  n_fft: 12288
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.001
+model:  # network architecture settings
+  act: gelu
+  bottleneck_factor: 4
+  growth: 64
+  norm: InstanceNorm
+  num_blocks_per_scale: 2
+  num_channels: 128
+  num_scales: 5
+  num_subbands: 4
+  scale:  # two-element list: [2, 2]
+  - 2
+  - 2
+  name: epoch_10.ckpt  # NOTE(review): a checkpoint filename stored under model; presumably leftover metadata - confirm whether anything reads it
+training:  # batch, learning-rate and schedule settings
+  batch_size: 16
+  grad_clip: 0
+  instruments:  # two stems, capitalized names
+  - Vocals
+  - Instrumental
+  lr: 5.0e-05
+  target_instrument: null
+  num_epochs: 100
+  num_steps: 1000
+inference:  # inference-time settings
+  batch_size: 1
+  dim_t: 256  # same value as audio.dim_t
+  num_overlap: 8

MDX23v24/models/model_2_stem_full_band_8k.yaml ADDED Viewed

	@@ -0,0 +1,43 @@

+audio:  # audio chunking and STFT settings for model_2_stem_full_band_8k.yaml
+  chunk_size: 261120  # equals hop_length * (dim_t - 1): 1024 * 255
+  dim_f: 4096  # equals n_fft / 2 (8192 / 2)
+  dim_t: 256  # inference.dim_t below carries the same value
+  hop_length: 1024
+  n_fft: 8192
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.001
+model:  # network architecture settings
+  act: gelu
+  bottleneck_factor: 4
+  growth: 128
+  norm: InstanceNorm
+  num_blocks_per_scale: 2
+  num_channels: 128
+  num_scales: 5
+  num_subbands: 4
+  scale:  # two-element list: [2, 2]
+  - 2
+  - 2
+training:  # batch, learning-rate, schedule and augmentation settings
+  batch_size: 6
+  grad_clip: 0
+  instruments:  # two stems, capitalized names
+  - Vocals
+  - Instrumental
+  lr: 1.0e-05
+  patience: 2  # presumably LR-scheduler patience (paired with reduce_factor) - confirm
+  reduce_factor: 0.95
+  target_instrument: null
+  num_epochs: 1000
+  num_steps: 1000
+  augmentation: 1  # NOTE(review): integer here, while other configs in this listing use a boolean - confirm the consumer treats 1 as enabled
+  augmentation_type: simple1
+  augmentation_mix: true
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+inference:  # inference-time settings
+  batch_size: 1
+  dim_t: 256  # same value as audio.dim_t
+  num_overlap: 8

MDX23v24/models/model_bs_roformer_ep_317_sdr_12.9755.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b84f37e8d444c8cb30c79d77f613a41c05868ff9c9ac6c7049c00aefae115aa
+size 639331213

MDX23v24/models/model_bs_roformer_ep_317_sdr_12.9755.yaml ADDED Viewed

	@@ -0,0 +1,133 @@

+audio:  # audio chunking and STFT settings for the BS-Roformer config
+  chunk_size: 352800  # equals hop_length * (dim_t - 1): 441 * 800; also 8 * sample_rate
+  dim_f: 1024  # equals n_fft / 2 (2048 / 2)
+  dim_t: 801 # not honored here (the model section is used instead)
+  hop_length: 441 # not honored here (the model section is used instead; see model.stft_hop_length)
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.001
+model:  # BS-Roformer network, STFT and multi-resolution STFT loss settings
+  dim: 512
+  depth: 12
+  stereo: true
+  num_stems: 1
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  freqs_per_bands: !!python/tuple  # Python-specific tag: safe YAML loaders reject it; 62 entries follow
+    - 2  # 24 entries of 2 (sum 48)
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 4  # 12 entries of 4 (sum 48)
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 12  # 8 entries of 12 (sum 96)
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 24  # 8 entries of 24 (sum 192)
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 48  # 8 entries of 48 (sum 384)
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 128
+    - 129  # all entries together sum to 1025, matching dim_freqs_in below
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0.1
+  ff_dropout: 0.1
+  flash_attn: true
+  dim_freqs_in: 1025  # equals stft_n_fft / 2 + 1 (2048 / 2 + 1)
+  stft_n_fft: 2048  # same value as audio.n_fft
+  stft_hop_length: 441  # same value as audio.hop_length
+  stft_win_length: 2048
+  stft_normalized: false
+  mask_estimator_depth: 2
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes: !!python/tuple  # Python-specific tag, as on freqs_per_bands; items here are not indented, unlike that list
+  - 4096
+  - 2048
+  - 1024
+  - 512
+  - 256
+  multi_stft_hop_size: 147  # equals stft_hop_length / 3 (441 / 3)
+  multi_stft_normalized: False  # NOTE(review): capitalized, unlike stft_normalized: false above - prefer lowercase for consistency
+training:  # batch, learning-rate, augmentation, optimizer and precision settings
+  batch_size: 16
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  instruments:  # two stems, capitalized names
+  - Vocals
+  - Instrumental
+  lr: 5.0e-05
+  patience: 2  # presumably LR-scheduler patience (paired with reduce_factor) - confirm
+  reduce_factor: 0.95
+  target_instrument: Vocals  # one of the instruments listed above; model.num_stems is 1
+  num_epochs: 1000
+  num_steps: 1000
+  augmentation: false # enables augmentations via audiomentations and pedalboard
+  augmentation_type: simple1
+  use_mp3_compress: false # deprecated
+  augmentation_mix: true # mix several stems of the same type with some probability
+  augmentation_loudness: true # randomly change the loudness of each stem
+  augmentation_loudness_type: 1 # type 1 or 2
+  augmentation_loudness_min: 0.5
+  augmentation_loudness_max: 1.5
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  optimizer: adam
+  other_fix: false # needed for validation on the multisong dataset when the "other" stem is actually the instrumental
+  use_amp: true # enables mixed precision (float16); should usually be true
+inference:  # inference-time settings
+  batch_size: 1
+  dim_t: 801  # same value as audio.dim_t
+  num_overlap: 4

MDX23v24/models/model_vocals_segm_models_sdr_9.77.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9cb6e969309f96602318fcf5970a6973899db86e5fd9d8f9cf8f15bacdd299bb
+size 863683537