Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

README.md +20 -0
bigvgan_generator.safetensors +3 -0
config.json +178 -0
gpt.safetensors +3 -0
model.safetensors +3 -0
model.safetensors.index.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,20 @@

+---
+license: apache-2.0
+pipeline_tag: text-to-speech
+library_name: index-tts
+tags:
+- mlx
+---
+# mlx-community/IndexTTS
+This model was converted to MLX format from [`IndexTeam/Index-TTS`](https://huggingface.co/IndexTeam/Index-TTS) using mlx-audio version **0.2.3**.
+Refer to the [original model card](https://huggingface.co/IndexTeam/Index-TTS) for more details on the model.
+## Use with mlx
+```bash
+pip install -U mlx-audio
+```
+```bash
+python -m mlx_audio.tts.generate --model mlx-community/IndexTTS --text "Describe this image."
+```

bigvgan_generator.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84d41b095e801f1cf01a56feabe9e5381cc97792f43d9791a31b0bf83da2c136
+size 524895512

config.json ADDED Viewed

	@@ -0,0 +1,178 @@

+{
+    "bigvgan": {
+        "adam_b1": 0.8,
+        "adam_b2": 0.99,
+        "lr_decay": 0.999998,
+        "seed": 1234,
+        "resblock": "1",
+        "upsample_rates": [
+            4,
+            4,
+            4,
+            4,
+            2,
+            2
+        ],
+        "upsample_kernel_sizes": [
+            8,
+            8,
+            4,
+            4,
+            4,
+            4
+        ],
+        "upsample_initial_channel": 1536,
+        "resblock_kernel_sizes": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "feat_upsample": false,
+        "speaker_embedding_dim": 512,
+        "cond_d_vector_in_each_upsampling_layer": true,
+        "gpt_dim": 1024,
+        "activation": "snakebeta",
+        "snake_logscale": true,
+        "use_cqtd_instead_of_mrd": true,
+        "cqtd_filters": 128,
+        "cqtd_max_filters": 1024,
+        "cqtd_filters_scale": 1,
+        "cqtd_dilations": [
+            1,
+            2,
+            4
+        ],
+        "cqtd_hop_lengths": [
+            512,
+            256,
+            256
+        ],
+        "cqtd_n_octaves": [
+            9,
+            9,
+            9
+        ],
+        "cqtd_bins_per_octaves": [
+            24,
+            36,
+            48
+        ],
+        "resolutions": [
+            [
+                1024,
+                120,
+                600
+            ],
+            [
+                2048,
+                240,
+                1200
+            ],
+            [
+                512,
+                50,
+                240
+            ]
+        ],
+        "mpd_reshapes": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_spectral_norm": false,
+        "discriminator_channel_mult": 1,
+        "use_multiscale_melloss": true,
+        "lambda_melloss": 15,
+        "clip_grad_norm": 1000,
+        "segment_size": 16384,
+        "num_mels": 100,
+        "num_freq": 1025,
+        "n_fft": 1024,
+        "hop_size": 256,
+        "win_size": 1024,
+        "sampling_rate": 24000,
+        "fmin": 0,
+        "fmax": null,
+        "fmax_for_loss": null,
+        "mel_type": "pytorch",
+        "num_workers": 2,
+        "dist_config": {
+            "dist_backend": "nccl",
+            "dist_url": "tcp://localhost:54321",
+            "world_size": 1
+        }
+    },
+    "bigvgan_checkpoint": "bigvgan_generator.pth",
+    "dataset": {
+        "bpe_model": "checkpoints/bpe.model",
+        "sample_rate": 24000,
+        "squeeze": false,
+        "mel": {
+            "sample_rate": 24000,
+            "n_fft": 1024,
+            "hop_length": 256,
+            "win_length": 1024,
+            "n_mels": 100,
+            "mel_fmin": 0,
+            "normalize": false
+        }
+    },
+    "dvae_checkpoint": "dvae.pth",
+    "gpt": {
+        "model_dim": 1024,
+        "max_mel_tokens": 605,
+        "max_text_tokens": 402,
+        "heads": 16,
+        "use_mel_codes_as_input": true,
+        "mel_length_compression": 1024,
+        "layers": 20,
+        "number_text_tokens": 12000,
+        "number_mel_codes": 8194,
+        "start_mel_token": 8192,
+        "stop_mel_token": 8193,
+        "start_text_token": 0,
+        "stop_text_token": 1,
+        "train_solo_embeddings": false,
+        "condition_type": "conformer_perceiver",
+        "condition_module": {
+            "output_size": 512,
+            "linear_units": 2048,
+            "attention_heads": 8,
+            "num_blocks": 6,
+            "input_layer": "conv2d2",
+            "perceiver_mult": 2
+        }
+    },
+    "gpt_checkpoint": "gpt.pth",
+    "vqvae": {
+        "channels": 100,
+        "num_tokens": 8192,
+        "hidden_dim": 512,
+        "num_resnet_blocks": 3,
+        "codebook_dim": 512,
+        "num_layers": 2,
+        "positional_dims": 1,
+        "kernel_size": 3,
+        "smooth_l1_loss": true,
+        "use_transposed_convs": false
+    }
+}

gpt.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2921cef3d321d4759d6f85c0df42c1a55d84c6adc0a12637d8e7b2c424fa6bc
+size 696434870

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a4b1b52c4078878b010ba47224c93ed857261d59b9af38acca5e5c4744046ec
+size 953746416

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff