Upload 3 files

Browse files

Files changed (4) hide show

.gitattributes +1 -0
arjun_das_output_audio.mp3 +3 -0
handler.py +41 -0
requirements.txt +75 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+arjun_das_output_audio.mp3 filter=lfs diff=lfs merge=lfs -text

arjun_das_output_audio.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2a2dcb4c1e2e75c316cbfb4fb38a8f4b63641082ee2dfb9613a43274a30e65f
+size 207654

handler.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import torchaudio as ta
+from chatterbox.tts import ChatterboxTTS
+from typing import Dict, Any, List
+import soundfile as sf
+import io
+import base64
+class EndpointHandler:
+    """Callable request handler that synthesizes speech with ChatterboxTTS.
+
+    The constructor loads the model onto the GPU; calling the instance with a
+    request dict returns a one-element list holding either the generated
+    audio (WAV bytes, base64-encoded) or an error message.
+    """
+
+    # File name of the reference voice clip handed to ``generate`` as the
+    # audio prompt.
+    AUDIO_PROMPT_FILENAME = "arjun_das_output_audio.mp3"
+
+    def __init__(self, path: str = ""):
+        """Load the TTS model and locate the reference voice clip.
+
+        Args:
+            path: Directory to look in for the reference clip first
+                (presumably the deployed repository directory -- confirm
+                against the serving runtime). When empty, or when the clip
+                is not found there, the bare file name is used, i.e. it is
+                resolved against the current working directory.
+
+        Raises:
+            RuntimeError: If the model cannot be loaded; the original
+                exception is chained as ``__cause__``.
+        """
+        import os  # local import so this block does not depend on file-level edits
+
+        try:
+            self.model = ChatterboxTTS.from_pretrained(device="cuda")
+        except Exception as e:
+            # Chain the cause so the underlying traceback is not lost.
+            raise RuntimeError(f"[ERROR] Failed to load model: {e}") from e
+        candidate = os.path.join(path, self.AUDIO_PROMPT_FILENAME) if path else self.AUDIO_PROMPT_FILENAME
+        # Fall back to the CWD-relative name so existing deployments keep working.
+        self.audio_prompt_path = candidate if os.path.isfile(candidate) else self.AUDIO_PROMPT_FILENAME
+
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Generate speech for one request.
+
+        Args:
+            data: Request payload. ``data["inputs"]`` is either the text to
+                speak (a string) or a dict with a required ``"text"`` key and
+                optional ``"exaggeration"`` (default 0.3) and ``"cfg_weight"``
+                (default 0.5) keys.
+
+        Returns:
+            ``[{"audio_base64": <str>}]`` on success, where the string is the
+            base64 encoding of a WAV file, or ``[{"error": <str>}]`` if
+            anything fails. Failures are reported in the response instead of
+            being raised.
+        """
+        try:
+            inputs = data.get("inputs") or {}
+            if isinstance(inputs, str):
+                # Accept the plain-string form of "inputs" as the text itself.
+                inputs = {"text": inputs}
+            text = inputs.get("text")
+            if not isinstance(text, str) or not text.strip():
+                raise ValueError('"inputs.text" must be a non-empty string')
+            # JSON clients may send numbers as strings; normalize to float.
+            exaggeration = float(inputs.get("exaggeration", 0.3))
+            cfg_weight = float(inputs.get("cfg_weight", 0.5))
+            print(exaggeration, cfg_weight)
+            wav = self.model.generate(
+                text,
+                audio_prompt_path=self.audio_prompt_path,
+                exaggeration=exaggeration,
+                cfg_weight=cfg_weight,
+            )
+            # Serialize the waveform to WAV in memory; the transpose puts the
+            # sample axis first, as soundfile expects (frames, channels).
+            buffer = io.BytesIO()
+            sf.write(buffer, wav.detach().cpu().numpy().T, self.model.sr, format='WAV')
+            audio_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+            return [{"audio_base64": audio_base64}]
+        except Exception as e:
+            # Top-level request boundary: log and return the error to the caller.
+            print(f"[ERROR] Inference failed: {e}")
+            return [{"error": str(e)}]

requirements.txt ADDED Viewed

	@@ -0,0 +1,75 @@

+audioread==3.0.1
+certifi==2025.6.15
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.2
+chatterbox-tts==0.1.2
+conformer==0.3.2
+decorator==5.2.1
+diffusers==0.29.0
+distlib==0.3.9
+einops==0.8.1
+filelock==3.18.0
+fsspec==2025.5.1
+hf-xet==1.1.5
+huggingface-hub==0.33.1
+identify==2.6.12
+idna==3.10
+importlib_metadata==8.7.0
+Jinja2==3.1.6
+joblib==1.5.1
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+MarkupSafe==3.0.2
+mpmath==1.3.0
+msgpack==1.1.1
+networkx==3.5
+nodeenv==1.9.1
+numba==0.61.2
+numpy==2.2.6
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+onnx==1.18.0
+packaging==25.0
+pillow==11.2.1
+platformdirs==4.3.8
+pooch==1.8.2
+pre_commit==4.2.0
+protobuf==6.31.1
+pycparser==2.22
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.4
+resemble-perth==1.0.1
+s3tokenizer==0.1.7
+safetensors==0.5.3
+scikit-learn==1.7.0
+scipy==1.16.0
+setuptools==80.9.0
+soundfile==0.13.1
+soxr==0.5.0.post1
+sympy==1.13.1
+threadpoolctl==3.6.0
+tokenizers==0.20.3
+torch==2.6.0
+torchaudio==2.6.0
+tqdm==4.67.1
+transformers==4.46.3
+triton==3.2.0
+typing_extensions==4.14.0
+urllib3==2.5.0
+virtualenv==20.31.2
+zipp==3.23.0