---
language:
- ja
base_model:
- webbigdata/VoiceCore
tags:
- tts
- vllm
---
# VoiceCore_smoothquant
This model is a SmoothQuant (W8A8) quantization of [webbigdata/VoiceCore](https://huggingface.co/webbigdata/VoiceCore), intended for fast inference with vLLM.
See the [webbigdata/VoiceCore](https://huggingface.co/webbigdata/VoiceCore) model card for details.
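For reference, W8A8 SmoothQuant checkpoints for vLLM are typically produced with [llm-compressor](https://github.com/vllm-project/llm-compressor). The sketch below only illustrates that general flow, not the exact recipe used for this model; the smoothing strength, calibration dataset, and save path are assumptions, and the `oneshot` import location varies between llm-compressor versions.
```
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

MODEL_ID = "webbigdata/VoiceCore"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Illustrative recipe: SmoothQuant migrates activation outliers into the
# weights, then the Linear layers are quantized to INT8 weights/activations.
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),   # assumed value
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

oneshot(
    model=model,
    dataset="open_platypus",   # assumed calibration set; the real one is unknown
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)

model.save_pretrained("VoiceCore_smoothquant", save_compressed=True)
tokenizer.save_pretrained("VoiceCore_smoothquant")
```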
## Install/Setup
vLLM reportedly also runs on [AMD GPUs](https://docs.vllm.ai/en/v0.6.5/getting_started/amd-installation.html), but this has not been verified.
It also appears to work on a Mac (CPU), though the [gguf version](https://huggingface.co/webbigdata/VoiceCore_gguf) may be faster there.
Below are the setup instructions for Linux with an NVIDIA GPU.
```
python3 -m venv VL
source VL/bin/activate
pip install vllm
pip install snac
pip install numpy==1.26.4
pip install transformers==4.53.2
```
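After installing, you can optionally confirm that vLLM and CUDA are visible from Python before running the sample script:
```
# Optional environment sanity check.
import torch
import vllm

print("vLLM version:", vllm.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
```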
## Sample script
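The script below tokenizes each prompt with the model's special tokens, generates SNAC audio tokens with vLLM, decodes them to waveforms on the CPU, and writes one WAV file per prompt.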
```
import torch
import scipy.io.wavfile as wavfile
from transformers import AutoTokenizer
from snac import SNAC
from vllm import LLM, SamplingParams

QUANTIZED_MODEL_PATH = "webbigdata/VoiceCore_smoothquant"

prompts = [
    "テストです",
    "スムーズクアント、問題なく動いてますかね?圧縮しすぎると別人の声になっちゃう事があるんですよね、ふふふ"
]
chosen_voice = "matsukaze_male[neutral]"

print("Loading tokenizer and preparing inputs...")
tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_MODEL_PATH)
prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]

# Wrap each prompt with the special tokens the model expects.
start_token, end_tokens = [128259], [128009, 128260, 128261]
all_prompt_token_ids = []
for prompt in prompts_:
    input_ids = tokenizer.encode(prompt)
    final_token_ids = start_token + input_ids + end_tokens
    all_prompt_token_ids.append(final_token_ids)
print("Inputs prepared successfully.")

print(f"Loading SmoothQuant model with vLLM from: {QUANTIZED_MODEL_PATH}")
llm = LLM(
    model=QUANTIZED_MODEL_PATH,
    trust_remote_code=True,
    max_model_len=10000,  # Reduce this if you run out of memory.
    # gpu_memory_utilization=0.9  # Fraction of GPU memory vLLM may use; adjust as needed.
)
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.90,
    repetition_penalty=1.1,
    max_tokens=8192,  # max_tokens + input prompt length <= max_model_len
    stop_token_ids=[128258]
)
print("vLLM model loaded.")

print("Generating audio tokens with vLLM...")
outputs = llm.generate(prompt_token_ids=all_prompt_token_ids, sampling_params=sampling_params)
print("Generation complete.")

# Decoding on the GPU is faster, but it can fail if vLLM has already
# reserved most of the GPU memory, so SNAC runs on the CPU here.
print("Loading SNAC decoder to CPU...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model.to("cpu")
print("SNAC model loaded.")

print("Decoding tokens to audio...")
audio_start_token = 128257

def redistribute_codes(code_list):
    # Each SNAC frame is 7 codes spread over 3 layers (1 + 2 + 4);
    # each of the 7 slots carries its own 4096 offset that must be removed.
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7*i])
        layer_2.append(code_list[7*i+1] - 4096)
        layer_3.append(code_list[7*i+2] - (2*4096))
        layer_3.append(code_list[7*i+3] - (3*4096))
        layer_2.append(code_list[7*i+4] - (4*4096))
        layer_3.append(code_list[7*i+5] - (5*4096))
        layer_3.append(code_list[7*i+6] - (6*4096))
    codes = [torch.tensor(layer).unsqueeze(0)
             for layer in [layer_1, layer_2, layer_3]]
    audio_hat = snac_model.decode(codes)
    return audio_hat

code_lists = []
for output in outputs:
    generated_token_ids = output.outputs[0].token_ids
    generated_tensor = torch.tensor([generated_token_ids])
    # Keep only the tokens after the last audio start marker.
    token_indices = (generated_tensor == audio_start_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        cropped_tensor = generated_tensor[:, token_indices[1][-1].item() + 1:]
    else:
        cropped_tensor = generated_tensor
    masked_row = cropped_tensor.squeeze()
    row_length = masked_row.size(0)
    new_length = (row_length // 7) * 7  # Trim to whole SNAC frames (multiples of 7).
    trimmed_row = masked_row[:new_length]
    code_list = [t.item() - 128266 for t in trimmed_row]  # Shift token IDs into SNAC code space.
    code_lists.append(code_list)

for i, code_list in enumerate(code_lists):
    if i >= len(prompts):
        break
    print(f"Processing audio for prompt: '{prompts[i]}'")
    samples = redistribute_codes(code_list)
    sample_np = samples.detach().squeeze().numpy()
    safe_prompt = "".join(c for c in prompts[i] if c.isalnum() or c in (' ', '_')).rstrip()
    filename = f"audio_final_{i}_{safe_prompt[:20].replace(' ', '_')}.wav"
    wavfile.write(filename, 24000, sample_np)
    print(f"Saved audio to: {filename}")
```
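Note that `scipy.io.wavfile.write` stores the float32 samples as a 32-bit float WAV. If a player cannot handle that format, converting to 16-bit PCM first is a safe fallback; the helper below is an optional, illustrative sketch.
```
import numpy as np
import scipy.io.wavfile as wavfile

def write_wav_int16(filename, rate, samples):
    # Illustrative helper: clip to [-1, 1] and scale to 16-bit PCM
    # for maximum player compatibility.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype(np.int16)
    wavfile.write(filename, rate, pcm)

# e.g. write_wav_int16(filename, 24000, sample_np)
```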