Commit
·
ac02922
1
Parent(s):
e234d89
Upload Ming-Lite-Omni-1.5 safetensors
Browse files
- talker/{audio_detokenizer.yaml → audio_detokenizer_stream.yaml} +31 -22
- transformer/diffusion_pytorch_model-00002-of-00002.safetensors → talker/flow_stream.pt +2 -2
- talker/{hift.pt → hift_v2.pt} +2 -2
- talker/ossutil_output/ossutil_report_20250716_170346.report +1 -0
- talker/flow.pt → transformer/diffusion_pytorch_model-00001-of-00004.safetensors +2 -2
- transformer/{diffusion_pytorch_model-00001-of-00002.safetensors → diffusion_pytorch_model-00002-of-00004.safetensors} +2 -2
- transformer/diffusion_pytorch_model-00003-of-00004.safetensors +3 -0
- transformer/diffusion_pytorch_model-00004-of-00004.safetensors +3 -0
- transformer/diffusion_pytorch_model.safetensors.index.json +0 -0
talker/{audio_detokenizer.yaml → audio_detokenizer_stream.yaml}
RENAMED
@@ -5,21 +5,30 @@ __set_seed3: !apply:torch.manual_seed [1986]
|
|
5 |
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
|
6 |
|
7 |
# fixed params
|
8 |
-
sample_rate:
|
9 |
-
|
10 |
-
|
11 |
-
llm_output_size: 1024
|
12 |
spk_embed_dim: 192
|
|
|
|
|
|
|
13 |
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
15 |
input_size: 512
|
16 |
output_size: 80
|
17 |
spk_embed_dim: !ref <spk_embed_dim>
|
18 |
output_type: 'mel'
|
19 |
vocab_size: 4096
|
20 |
-
input_frame_rate:
|
21 |
only_mask_loss: True
|
22 |
-
|
|
|
|
|
23 |
output_size: 512
|
24 |
attention_heads: 8
|
25 |
linear_units: 2048
|
@@ -34,14 +43,11 @@ flow: !new:.audio_detokenizer.flow.flow.MaskedDiffWithXvec
|
|
34 |
input_size: 512
|
35 |
use_cnn_module: False
|
36 |
macaron_style: False
|
37 |
-
|
38 |
-
|
39 |
-
sampling_ratios: [1, 1, 1, 1]
|
40 |
-
decoder: !new:.audio_detokenizer.flow.flow_matching.ConditionalCFM
|
41 |
in_channels: 240
|
42 |
n_spks: 1
|
43 |
spk_emb_dim: 80
|
44 |
-
tensorrt_model_path: 'bin/ckpt_300M/estimator_fp16.plan'
|
45 |
cfm_params: !new:omegaconf.DictConfig
|
46 |
content:
|
47 |
sigma_min: 1e-06
|
@@ -50,16 +56,18 @@ flow: !new:.audio_detokenizer.flow.flow.MaskedDiffWithXvec
|
|
50 |
training_cfg_rate: 0.2
|
51 |
inference_cfg_rate: 0.7
|
52 |
reg_loss_type: 'l1'
|
53 |
-
estimator: !new:.audio_detokenizer.flow.decoder.
|
54 |
in_channels: 320
|
55 |
out_channels: 80
|
56 |
-
channels: [256
|
57 |
-
dropout: 0
|
58 |
attention_head_dim: 64
|
59 |
n_blocks: 4
|
60 |
num_mid_blocks: 12
|
61 |
num_heads: 8
|
62 |
act_fn: 'gelu'
|
|
|
|
|
63 |
|
64 |
hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
|
65 |
in_channels: 80
|
@@ -69,15 +77,15 @@ hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
|
|
69 |
nsf_alpha: 0.1
|
70 |
nsf_sigma: 0.003
|
71 |
nsf_voiced_threshold: 10
|
72 |
-
upsample_rates: [8,
|
73 |
-
upsample_kernel_sizes: [16,
|
74 |
istft_params:
|
75 |
n_fft: 16
|
76 |
hop_len: 4
|
77 |
resblock_kernel_sizes: [3, 7, 11]
|
78 |
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
79 |
-
source_resblock_kernel_sizes: [7, 11]
|
80 |
-
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
|
81 |
lrelu_slope: 0.1
|
82 |
audio_limit: 0.99
|
83 |
f0_predictor: !new:.audio_detokenizer.hifigan.f0_predictor.ConvRNNF0Predictor
|
@@ -85,12 +93,13 @@ hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
|
|
85 |
in_channels: 80
|
86 |
cond_channels: 512
|
87 |
|
|
|
88 |
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
89 |
-
n_fft:
|
90 |
num_mels: 80
|
91 |
sampling_rate: !ref <sample_rate>
|
92 |
-
hop_size:
|
93 |
-
win_size:
|
94 |
fmin: 0
|
95 |
fmax: 8000
|
96 |
center: False
|
|
|
5 |
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
|
6 |
|
7 |
# fixed params
|
8 |
+
sample_rate: 24000
|
9 |
+
llm_input_size: 896
|
10 |
+
llm_output_size: 896
|
|
|
11 |
spk_embed_dim: 192
|
12 |
+
qwen_pretrain_path: ''
|
13 |
+
token_frame_rate: 50
|
14 |
+
token_mel_ratio: 1
|
15 |
|
16 |
+
# stream related params
|
17 |
+
chunk_size: 50 # streaming inference chunk size, in token
|
18 |
+
num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
|
19 |
+
|
20 |
+
|
21 |
+
flow: !new:.audio_detokenizer.flow.flow.CausalMaskedDiffWithXvec
|
22 |
input_size: 512
|
23 |
output_size: 80
|
24 |
spk_embed_dim: !ref <spk_embed_dim>
|
25 |
output_type: 'mel'
|
26 |
vocab_size: 4096
|
27 |
+
input_frame_rate: !ref <token_frame_rate>
|
28 |
only_mask_loss: True
|
29 |
+
token_mel_ratio: !ref <token_mel_ratio>
|
30 |
+
pre_lookahead_len: 3
|
31 |
+
encoder: !new:.audio_detokenizer.transformer.upsample_encoder_new_mel.UpsampleConformerEncoder
|
32 |
output_size: 512
|
33 |
attention_heads: 8
|
34 |
linear_units: 2048
|
|
|
43 |
input_size: 512
|
44 |
use_cnn_module: False
|
45 |
macaron_style: False
|
46 |
+
static_chunk_size: !ref <chunk_size>
|
47 |
+
decoder: !new:.audio_detokenizer.flow.flow_matching.CausalConditionalCFM
|
|
|
|
|
48 |
in_channels: 240
|
49 |
n_spks: 1
|
50 |
spk_emb_dim: 80
|
|
|
51 |
cfm_params: !new:omegaconf.DictConfig
|
52 |
content:
|
53 |
sigma_min: 1e-06
|
|
|
56 |
training_cfg_rate: 0.2
|
57 |
inference_cfg_rate: 0.7
|
58 |
reg_loss_type: 'l1'
|
59 |
+
estimator: !new:.audio_detokenizer.flow.decoder.CausalConditionalDecoder
|
60 |
in_channels: 320
|
61 |
out_channels: 80
|
62 |
+
channels: [256]
|
63 |
+
dropout: 0.0
|
64 |
attention_head_dim: 64
|
65 |
n_blocks: 4
|
66 |
num_mid_blocks: 12
|
67 |
num_heads: 8
|
68 |
act_fn: 'gelu'
|
69 |
+
static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
|
70 |
+
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
|
71 |
|
72 |
hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
|
73 |
in_channels: 80
|
|
|
77 |
nsf_alpha: 0.1
|
78 |
nsf_sigma: 0.003
|
79 |
nsf_voiced_threshold: 10
|
80 |
+
upsample_rates: [8, 5, 3]
|
81 |
+
upsample_kernel_sizes: [16, 11, 7]
|
82 |
istft_params:
|
83 |
n_fft: 16
|
84 |
hop_len: 4
|
85 |
resblock_kernel_sizes: [3, 7, 11]
|
86 |
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
87 |
+
source_resblock_kernel_sizes: [7, 7, 11]
|
88 |
+
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
89 |
lrelu_slope: 0.1
|
90 |
audio_limit: 0.99
|
91 |
f0_predictor: !new:.audio_detokenizer.hifigan.f0_predictor.ConvRNNF0Predictor
|
|
|
93 |
in_channels: 80
|
94 |
cond_channels: 512
|
95 |
|
96 |
+
|
97 |
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
98 |
+
n_fft: 1920
|
99 |
num_mels: 80
|
100 |
sampling_rate: !ref <sample_rate>
|
101 |
+
hop_size: 480
|
102 |
+
win_size: 1920
|
103 |
fmin: 0
|
104 |
fmax: 8000
|
105 |
center: False
|
transformer/diffusion_pytorch_model-00002-of-00002.safetensors → talker/flow_stream.pt
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b907e9d1567f633d2015081ac170b2a9c367ea750a0456e5863344ee1cbe1aaf
|
3 |
+
size 1329720739
|
talker/{hift.pt → hift_v2.pt}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
|
3 |
+
size 83390254
|
talker/ossutil_output/ossutil_report_20250716_170346.report
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# ossutil64 cp -r /video_hy2/workspace/weilong.cwl/metax_models/bailingv4_moe_lite_addmetax_0716/ oss://moe-opensource-hy/multimodal/bailingv4_moe_lite_addmetax_0716/
|
talker/flow.pt → transformer/diffusion_pytorch_model-00001-of-00004.safetensors
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02893d422480a4252bf2b7e46a2816b3a86596062aef6fb52e2316192b53f99b
|
3 |
+
size 2956851104
|
transformer/{diffusion_pytorch_model-00001-of-00002.safetensors → diffusion_pytorch_model-00002-of-00004.safetensors}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d0c69da15b07b70b85ffc511df3991ffa30fb45e594e54c1d6f5576a5c7e321
|
3 |
+
size 2993322680
|
transformer/diffusion_pytorch_model-00003-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bac5730f943ac8b9ff96f8a78fc6a22d1f4453027e96a3d937b8f8881c4200fd
|
3 |
+
size 2955511536
|
transformer/diffusion_pytorch_model-00004-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04babdf93bbe81237a78591d4c0396c0872946eee6a725da37adaac7574fd0be
|
3 |
+
size 2559296136
|
transformer/diffusion_pytorch_model.safetensors.index.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|