tanlonghua committed on
Commit ac02922
1 Parent(s): e234d89

Upload Ming-Lite-Omni-1.5 safetensors

talker/{audio_detokenizer.yaml → audio_detokenizer_stream.yaml} RENAMED
@@ -5,21 +5,30 @@ __set_seed3: !apply:torch.manual_seed [1986]
 __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
 
 # fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
+sample_rate: 24000
+llm_input_size: 896
+llm_output_size: 896
 spk_embed_dim: 192
+qwen_pretrain_path: ''
+token_frame_rate: 50
+token_mel_ratio: 1
 
-flow: !new:.audio_detokenizer.flow.flow.MaskedDiffWithXvec
+# stream related params
+chunk_size: 50 # streaming inference chunk size, in token
+num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
+
+
+flow: !new:.audio_detokenizer.flow.flow.CausalMaskedDiffWithXvec
     input_size: 512
     output_size: 80
     spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 4096
-    input_frame_rate: 50
+    input_frame_rate: !ref <token_frame_rate>
     only_mask_loss: True
-    encoder: !new:.audio_detokenizer.transformer.encoder.ConformerEncoder
+    token_mel_ratio: !ref <token_mel_ratio>
+    pre_lookahead_len: 3
+    encoder: !new:.audio_detokenizer.transformer.upsample_encoder_new_mel.UpsampleConformerEncoder
         output_size: 512
         attention_heads: 8
         linear_units: 2048
@@ -34,14 +43,11 @@ flow: !new:.audio_detokenizer.flow.flow.MaskedDiffWithXvec
         input_size: 512
         use_cnn_module: False
         macaron_style: False
-    length_regulator: !new:.audio_detokenizer.flow.length_regulator.InterpolateRegulator
-        channels: 80
-        sampling_ratios: [1, 1, 1, 1]
-    decoder: !new:.audio_detokenizer.flow.flow_matching.ConditionalCFM
+        static_chunk_size: !ref <chunk_size>
+    decoder: !new:.audio_detokenizer.flow.flow_matching.CausalConditionalCFM
         in_channels: 240
         n_spks: 1
         spk_emb_dim: 80
-        tensorrt_model_path: 'bin/ckpt_300M/estimator_fp16.plan'
         cfm_params: !new:omegaconf.DictConfig
             content:
                 sigma_min: 1e-06
@@ -50,16 +56,18 @@ flow: !new:.audio_detokenizer.flow.flow.MaskedDiffWithXvec
                 training_cfg_rate: 0.2
                 inference_cfg_rate: 0.7
                 reg_loss_type: 'l1'
-        estimator: !new:.audio_detokenizer.flow.decoder.ConditionalDecoder
+        estimator: !new:.audio_detokenizer.flow.decoder.CausalConditionalDecoder
            in_channels: 320
            out_channels: 80
-           channels: [256, 256]
-           dropout: 0
+           channels: [256]
+           dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'
+           static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+           num_decoding_left_chunks: !ref <num_decoding_left_chunks>
 
 hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
     in_channels: 80
@@ -69,15 +77,15 @@ hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
     nsf_alpha: 0.1
     nsf_sigma: 0.003
     nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
+    upsample_rates: [8, 5, 3]
+    upsample_kernel_sizes: [16, 11, 7]
     istft_params:
         n_fft: 16
         hop_len: 4
     resblock_kernel_sizes: [3, 7, 11]
     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
     lrelu_slope: 0.1
     audio_limit: 0.99
     f0_predictor: !new:.audio_detokenizer.hifigan.f0_predictor.ConvRNNF0Predictor
@@ -85,12 +93,13 @@ hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
         in_channels: 80
         cond_channels: 512
 
+
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
+    n_fft: 1920
     num_mels: 80
     sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
+    hop_size: 480
+    win_size: 1920
     fmin: 0
     fmax: 8000
     center: False
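In short, the renamed config retargets the talker's audio detokenizer from 22.05 kHz offline synthesis to 24 kHz chunked streaming. Below is a minimal sanity-check sketch of how the new numbers fit together, under my reading of the diff above; the variable names simply mirror the YAML keys and none of this is an API shipped with the model.

```python
# Streaming-parameter arithmetic for audio_detokenizer_stream.yaml
# (values copied from the diff; the relationships are an interpretation).

sample_rate = 24000        # was 22050
hop_size = 480             # mel hop, was 256
token_frame_rate = 50      # speech tokens per second
token_mel_ratio = 1        # mel frames generated per token
chunk_size = 50            # streaming chunk size, in tokens

# Mel frame rate implied by the feature extractor: 24000 / 480 = 50 fps,
# matching token_frame_rate * token_mel_ratio.
mel_frame_rate = sample_rate / hop_size
assert mel_frame_rate == token_frame_rate * token_mel_ratio

# One streaming chunk therefore covers 50 / 50 = 1.0 second of audio.
chunk_seconds = chunk_size / token_frame_rate
print(f"chunk duration: {chunk_seconds:.1f} s")

# The HiFT vocoder's upsampling chain must reproduce the mel hop:
# prod(upsample_rates) * istft hop_len = 8 * 5 * 3 * 4 = 480 = hop_size.
upsample_rates = [8, 5, 3]
istft_hop_len = 4
total_upsample = 1
for r in upsample_rates:
    total_upsample *= r
assert total_upsample * istft_hop_len == hop_size
```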
transformer/diffusion_pytorch_model-00002-of-00002.safetensors → talker/flow_stream.pt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e386f056ca05cf1b83af9360a79e7316552f41c984aa34b3b8c58fff5c7b52a2
-size 1473408512
+oid sha256:b907e9d1567f633d2015081ac170b2a9c367ea750a0456e5863344ee1cbe1aaf
+size 1329720739
talker/{hift.pt → hift_v2.pt} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012
-size 81896716
+oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
+size 83390254
talker/ossutil_output/ossutil_report_20250716_170346.report ADDED
@@ -0,0 +1 @@
+# ossutil64 cp -r /video_hy2/workspace/weilong.cwl/metax_models/bailingv4_moe_lite_addmetax_0716/ oss://moe-opensource-hy/multimodal/bailingv4_moe_lite_addmetax_0716/
talker/flow.pt → transformer/diffusion_pytorch_model-00001-of-00004.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:21eae78c105b5e1c6c337b04f667843377651b4bcfb2d43247ed3ad7fd0a3470
-size 419900943
+oid sha256:02893d422480a4252bf2b7e46a2816b3a86596062aef6fb52e2316192b53f99b
+size 2956851104
transformer/{diffusion_pytorch_model-00001-of-00002.safetensors → diffusion_pytorch_model-00002-of-00004.safetensors} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fecc17c89bb0d9ca82c3d0e9ea9ac079d066a073e8858a4545520c987322a19d
-size 9991573456
+oid sha256:8d0c69da15b07b70b85ffc511df3991ffa30fb45e594e54c1d6f5576a5c7e321
+size 2993322680
transformer/diffusion_pytorch_model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bac5730f943ac8b9ff96f8a78fc6a22d1f4453027e96a3d937b8f8881c4200fd
+size 2955511536
transformer/diffusion_pytorch_model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04babdf93bbe81237a78591d4c0396c0872946eee6a725da37adaac7574fd0be
+size 2559296136
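The safetensors and checkpoint entries above are Git LFS pointer files: only the SHA-256 oid and byte size change in the repository, while the weights themselves live in LFS storage. As a hedged illustration (not a script shipped with this model; the file path and read buffering are my own choices), a downloaded shard could be checked against its pointer like this:

```python
# Verify a downloaded shard against the LFS pointer fields shown above
# (oid and size taken from the 00003-of-00004 pointer in this commit).
import hashlib
import os

path = "transformer/diffusion_pytorch_model-00003-of-00004.safetensors"
expected_oid = "bac5730f943ac8b9ff96f8a78fc6a22d1f4453027e96a3d937b8f8881c4200fd"
expected_size = 2955511536

# The pointer's "size" field is the exact byte count of the real file.
assert os.path.getsize(path) == expected_size, "size mismatch"

# The "oid sha256:..." field is the SHA-256 digest of the file contents.
h = hashlib.sha256()
with open(path, "rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):
        h.update(block)
assert h.hexdigest() == expected_oid, "checksum mismatch"
print("shard matches its LFS pointer")
```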
transformer/diffusion_pytorch_model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff