update config

Files changed:
- README.md (+29 -1)
- config.json (+8 -5)
- configuration_moss_ttsd.py (+260 -0)
- modeling.py (+426 -0)
- modeling_moss_ttsd.py (+611 -0)
- processing_moss_ttsd.py (+914 -0)
- processor_config.json (+6 -0)
- tokenizer_config.json (+10 -2)
README.md
CHANGED
````diff
@@ -22,4 +22,32 @@ MOSS-TTSD supports voice cloning and single-session speech generation of up to 9
 - **Two-Speaker Voice Cloning**: MOSS-TTSD supports zero-shot two-speaker voice cloning and can generate conversational speech with accurate speaker switching based on dialogue scripts.
 - **Chinese-English Bilingual Support**: MOSS-TTSD enables highly expressive speech generation in both Chinese and English.
 - **Long-Form Speech Generation (up to 960 seconds)**: Thanks to low-bitrate codec and training framework optimization, MOSS-TTSD has been trained for long speech generation, enabling single-session speech generation of up to 960 seconds.
-- **Fully Open Source & Commercial-Ready**: MOSS-TTSD and its future updates will be fully open-source and support free commercial use.
+- **Fully Open Source & Commercial-Ready**: MOSS-TTSD and its future updates will be fully open-source and support free commercial use.
+
+
+```python
+import os
+import torchaudio
+from transformers import AutoModel, AutoProcessor
+
+processor = AutoProcessor.from_pretrained("fnlp/MOSS-TTSD-v0.5", codec_path="fnlp/XY_Tokenizer_TTSD_V0_hf", trust_remote_code=True)
+model = AutoModel.from_pretrained("fnlp/MOSS-TTSD-v0.5", trust_remote_code=True, device_map="auto").eval()
+
+data = [{
+    "base_path": "/path/to/data/",
+    "text": "跟踪他们,他俩不行,从屋上平安下来没有扭伤脖子,",
+    "system_prompt": "你是一个根据文本生成对应音频的语音合成器。",
+    "prompt_text": "这支史诗级的美国迷幻摇滚乐队创建于,",
+    "prompt_audio": "prompt.wav",
+}]
+
+inputs = processor(data)
+token_ids = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
+text, audios = processor.batch_decode(token_ids)
+
+if not os.path.exists("outputs/"):
+    os.mkdir("outputs/")
+for i, data in enumerate(audios):
+    for j, fragment in enumerate(data):
+        torchaudio.save(f"outputs/audio_{i}_{j}.wav", fragment.cpu(), 24000)
+```
````
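The snippet above writes each generated fragment to `outputs/` as a 24 kHz WAV file. As a quick sanity check after generation, the files can be reloaded and inspected with torchaudio (a minimal sketch; the path assumes the loop above has already run):

```python
import torchaudio

# Reload the first generated fragment and verify its sample rate and duration.
waveform, sample_rate = torchaudio.load("outputs/audio_0_0.wav")
assert sample_rate == 24000  # the README snippet saves at 24 kHz
print(f"{waveform.shape[1] / sample_rate:.2f} s of audio, {waveform.shape[0]} channel(s)")
```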
config.json
CHANGED
```diff
@@ -1,7 +1,11 @@
 {
-  "
-
-
+  "model_type": "moss_ttsd",
+  "architectures": ["MossTTSDModel"],
+  "auto_map": {
+    "AutoProcessor": "processing_moss_ttsd.MossTTSDProcessor",
+    "AutoConfig": "configuration_moss_ttsd.MossTTSDConfig",
+    "AutoModel": "modeling_moss_ttsd.MossTTSDForCausalLM"
+  },
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
@@ -14,7 +18,6 @@
   "intermediate_size": 6144,
   "max_position_embeddings": 32768,
   "max_window_layers": 28,
-  "model_type": "qwen3",
   "num_attention_heads": 16,
   "num_hidden_layers": 28,
   "num_key_value_heads": 8,
@@ -30,7 +33,7 @@
   "speech_vocab_size": 1025,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.53.2",
   "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 152697
```
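The new `auto_map` block is what lets the generic `Auto*` entry points resolve to the custom classes shipped in this repository. A minimal sketch of how it is exercised (this downloads from the Hub and assumes the files in this commit are present):

```python
from transformers import AutoConfig

# trust_remote_code=True tells transformers to follow the auto_map entries in
# config.json and import MossTTSDConfig from configuration_moss_ttsd.py.
config = AutoConfig.from_pretrained("fnlp/MOSS-TTSD-v0.5", trust_remote_code=True)
print(config.model_type)         # "moss_ttsd"
print(config.speech_vocab_size)  # 1025, as set above
```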
configuration_moss_ttsd.py
ADDED
@@ -0,0 +1,260 @@
````python
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/asteroid/modular_asteroid.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_asteroid.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


logger = logging.get_logger(__name__)


class MossTTSDConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossTTSDModel`]. It is used to instantiate a
    MOSS-TTSD model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MOSS-TTSD
    [fnlp/MOSS-TTSD-v0.5](https://huggingface.co/fnlp/MOSS-TTSD-v0.5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import MossTTSDConfig, MossTTSDModel

    >>> # Initializing a MOSS-TTSD configuration
    >>> configuration = MossTTSDConfig()

    >>> # Initializing a model from the configuration
    >>> model = MossTTSDModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```

    Args:
        vocab_size (`int`, *optional*, defaults to 152697):
            Vocabulary size of the MOSS-TTSD model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`MossTTSDModel`].
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
            type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this
            value accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention computation. If
                    unspecified, it defaults to value recommended by the implementation, using the `factor` field to
                    infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear ramp
                    function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear ramp
                    function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers using full attention. The first `max_window_layers` layers will use full attention,
            while any additional layer afterwards will use SWA (Sliding Window Attention).
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        channels (`<fill_type>`, *optional*, defaults to 8): <fill_docstring>
        speech_vocab_size (`<fill_type>`, *optional*, defaults to 1025): <fill_docstring>
        speech_pad_token (`<fill_type>`, *optional*, defaults to 1024): <fill_docstring>
        speech_token_range (`<fill_type>`, *optional*, defaults to `(151665, 152689)`): <fill_docstring>
        speech_eos_token (`<fill_type>`, *optional*, defaults to 152694): <fill_docstring>
    """

    model_type = "moss_ttsd"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `MossTTSD`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=152697,
        hidden_size=2048,
        intermediate_size=6144,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=8,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=True,
        rope_theta=1000000.0,
        rope_scaling=None,
        attention_bias=False,
        use_sliding_window=False,
        sliding_window=None,
        max_window_layers=28,
        layer_types=None,
        attention_dropout=0.0,
        channels=8,
        speech_vocab_size=1025,
        speech_pad_token=1024,
        speech_token_range=(151665, 152689),
        speech_eos_token=152694,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if self.use_sliding_window else None
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention"
                if self.sliding_window is not None and i >= self.max_window_layers
                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        self.channels = channels
        self.speech_vocab_size = speech_vocab_size
        self.speech_pad_token = speech_pad_token
        self.speech_token_range = speech_token_range
        self.speech_eos_token = speech_eos_token

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["MossTTSDConfig"]
````
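For illustration, the speech-side fields can be overridden independently of the Qwen3-style text backbone when constructing the config (a sketch that assumes `configuration_moss_ttsd.py` is importable, e.g. from the repository root):

```python
from configuration_moss_ttsd import MossTTSDConfig

# Defaults reproduce fnlp/MOSS-TTSD-v0.5; only the speech fields are set explicitly here.
config = MossTTSDConfig(channels=8, speech_vocab_size=1025, speech_pad_token=1024)
print(config.model_type)      # "moss_ttsd"
print(config.layer_types[0])  # "full_attention" (sliding window is off by default)
```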
modeling.py
ADDED
@@ -0,0 +1,426 @@
```python
import torch
import torch.nn as nn
from dataclasses import dataclass
from transformers.utils import ModelOutput
from transformers.cache_utils import Cache
from typing import Optional, List, Tuple, Union
from transformers.loss.loss_utils import ForCausalLMLoss
from transformers.generation.streamers import BaseStreamer
from transformers.modeling_outputs import BaseModelOutputWithPast
from transformers.generation.configuration_utils import GenerationConfig
from transformers.generation.stopping_criteria import StoppingCriteriaList
from transformers import PreTrainedModel, GenerationMixin, Qwen3Config, Qwen3Model
from transformers.generation.logits_process import (
    LogitsProcessorList,
    RepetitionPenaltyLogitsProcessor,
    TopKLogitsWarper,
    TopPLogitsWarper,
    TemperatureLogitsWarper,
)


class AsteroidTTSConfig(Qwen3Config):
    def __init__(
        self,
        channels=8,
        speech_pad_token=1024,
        speech_vocab_size=1025,
        speech_token_range=[],
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.channels = channels
        self.speech_pad_token = speech_pad_token
        self.speech_vocab_size = speech_vocab_size
        self.speech_token_range = speech_token_range


@dataclass
class AsteroidTTSOutputWithPast(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    loss_all: Optional[Tuple[torch.FloatTensor]] = None
    logits_all: Optional[Tuple[torch.FloatTensor]] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class GenerateDecoderOnlyOutput(ModelOutput):
    sequences: torch.LongTensor = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None


class AsteroidTTSPretrainedModel(PreTrainedModel):
    config_class = AsteroidTTSConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True


class AsteroidTTSModel(AsteroidTTSPretrainedModel):
    def __init__(self, config: AsteroidTTSConfig):
        super().__init__(config)
        self.text_pad_idx = config.pad_token_id
        self.speech_pad_idx = config.speech_pad_token
        self.embedding_list = nn.ModuleList([])
        # Channel 0: text + speech tokens
        self.embedding_list.append(nn.Embedding(config.vocab_size, config.hidden_size, self.text_pad_idx))
        # Channels 1 to channels-1: speech tokens only
        for _ in range(1, config.channels):
            self.embedding_list.append(nn.Embedding(config.speech_vocab_size, config.hidden_size, self.speech_pad_idx))

        self.language_model = Qwen3Model(config)
        self.post_init()

    def get_input_embeddings(self):
        return self.embedding_list[0]

    def set_input_embeddings(self, value: nn.Embedding):
        self.embedding_list[0] = value

    def _prepare_multi_modal_inputs(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
        """
        Prepares multi-modal embeddings from input_ids of shape (batch_size, sequence_length, channels).
        Channel 0 carries text + speech tokens; channels 1 to channels-1 carry speech tokens padded with
        speech_pad_token. The per-channel embeddings are summed into a single vector per position.
        """
        batch_size, seq_length, channels = input_ids.shape
        if channels != self.config.channels:
            raise ValueError(f"Expected {self.config.channels} channels, got {channels}")

        inputs_embeds = torch.zeros(
            batch_size, seq_length, self.config.hidden_size,
            device=input_ids.device, dtype=self.embedding_list[0].weight.dtype,
        )
        for i in range(channels):
            embed_layer = self.embedding_list[i]
            channel_input = input_ids[..., i]
            inputs_embeds += embed_layer(channel_input)

        return inputs_embeds

    def forward(
        self,
        input_ids: torch.LongTensor = None,  # Shape: (batch_size, sequence_length, channels)
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if input_ids is not None:
            inputs_embeds = self._prepare_multi_modal_inputs(input_ids)

        outputs = self.language_model(
            input_ids=None,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        return outputs


class AsteroidTTSInstruct(AsteroidTTSPretrainedModel, GenerationMixin):
    _tied_weights_keys = []
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config: AsteroidTTSConfig):
        super().__init__(config)
        self.model = AsteroidTTSModel(config)
        self.channels = config.channels
        self.weights = [1 for _ in range(self.channels)]
        self._tied_weights_keys = [f"lm_heads.{i}.weight" for i in range(self.channels)]
        self.vocab_size = config.vocab_size
        self.lm_heads = nn.ModuleList([])
        self.lm_heads.append(nn.Linear(config.hidden_size, config.vocab_size, bias=False))
        for _ in range(1, config.channels):
            self.lm_heads.append(nn.Linear(config.hidden_size, config.speech_vocab_size, bias=False))
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embedding_list[0]

    def can_generate(self):
        return True

    def is_speech_token(self, tokens):
        return (tokens >= self.config.speech_token_range[0]) & (tokens < self.config.speech_token_range[1])

    def tie_weights(self):
        for i in range(self.config.channels):
            self._tie_or_clone_weights(self.lm_heads[i], self.model.embedding_list[i])

    def set_input_embeddings(self, value):
        self.model.embedding_list[0] = value

    def get_output_embeddings(self):
        return self.lm_heads[0]

    def set_output_embeddings(self, new_embeddings):
        self.lm_heads[0] = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def set_weights(self, weights):
        self.weights = weights

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, AsteroidTTSOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        logits_all = [lm_head(hidden_states) for lm_head in self.lm_heads]

        loss_all = torch.empty(
            self.channels, device=input_ids.device if input_ids is not None else inputs_embeds.device
        )

        total_loss = None
        if labels is not None:
            for i in range(self.config.channels):
                vocab_size = self.config.vocab_size if i == 0 else self.config.speech_vocab_size
                loss_all[i] = ForCausalLMLoss(logits_all[i], labels[..., i], vocab_size)

            # total_weight = sum(self.weights)
            # normalized_weights = [w / total_weight for w in self.weights]
            normalized_weights = self.weights

            total_loss = 0
            for w, loss in zip(normalized_weights, loss_all):
                total_loss += w * loss

        if not return_dict:
            output = (logits_all,) + outputs[1:]
            return (total_loss, loss_all) + output if total_loss is not None else output

        return AsteroidTTSOutputWithPast(
            loss=total_loss,
            logits=logits_all[0],
            loss_all=loss_all,
            logits_all=logits_all,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        output_only: bool = True,
        **kwargs,
    ):
        batch_size, seq_len, channels = input_ids.shape
        start_id = seq_len - channels + 1
        outputs = super().generate(input_ids, **kwargs)
        return_dict_in_generate = kwargs.get("return_dict_in_generate", False)
        if return_dict_in_generate:
            output_ids = outputs["sequences"]
        else:
            output_ids = outputs
        if output_only:
            output_ids = output_ids[:, start_id:]
            if return_dict_in_generate:
                outputs["sequences"] = output_ids
            else:
                outputs = output_ids
        return outputs

    def _sample(
        self,
        input_ids: torch.LongTensor,
        logits_processor: LogitsProcessorList,
        stopping_criteria: StoppingCriteriaList,
        generation_config: GenerationConfig,
        synced_gpus: bool,
        streamer: Optional["BaseStreamer"],
        **model_kwargs,
    ) -> Union[GenerateDecoderOnlyOutput, torch.LongTensor]:
        # Extract configuration parameters
        speech_pad_idx = self.config.speech_pad_token

        eos_token_id = generation_config.eos_token_id
        output_attentions = generation_config.output_attentions
        output_hidden_states = generation_config.output_hidden_states
        output_scores = generation_config.output_scores
        output_logits = generation_config.output_logits
        return_dict_in_generate = generation_config.return_dict_in_generate
        max_length = generation_config.max_length
        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
        do_sample = generation_config.do_sample

        # Initialize output tuples
        scores = () if (return_dict_in_generate and output_scores) else None
        raw_logits = () if (return_dict_in_generate and output_logits) else None
        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

        # Initialize tracking variables
        batch_size, cur_len, channels = input_ids.shape  # channels = 8
        this_peer_finished = False
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
        needs_additional_steps = -1 * torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
        tf_inputs = input_ids[:]
        input_ids = input_ids[:, :-(channels - 1)]
        model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, :-(channels - 1)]
        base_length = input_ids.shape[1]
        model_kwargs = self._get_initial_cache_position(base_length, input_ids.device, model_kwargs)

        # Set up per-channel logits processors
        if getattr(generation_config, "do_samples", None) is not None:
            do_samples = generation_config.do_samples
            realprocessor = [LogitsProcessorList() for _ in range(channels)]
            for i, layer_config in enumerate(generation_config.layers):
                if layer_config.get("repetition_penalty") is not None:
                    realprocessor[i].append(RepetitionPenaltyLogitsProcessor(penalty=layer_config.get("repetition_penalty")))
                if layer_config.get("temperature") is not None:
                    realprocessor[i].append(TemperatureLogitsWarper(temperature=layer_config.get("temperature")))
                if layer_config.get("top_k") is not None:
                    realprocessor[i].append(TopKLogitsWarper(top_k=layer_config.get("top_k")))
                if layer_config.get("top_p") is not None:
                    realprocessor[i].append(TopPLogitsWarper(top_p=layer_config.get("top_p")))
        else:
            do_samples = [do_sample for _ in range(channels)]
            realprocessor = [logits_processor for _ in range(channels)]

        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
            # Prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
            # Forward pass
            outputs = self(**model_inputs, return_dict=True)
            model_kwargs = self._update_model_kwargs_for_generation(outputs, model_kwargs)

            if synced_gpus and this_peer_finished:
                continue

            # Get next-token logits for every channel
            next_token_logits = [logits[:, -1, :].clone().float().to(input_ids.device) for logits in outputs.logits_all]
            for i, channel_logits in enumerate(next_token_logits):
                if i != 0 and input_ids.shape[1] + 1 > tf_inputs.shape[1] - 7 + i:
                    channel_logits[:, 1024] = -torch.inf  # speech pad token
                if i == 0 and input_ids.shape[1] + 1 <= tf_inputs.shape[1]:
                    channel_logits[:, 152694] = -torch.inf  # speech EOS token
            next_token_scores = [realprocessor[i](input_ids[..., i], logits) for i, logits in enumerate(next_token_logits)]
            # Sample (or greedily select) the next token for each channel
            next_tokens = []
            for i, channel_score in enumerate(next_token_scores):
                if do_samples[i]:
                    channel_ntk = torch.multinomial(nn.functional.softmax(channel_score, dim=-1), num_samples=1).squeeze(1)
                else:
                    channel_ntk = torch.argmax(channel_score, dim=-1)
                next_tokens.append(channel_ntk)
            next_tokens = torch.stack(next_tokens, dim=-1)  # [batch_size, channels]
            # Additional-steps logic: once channel 0 stops emitting speech tokens,
            # the remaining channels still need channels - 1 steps to flush
            indices = (~self.is_speech_token(next_tokens[:, 0])) & (needs_additional_steps < 0)
            needs_additional_steps[indices] = channels - 1  # for 8 channels, 7 extra steps

            if input_ids.shape[1] + 1 <= tf_inputs.shape[1]:
                i = input_ids.shape[1] + 1 - base_length
                next_tokens[:, i:] = tf_inputs[:, input_ids.shape[1], i:]

            # Replace tokens during the additional steps
            mask = (needs_additional_steps > 0) & (needs_additional_steps < 7)
            if mask.any().item():
                next_tokens[mask, 0] = self.config.eos_token_id
                for i in range(1, channels):
                    mask_i = mask & (needs_additional_steps < channels - i)
                    next_tokens[mask_i, i] = speech_pad_idx

            if has_eos_stopping_criteria:
                for i in range(channels):
                    pddp = self.config.eos_token_id if i == 0 else speech_pad_idx
                    next_tokens[:, i] = next_tokens[:, i] * unfinished_sequences + pddp * (1 - unfinished_sequences)

            input_ids = torch.cat([input_ids, next_tokens[:, None, :]], dim=1)
            if streamer is not None:
                streamer.put(next_tokens[:, 0].cpu())

            # Update unfinished_sequences
            needs_additional_steps = torch.where(needs_additional_steps > 0, needs_additional_steps - 1, needs_additional_steps)
            stopping = stopping_criteria(input_ids[..., 0], scores) | (needs_additional_steps == 0)
            unfinished_sequences = unfinished_sequences & ~stopping
            unfinished_sequences = unfinished_sequences | (needs_additional_steps > 0)
            this_peer_finished = unfinished_sequences.max() == 0

            if return_dict_in_generate:
                if output_scores:
                    scores += (next_token_scores,)
                if output_logits:
                    raw_logits += (next_token_logits,)
                if output_attentions:
                    decoder_attentions += (outputs.attentions,)
                if output_hidden_states:
                    decoder_hidden_states += (outputs.hidden_states,)

            cur_len += 1
            del outputs

        if streamer is not None:
            streamer.end()

        if return_dict_in_generate:
            return GenerateDecoderOnlyOutput(
                sequences=input_ids,
                scores=scores,
                logits=raw_logits,
                attentions=decoder_attentions,
                hidden_states=decoder_hidden_states,
                past_key_values=model_kwargs.get("past_key_values"),
            )
        else:
            return input_ids
```
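The core trick in `AsteroidTTSModel._prepare_multi_modal_inputs` is that every position holds `channels` parallel tokens whose embeddings are summed into a single hidden vector before entering the Qwen3 backbone. A self-contained toy sketch of that summation (hypothetical sizes, independent of the classes above):

```python
import torch
import torch.nn as nn

# Toy illustration of the per-channel embedding sum: channel 0 uses the text
# vocabulary, the remaining channels use the speech codec vocabulary.
hidden_size, text_vocab, speech_vocab, channels = 16, 100, 50, 8
embeds = nn.ModuleList(
    [nn.Embedding(text_vocab, hidden_size)]
    + [nn.Embedding(speech_vocab, hidden_size) for _ in range(channels - 1)]
)

input_ids = torch.zeros(2, 5, channels, dtype=torch.long)  # (batch, seq, channels)
inputs_embeds = sum(embeds[i](input_ids[..., i]) for i in range(channels))
print(inputs_embeds.shape)  # torch.Size([2, 5, 16]): one summed vector per position
```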
modeling_moss_ttsd.py
ADDED
@@ -0,0 +1,611 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2025 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""PyTorch MOSS-TTSD model."""
|
16 |
+
|
17 |
+
from dataclasses import dataclass
|
18 |
+
from typing import Optional, Union
|
19 |
+
|
20 |
+
from transformers.cache_utils import Cache
|
21 |
+
from transformers.generation import GenerationConfig, GenerationMixin, LogitsProcessorList, StoppingCriteriaList
|
22 |
+
from transformers.generation.logits_process import (
|
23 |
+
RepetitionPenaltyLogitsProcessor,
|
24 |
+
TemperatureLogitsWarper,
|
25 |
+
TopKLogitsWarper,
|
26 |
+
TopPLogitsWarper,
|
27 |
+
)
|
28 |
+
from transformers.generation.streamers import BaseStreamer
|
29 |
+
from transformers.generation.utils import GenerateDecoderOnlyOutput
|
30 |
+
from transformers.loss.loss_utils import ForCausalLMLoss
|
31 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast
|
32 |
+
from transformers.modeling_utils import PreTrainedModel
|
33 |
+
from transformers.models.qwen3.modeling_qwen3 import Qwen3Model
|
34 |
+
from transformers.utils import ModelOutput, auto_docstring, is_torch_available
|
35 |
+
from .configuration_moss_ttsd import MossTTSDConfig
|
36 |
+
|
37 |
+
|
38 |
+
if is_torch_available():
|
39 |
+
import torch
|
40 |
+
import torch.nn as nn
|
41 |
+
|
42 |
+
_CHECKPOINT_FOR_DOC = "fnlp/MOSS-TTSD-v0.5"
|
43 |
+
|
44 |
+
|
45 |
+
@dataclass
|
46 |
+
@auto_docstring(
|
47 |
+
custom_intro="""
|
48 |
+
Base class for MOSS-TTSD outputs, with hidden states and attentions.
|
49 |
+
"""
|
50 |
+
)
|
51 |
+
class MossTTSDOutputWithPast(ModelOutput):
|
52 |
+
"""Base class for MOSS-TTSD outputs with past key values."""
|
53 |
+
|
54 |
+
loss: Optional[torch.FloatTensor] = None
|
55 |
+
logits: torch.FloatTensor = None
|
56 |
+
loss_all: Optional[tuple[torch.FloatTensor, ...]] = None
|
57 |
+
logits_all: Optional[tuple[torch.FloatTensor, ...]] = None
|
58 |
+
past_key_values: Optional[tuple[tuple[torch.FloatTensor, ...], ...]] = None
|
59 |
+
hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
|
60 |
+
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
|
61 |
+
|
62 |
+
|
63 |
+
@dataclass
|
64 |
+
@auto_docstring(
|
65 |
+
custom_intro="""
|
66 |
+
Base class for MOSS-TTSD causal language model (or autoregressive) outputs.
|
67 |
+
"""
|
68 |
+
)
|
69 |
+
class MossTTSDCausalLMOutputWithPast(ModelOutput):
|
70 |
+
r"""
|
71 |
+
Base class for MOSS-TTSD causal language model outputs.
|
72 |
+
|
73 |
+
Args:
|
74 |
+
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
75 |
+
Language modeling loss (for next-token prediction).
|
76 |
+
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
77 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
78 |
+
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
79 |
+
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
80 |
+
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
81 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
82 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
83 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
84 |
+
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
85 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
86 |
+
sequence_length)`.
|
87 |
+
"""
|
88 |
+
|
89 |
+
loss: Optional[torch.FloatTensor] = None
|
90 |
+
logits: torch.FloatTensor = None
|
91 |
+
past_key_values: Optional[Cache] = None
|
92 |
+
hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
|
93 |
+
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
|
94 |
+
|
95 |
+
|
96 |
+
class MossTTSDGenerationMixin(GenerationMixin):
|
97 |
+
"""
|
98 |
+
Generation mixin for MossTTSD model with multi-channel support.
|
99 |
+
"""
|
100 |
+
|
101 |
+
def _setup_channel_processors(
|
102 |
+
self, generation_config: GenerationConfig, channels: int
|
103 |
+
) -> list[LogitsProcessorList]:
|
104 |
+
"""Setup logits processors for each channel based on generation config."""
|
105 |
+
realprocessor = [LogitsProcessorList() for _ in range(channels)]
|
106 |
+
|
107 |
+
if hasattr(generation_config, "layers"):
|
108 |
+
for i, layer_config in enumerate(generation_config.layers):
|
109 |
+
if i >= channels:
|
110 |
+
break
|
111 |
+
|
112 |
+
if layer_config.get("repetition_penalty") is not None:
|
113 |
+
realprocessor[i].append(
|
114 |
+
RepetitionPenaltyLogitsProcessor(penalty=layer_config.get("repetition_penalty"))
|
115 |
+
)
|
116 |
+
if layer_config.get("temperature") is not None:
|
117 |
+
realprocessor[i].append(TemperatureLogitsWarper(temperature=layer_config.get("temperature")))
|
118 |
+
if layer_config.get("top_k") is not None:
|
119 |
+
realprocessor[i].append(TopKLogitsWarper(top_k=layer_config.get("top_k")))
|
120 |
+
if layer_config.get("top_p") is not None:
|
121 |
+
realprocessor[i].append(TopPLogitsWarper(top_p=layer_config.get("top_p")))
|
122 |
+
|
123 |
+
return realprocessor
|
124 |
+
|
125 |
+
def _generate_next_tokens_with_scores(
|
126 |
+
self,
|
127 |
+
logits_all: tuple[torch.Tensor, ...],
|
128 |
+
input_ids: torch.LongTensor,
|
129 |
+
tf_inputs: torch.LongTensor,
|
130 |
+
channels: int,
|
131 |
+
realprocessor: list[LogitsProcessorList],
|
132 |
+
do_samples: list[bool],
|
133 |
+
speech_pad_idx: int,
|
134 |
+
) -> tuple[torch.LongTensor, tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]]:
|
135 |
+
"""Generate next tokens for all channels with scores and logits."""
|
136 |
+
# Get next token logits
|
137 |
+
next_token_logits = tuple(logits[:, -1, :].clone().float().to(input_ids.device) for logits in logits_all)
|
138 |
+
|
139 |
+
# Apply channel-specific constraints
|
140 |
+
for i, channel_logits in enumerate(next_token_logits):
|
141 |
+
if i != 0 and input_ids.shape[1] + 1 > tf_inputs.shape[1] - 7 + i:
|
142 |
+
channel_logits[:, speech_pad_idx] = -torch.inf
|
143 |
+
if i == 0 and input_ids.shape[1] + 1 <= tf_inputs.shape[1]:
|
144 |
+
channel_logits[:, self.config.speech_eos_token] = -torch.inf
|
145 |
+
|
146 |
+
# Process logits
|
147 |
+
next_token_scores = tuple(
|
148 |
+
realprocessor[i](input_ids[..., i], logits) for i, logits in enumerate(next_token_logits)
|
149 |
+
)
|
150 |
+
|
151 |
+
# Sample or select tokens
|
152 |
+
next_tokens = []
|
153 |
+
for i, channel_score in enumerate(next_token_scores):
|
154 |
+
if do_samples[i]:
|
155 |
+
channel_ntk = torch.multinomial(nn.functional.softmax(channel_score, dim=-1), num_samples=1).squeeze(1)
|
156 |
+
else:
|
157 |
+
channel_ntk = torch.argmax(channel_score, dim=-1)
|
158 |
+
next_tokens.append(channel_ntk)
|
159 |
+
|
160 |
+
return torch.stack(next_tokens, dim=-1), next_token_scores, next_token_logits
|
161 |
+
|
162 |
+
def _process_multi_channel_tokens(
|
163 |
+
self,
|
164 |
+
next_tokens: torch.LongTensor,
|
165 |
+
needs_additional_steps: torch.LongTensor,
|
166 |
+
input_ids: torch.LongTensor,
|
167 |
+
tf_inputs: torch.LongTensor,
|
168 |
+
base_length: int,
|
169 |
+
channels: int,
|
170 |
+
eos_token_id: Optional[int],
|
171 |
+
speech_pad_idx: int,
|
172 |
+
unfinished_sequences: torch.LongTensor,
|
173 |
+
has_eos_stopping_criteria: bool,
|
174 |
+
) -> tuple[torch.LongTensor, torch.LongTensor]:
|
175 |
+
"""Process tokens for multi-channel TTS generation."""
|
176 |
+
# Additional steps logic
|
177 |
+
indices = (~self.is_speech_token(next_tokens[:, 0])) & (needs_additional_steps < 0)
|
178 |
+
needs_additional_steps[indices] = channels - 1 # For 8 channels, need 7 steps
|
179 |
+
|
180 |
+
if input_ids.shape[1] + 1 <= tf_inputs.shape[1]:
|
181 |
+
i = input_ids.shape[1] + 1 - base_length
|
182 |
+
next_tokens[:, i:] = tf_inputs[:, input_ids.shape[1], i:]
|
183 |
+
|
184 |
+
# Replace tokens in additional steps
|
185 |
+
mask = (needs_additional_steps > 0) & (needs_additional_steps < 7)
|
186 |
+
if mask.any().item():
|
187 |
+
next_tokens[mask, 0] = eos_token_id
|
188 |
+
for i in range(1, channels):
|
189 |
+
mask_i = mask & (needs_additional_steps < channels - i)
|
190 |
+
next_tokens[mask_i, i] = speech_pad_idx
|
191 |
+
|
192 |
+
if has_eos_stopping_criteria:
|
193 |
+
for i in range(channels):
|
194 |
+
pddp = eos_token_id if i == 0 else speech_pad_idx
|
195 |
+
next_tokens[:, i] = next_tokens[:, i] * unfinished_sequences + pddp * (1 - unfinished_sequences)
|
196 |
+
|
197 |
+
return next_tokens, needs_additional_steps
|
198 |
+
|
199 |
+
def _sample(
|
200 |
+
self,
|
201 |
+
input_ids: torch.LongTensor,
|
202 |
+
logits_processor: LogitsProcessorList,
|
203 |
+
stopping_criteria: StoppingCriteriaList,
|
204 |
+
generation_config: GenerationConfig,
|
205 |
+
synced_gpus: bool,
|
206 |
+
streamer: Optional[BaseStreamer],
|
207 |
+
**model_kwargs,
```python
    ) -> Union[GenerateDecoderOnlyOutput, torch.LongTensor]:
        """Sampling loop for multi-channel TTS generation."""
        # Extract configuration parameters
        speech_pad_idx = getattr(self.config, "speech_pad_token", 1024)
        eos_token_id = generation_config.eos_token_id
        channels = getattr(self.config, "channels", 8)

        # Generation config parameters
        output_attentions = generation_config.output_attentions
        output_hidden_states = generation_config.output_hidden_states
        output_scores = generation_config.output_scores
        output_logits = generation_config.output_logits
        return_dict_in_generate = generation_config.return_dict_in_generate
        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
        do_sample = generation_config.do_sample

        # Initialize output tuples
        scores = () if (return_dict_in_generate and output_scores) else None
        raw_logits = () if (return_dict_in_generate and output_logits) else None
        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

        # Initialize tracking variables
        batch_size, cur_len, input_channels = input_ids.shape
        this_peer_finished = False
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
        needs_additional_steps = -1 * torch.ones(batch_size, dtype=torch.long, device=input_ids.device)

        # Adjust input for generation: drop the last (channels - 1) shifted flush rows
        tf_inputs = input_ids.clone()
        input_ids = input_ids[:, : -(channels - 1)]
        cur_len = input_ids.shape[1]
        model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, : -(channels - 1)]
        base_length = input_ids.shape[1]
        model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)

        # Set up per-channel logits processors and sampling flags
        if hasattr(generation_config, "do_samples") and generation_config.do_samples is not None:
            do_samples = generation_config.do_samples
            realprocessor = self._setup_channel_processors(generation_config, channels)
        else:
            do_samples = [do_sample for _ in range(channels)]
            realprocessor = [logits_processor for _ in range(channels)]

        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
            # Prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})

            # Forward pass
            outputs = self(**model_inputs, return_dict=True)
            model_kwargs = self._update_model_kwargs_for_generation(outputs, model_kwargs)

            if synced_gpus and this_peer_finished:
                continue

            # Generate next tokens for all channels
            next_tokens, next_token_scores, next_token_logits = self._generate_next_tokens_with_scores(
                outputs.logits_all, input_ids, tf_inputs, channels, realprocessor, do_samples, speech_pad_idx
            )
            # Process tokens for multi-channel TTS
            next_tokens, needs_additional_steps = self._process_multi_channel_tokens(
                next_tokens,
                needs_additional_steps,
                input_ids,
                tf_inputs,
                base_length,
                channels,
                eos_token_id,
                speech_pad_idx,
                unfinished_sequences,
                has_eos_stopping_criteria,
            )

            input_ids = torch.cat([input_ids, next_tokens[:, None, :]], dim=1)
            if streamer is not None:
                streamer.put(next_tokens[:, 0].cpu())

            # Update unfinished_sequences: a sequence keeps running while it still
            # owes flush steps for the delayed channels
            needs_additional_steps = torch.where(
                needs_additional_steps > 0, needs_additional_steps - 1, needs_additional_steps
            )
            stopping = stopping_criteria(input_ids[..., 0], scores) | (needs_additional_steps == 0)
            unfinished_sequences = unfinished_sequences & ~stopping
            unfinished_sequences = unfinished_sequences | (needs_additional_steps > 0)
            this_peer_finished = unfinished_sequences.max() == 0

            if return_dict_in_generate:
                if output_scores:
                    scores += (next_token_scores,)
                if output_logits:
                    raw_logits += (next_token_logits,)
                if output_attentions:
                    decoder_attentions += (outputs.attentions,)
                if output_hidden_states:
                    decoder_hidden_states += (outputs.hidden_states,)

            cur_len += 1
            del outputs

        if streamer is not None:
            streamer.end()

        if return_dict_in_generate:
            return GenerateDecoderOnlyOutput(
                sequences=input_ids,
                scores=scores,
                logits=raw_logits,
                attentions=decoder_attentions,
                hidden_states=decoder_hidden_states,
                past_key_values=model_kwargs.get("past_key_values"),
            )
        else:
            return input_ids
```
```python
    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        output_only: bool = True,
        **kwargs,
    ):
        """Generate token sequences; optionally return only the newly generated rows."""
        batch_size, seq_len, channels = input_ids.shape
        # The effective prompt ends (channels - 1) shifted rows before seq_len,
        # so the first row produced by the sampling loop is at this index.
        start_id = seq_len - channels + 1
        outputs = super().generate(input_ids, **kwargs)
        return_dict_in_generate = kwargs.get("return_dict_in_generate", False)
        if return_dict_in_generate:
            output_ids = outputs["sequences"]
        else:
            output_ids = outputs
        if output_only:
            output_ids = output_ids[:, start_id:]
            if return_dict_in_generate:
                outputs["sequences"] = output_ids
            else:
                outputs = output_ids
        return outputs
```
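The `output_only` slicing is pure index arithmetic; a hypothetical toy check (shapes invented for illustration, not taken from the shipped file):

```python
import torch

seq_len, channels = 12, 8
start_id = seq_len - channels + 1            # = 5
# _sample() trims (channels - 1) = 7 flush rows off the prompt, so rows
# start_id onward are exactly what the sampling loop produced.
full = torch.zeros(1, seq_len + 5, channels, dtype=torch.long)   # prompt + 5 new steps
print(full[:, start_id:].shape)              # torch.Size([1, 12, 8]) -- generate(output_only=True)
```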
```python
class MossTTSDPretrainedModel(PreTrainedModel):
    """Base class for MOSS-TTSD pretrained models."""

    config_class = MossTTSDConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True


class MossTTSDModel(MossTTSDPretrainedModel):
    """MOSS-TTSD model for text-to-speech synthesis."""

    def __init__(self, config: MossTTSDConfig):
        super().__init__(config)
        self.text_pad_idx = config.pad_token_id
        self.speech_pad_idx = config.speech_pad_token

        self.embedding_list = nn.ModuleList([])
        # Channel 0: text (and mapped speech) tokens
        self.embedding_list.append(nn.Embedding(config.vocab_size, config.hidden_size, self.text_pad_idx))
        # Channels 1 to channels-1: speech tokens only
        for _ in range(1, config.channels):
            self.embedding_list.append(
                nn.Embedding(config.speech_vocab_size, config.hidden_size, self.speech_pad_idx)
            )

        self.language_model = Qwen3Model(config)
        self.post_init()

    def get_input_embeddings(self):
        """Get the input embeddings for the model."""
        return self.embedding_list[0]

    def set_input_embeddings(self, value: nn.Embedding):
        """Set the input embeddings for the model."""
        self.embedding_list[0] = value

    def _prepare_multi_modal_inputs(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
        """
        Prepare multi-modal embeddings from input_ids of shape (batch_size, sequence_length, channels).

        Channel 0 carries text + speech tokens; channels 1 to channels-1 carry speech tokens
        padded with speech_pad_token. The per-channel embeddings are summed.
        """
        batch_size, seq_length, channels = input_ids.shape
        if channels != self.config.channels:
            raise ValueError(f"Expected {self.config.channels} channels, got {channels}")

        inputs_embeds = torch.zeros(
            batch_size,
            seq_length,
            self.config.hidden_size,
            device=input_ids.device,
            dtype=self.embedding_list[0].weight.dtype,
        )
        for i in range(channels):
            embed_layer = self.embedding_list[i]
            channel_input = input_ids[..., i]
            inputs_embeds += embed_layer(channel_input)

        return inputs_embeds

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, BaseModelOutputWithPast]:
        """Forward pass for the MOSS-TTSD model."""
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if input_ids is not None:
            inputs_embeds = self._prepare_multi_modal_inputs(input_ids)

        return self.language_model(
            input_ids=None,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
```
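The summed per-channel embedding scheme can be sanity-checked with standalone `nn.Embedding` tables; a minimal sketch, assuming toy sizes rather than the real config:

```python
import torch
import torch.nn as nn

hidden, channels = 4, 3
embeds = nn.ModuleList(
    [nn.Embedding(10, hidden)] + [nn.Embedding(5, hidden) for _ in range(channels - 1)]
)
input_ids = torch.randint(0, 5, (2, 7, channels))   # (batch, time, channels)

# Same summation as _prepare_multi_modal_inputs: one embedding table per
# channel, all added into a single (batch, time, hidden) tensor.
x = sum(embeds[i](input_ids[..., i]) for i in range(channels))
print(x.shape)  # torch.Size([2, 7, 4])
```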
```python
class MossTTSDForCausalLM(MossTTSDPretrainedModel, MossTTSDGenerationMixin):
    """MOSS-TTSD model for causal language modeling with multi-channel support."""

    _tied_weights_keys = []
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config: MossTTSDConfig):
        super().__init__(config)
        self.model = MossTTSDModel(config)
        self.channels = config.channels
        self.weights = [1 for _ in range(self.channels)]
        self._tied_weights_keys = [f"lm_heads.{i}.weight" for i in range(self.channels)]
        self.vocab_size = config.vocab_size
        self.lm_heads = nn.ModuleList([])
        self.lm_heads.append(nn.Linear(config.hidden_size, config.vocab_size, bias=False))
        for _ in range(1, config.channels):
            self.lm_heads.append(nn.Linear(config.hidden_size, config.speech_vocab_size, bias=False))
        self.post_init()

    def get_input_embeddings(self):
        """Get the input embeddings for the model."""
        return self.model.embedding_list[0]

    def can_generate(self):
        """Check if the model can generate."""
        return True

    def is_speech_token(self, tokens: torch.Tensor) -> torch.Tensor:
        """Check whether tokens fall inside the speech token range."""
        return (tokens >= self.config.speech_token_range[0]) & (tokens < self.config.speech_token_range[1])

    def tie_weights(self):
        """Tie the weights between input embeddings and output embeddings."""
        for i in range(self.config.channels):
            self._tie_or_clone_weights(self.lm_heads[i], self.model.embedding_list[i])

    def set_input_embeddings(self, value: nn.Embedding):
        """Set the input embeddings for the model."""
        self.model.embedding_list[0] = value

    def get_output_embeddings(self):
        """Get the output embeddings for the model."""
        return self.lm_heads[0]

    def set_output_embeddings(self, new_embeddings: nn.Linear):
        """Set the output embeddings for the model."""
        self.lm_heads[0] = new_embeddings

    def set_decoder(self, decoder: MossTTSDModel):
        """Set the decoder for the model."""
        self.model = decoder

    def get_decoder(self):
        """Get the decoder for the model."""
        return self.model

    def set_weights(self, weights: list[float]):
        """Set the per-channel loss weights."""
        self.weights = weights

    def _compute_loss(
        self, hidden_states: torch.Tensor, labels: torch.LongTensor, skip_logits: bool, **kwargs
    ) -> tuple[torch.Tensor, torch.Tensor, Optional[tuple[torch.Tensor, ...]]]:
        """Compute the loss for all channels."""
        device = hidden_states.device
        loss_all = torch.empty(self.channels, device=device)
        logits_list = []

        for i in range(self.config.channels):
            vocab_size = self.config.vocab_size if i == 0 else self.config.speech_vocab_size
            logits = self.lm_heads[i](hidden_states)
            loss_all[i] = ForCausalLMLoss(logits, labels[..., i], vocab_size)
            if not skip_logits:
                logits_list.append(logits)

        logits_all = tuple(logits_list) if logits_list else None

        # Compute the weighted total loss
        total_weight = sum(self.weights)
        normalized_weights = [w / total_weight for w in self.weights]
        total_loss = sum(w * loss for w, loss in zip(normalized_weights, loss_all))

        return total_loss, loss_all, logits_all

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        skip_logits: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, MossTTSDOutputWithPast]:
        """Forward pass for the MOSS-TTSD causal language model."""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        skip_logits = skip_logits if skip_logits is not None else (self.training and labels is not None)
        if skip_logits and labels is None:
            skip_logits = False

        # Decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]

        total_loss = None
        loss_all = None
        if labels is not None:
            total_loss, loss_all, logits_all = self._compute_loss(hidden_states, labels, skip_logits, **kwargs)
        else:
            logits_all = [lm_head(hidden_states) for lm_head in self.lm_heads]

        if not return_dict:
            output = (logits_all,) + outputs[1:]
            return ((total_loss, loss_all) + output) if total_loss is not None else output

        return MossTTSDOutputWithPast(
            loss=total_loss,
            logits=logits_all[0] if logits_all is not None else None,
            loss_all=loss_all,
            logits_all=logits_all,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["MossTTSDModel", "MossTTSDForCausalLM"]
```
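The channel weighting in `_compute_loss` is a convex combination of the per-channel losses; a toy verification (numbers invented):

```python
import torch

weights = [1] * 8                                  # default per-channel weights
loss_all = torch.tensor([2.0] + [1.0] * 7)         # invented per-channel losses

total_weight = sum(weights)
normalized = [w / total_weight for w in weights]
total_loss = sum(w * l for w, l in zip(normalized, loss_all))
print(float(total_loss))                           # 1.125 = (2 + 7 * 1) / 8
```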
processing_moss_ttsd.py
ADDED
@@ -0,0 +1,914 @@
```python
# coding=utf-8
# Copyright 2025 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for MOSS-TTSD.
"""

from __future__ import annotations

import math
import os
import re
from dataclasses import asdict, dataclass
from typing import Any, Callable, Optional, Union

import numpy as np

from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import BatchEncoding
from transformers.utils import is_torch_available, is_torchaudio_available
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoModel

# from transformers.models.xy_tokenizer.modeling_xy_tokenizer import XYTokenizer


if is_torch_available():
    import torch

if is_torchaudio_available():
    import torchaudio


class MossTTSDProcessorKwargs(ProcessingKwargs, total=False):
    """
    Arguments for configuring MOSS-TTSD processing operations.

    Inherits from ProcessingKwargs and provides structured configuration for text and audio processing.
    """

    _defaults = {
        "text_kwargs": {
            "pad_token_id": 0,  # Fallback pad token ID; the actual value comes from tokenizer.pad_token_id
        },
        "audio_kwargs": {
            "max_channels": 8,  # Maximum number of quantization channels
            "audio_pad_token_id": 1024,  # Padding token ID for non-text channels
            "silence_duration": 0.0,  # Duration of silence to append for encoder segmentation
            "input_sample_rate": 16000,  # Input audio sampling rate (fallback, inferred from audio_tokenizer.config)
            "encoder_downsample_rate": 320,  # Encoder downsampling rate (fallback, inferred from audio_tokenizer.config)
            "speech_token_range": [151665, 152689],  # Token range for speech tokens (first codebook offset mapping)
            "audio_bos_token": "<|begin_of_speech|>",
            "audio_eos_token": "<|end_of_speech|>",
        },
        "common_kwargs": {
            "return_tensors": "pt",
            "padding": True,
            "use_normalize": False,
        },
    }


@dataclass
class MossTTSDChatSample:
    """
    Intermediate representation of a single sample with T×C grid layout and metadata.

    Args:
        input_ids_2d (`torch.LongTensor`):
            Shape (T, C) tensor where column 0 contains text tokens and columns 1..C-1 contain
            quantized audio codebooks (or the padding token 1024 for empty slots).
        label_ids_2d (`torch.LongTensor`, *optional*):
            Optional label tensor for training, same shape as input_ids_2d.
        meta (`dict`):
            Dictionary containing metadata for debugging and tracking purposes.
    """

    input_ids_2d: "torch.LongTensor"
    label_ids_2d: Optional["torch.LongTensor"]
    meta: dict


@dataclass
class MossTTSDBatchInput:
    """
    Batched input tensors for the MOSS-TTSD model.

    Args:
        input_ids (`torch.LongTensor`):
            Shape (B, T, C) tensor containing batched input token IDs.
        attention_mask (`torch.LongTensor`):
            Shape (B, T) tensor containing the attention mask for valid tokens.
        labels (`torch.LongTensor`, *optional*):
            Optional shape (B, T, C) tensor containing label token IDs for training.
    """

    input_ids: "torch.LongTensor"
    attention_mask: "torch.LongTensor"
    labels: Optional["torch.LongTensor"]


@dataclass
class MossTTSDResponse:
    """
    Unified response container for MOSS-TTSD inference outputs.

    Args:
        audio (`np.ndarray`, *optional*):
            Optional numpy array containing the generated audio waveform.
        generated_text (`str`, *optional*, defaults to `""`):
            String containing the generated text output.
        sampling_rate (`int`, *optional*):
            Optional integer specifying the sampling rate of the generated audio.
    """

    audio: Optional[np.ndarray] = None
    generated_text: str = ""
    sampling_rate: Optional[int] = None
```
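Since `decode` (further below) returns a `MossTTSDResponse`, downstream code can handle text and audio uniformly; a hypothetical consumer, assuming `soundfile` is installed:

```python
import soundfile as sf

def dump_response(resp, path="out.wav"):
    # resp is a MossTTSDResponse: generated_text is always set, audio may be None
    print(resp.generated_text)
    if resp.audio is not None:
        # soundfile expects (time,) or (time, channels); the decoded audio is mono
        sf.write(path, resp.audio.squeeze(), resp.sampling_rate)
```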
```python
class MossTTSDSampleProcessor:
    """
    Sample-level processor for MOSS-TTSD that handles individual samples without batch padding.

    This class handles the per-sample processing logic:
    - Parses JSONL items (text/prompt_text/prompt_audio)
    - Optional text normalization
    - Audio loading/resampling/merging, feature extraction and encoding
    - Generates the T×C grid and performs multi-channel shifting

    Args:
        tokenizer (`AutoTokenizer`):
            The text tokenizer for encoding text tokens.
        feature_extractor (`AutoFeatureExtractor`, *optional*):
            Optional feature extractor for audio preprocessing.
        audio_tokenizer (`AutoModel`, *optional*):
            Optional audio tokenizer for audio encoding/decoding.
        chat_template (`str`, *optional*):
            Optional chat template string for conversation formatting.
        speech_token_range (`List[int]`):
            List of [start, end] token IDs for speech token mapping.
        audio_bos_token (`str`):
            Beginning-of-speech token string.
        audio_eos_token (`str`):
            End-of-speech token string.
        audio_pad_token_id (`int`):
            Padding token ID for audio channels.
        max_channels (`int`):
            Maximum number of quantization channels.
        input_sample_rate (`int`):
            Target sample rate for input audio.
        encoder_downsample_rate (`int`):
            Downsampling rate of the audio encoder.
    """

    def __init__(
        self,
        tokenizer,
        feature_extractor: Optional[Any] = None,
        audio_tokenizer: Optional[Any] = None,
        *,
        chat_template: Optional[str],
        speech_token_range: list[int],
        audio_bos_token: str,
        audio_eos_token: str,
        audio_pad_token_id: int,
        max_channels: int,
        input_sample_rate: int,
        encoder_downsample_rate: int,
    ) -> None:
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.audio_tokenizer = audio_tokenizer
        self.chat_template = chat_template
        self.speech_token_range = speech_token_range
        self.audio_bos_token = audio_bos_token
        self.audio_eos_token = audio_eos_token
        self.audio_pad_token_id = audio_pad_token_id
        self.max_channels = max_channels
        self.input_sample_rate = input_sample_rate
        self.encoder_downsample_rate = encoder_downsample_rate

    def prepare_sample(
        self,
        item: dict[str, Any],
        *,
        apply_chat_template: Callable[[str, dict], str],
        use_normalize: bool = False,
        silence_duration: float = 0.0,
        **kwargs,
    ) -> MossTTSDChatSample:
        """
        Prepare a single sample from a JSONL item into MossTTSDChatSample format.

        Args:
            item (`dict`):
                Dictionary containing the input data (text, prompt_audio, etc.).
            apply_chat_template (`callable`):
                Function to apply chat template formatting.
            use_normalize (`bool`, *optional*, defaults to `False`):
                Whether to apply text normalization.
            silence_duration (`float`, *optional*, defaults to `0.0`):
                Duration of silence to append to audio for encoder segmentation.
            **kwargs:
                Additional keyword arguments passed to the chat template.

        Returns:
            `MossTTSDChatSample`: Processed sample with a 2D input tensor and metadata.
        """
        processed = self._process_jsonl_item(item)
        system_prompt = item.get("system_prompt")
        if isinstance(system_prompt, str):
            kwargs["system_prompt"] = system_prompt

        full_text = (processed["prompt_text"] or "") + processed["text"]
        original_full_text = full_text
        if use_normalize:
            full_text = self._normalize_text(full_text)
        final_text = full_text.replace("[S1]", "<speaker1>").replace("[S2]", "<speaker2>")

        # Load and resample audio (may be None)
        wav = self._process_audio_data(processed["prompt_audio"], target_sample_rate=self.input_sample_rate)

        # Assemble into a grid (T, C)
        inputs_2d = self._build_inputs(
            text=final_text,
            audio_data=wav,
            apply_chat_template=apply_chat_template,
            silence_duration=silence_duration,
            **kwargs,
        )
        inputs_2d = self._shift_inputs(
            inputs_2d, pad_token_id=self.tokenizer.pad_token_id, max_channels=self.max_channels
        )

        meta = {
            "original_text": original_full_text,
            "normalized_text": self._normalize_text(original_full_text) if use_normalize else None,
            "final_text": final_text,
            "use_normalize": use_normalize,
        }
        ids_t = torch.tensor(inputs_2d, dtype=torch.long)
        return MossTTSDChatSample(input_ids_2d=ids_t, label_ids_2d=None, meta=meta)

    def collate(
        self,
        samples: list[MossTTSDChatSample],
        *,
        pad_token_id: int,
        audio_pad_token_id: int,
    ) -> MossTTSDBatchInput:
        """
        Collate multiple samples into a batch with proper (left-side) padding.

        Args:
            samples (`List[MossTTSDChatSample]`):
                List of MossTTSDChatSample objects to collate.
            pad_token_id (`int`):
                Padding token ID for text tokens.
            audio_pad_token_id (`int`):
                Padding token ID for audio tokens.

        Returns:
            `MossTTSDBatchInput`: Batched input with padded tensors.
        """
        assert is_torch_available(), "PyTorch is required for collation."
        ids_list = [s.input_ids_2d for s in samples]
        labels_list = [s.label_ids_2d for s in samples]

        C = ids_list[0].shape[1]
        max_len = max(x.shape[0] for x in ids_list)
        padded_ids, padded_labels, padded_attn = [], [], []

        for ids, labels in zip(ids_list, labels_list):
            pad_len = max_len - ids.shape[0]
            pad_grid = torch.full((pad_len, C), audio_pad_token_id, dtype=torch.long)
            pad_grid[:, 0] = pad_token_id  # Text column uses the tokenizer pad
            ids_padded = torch.cat([pad_grid, ids], dim=0)
            padded_ids.append(ids_padded)

            attn = torch.ones(ids.shape[0], dtype=torch.long)
            a_pad = torch.zeros(pad_len, dtype=torch.long)
            padded_attn.append(torch.cat([a_pad, attn], dim=0))

            if labels is None:
                padded_labels.append(None)
            else:
                lab_pad = torch.full((pad_len, C), audio_pad_token_id, dtype=torch.long)
                lab_pad[:, 0] = -100  # Text labels are ignored by default
                padded_labels.append(torch.cat([lab_pad, labels], dim=0))

        input_ids = torch.stack(padded_ids)  # (B, T, C)
        attention_mask = torch.stack(padded_attn)  # (B, T)
        labels = (
            torch.stack([l if l is not None else torch.full_like(input_ids[0], -100) for l in padded_labels])
            if any(l is not None for l in padded_labels)
            else None
        )

        return MossTTSDBatchInput(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
```
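Note that `collate` pads on the left, so the attention mask is zeros followed by ones; a toy illustration of just that masking step (lengths invented):

```python
import torch

lengths, max_len = [3, 5], 5
for T in lengths:
    pad_len = max_len - T
    mask = torch.cat([torch.zeros(pad_len, dtype=torch.long),
                      torch.ones(T, dtype=torch.long)])
    print(mask.tolist())
# [0, 0, 1, 1, 1]
# [1, 1, 1, 1, 1]
```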
```python
    @staticmethod
    def _process_jsonl_item(item: dict[str, Any]) -> dict[str, Any]:
        """
        Process a JSONL item to extract text and audio data.

        Supports both single-speaker and multi-speaker formats:
        - Single: {"prompt_audio": path, "prompt_text": text}
        - Multi: {"prompt_audio_speaker1": path1, "prompt_text_speaker1": text1, ...}

        Args:
            item: Dictionary containing the JSONL item data.

        Returns:
            Dictionary with extracted "text", "prompt_text", and "prompt_audio" fields.
        """
        base_path = item.get("base_path", "")
        text = item.get("text", "")

        prompt_audio = None
        prompt_text = ""

        if "prompt_audio" in item and "prompt_text" in item:
            pa = item["prompt_audio"]
            if pa:
                prompt_audio = os.path.join(base_path, pa) if isinstance(pa, str) and base_path else pa
                prompt_text = item.get("prompt_text", "")
        else:
            pa1, pt1 = item.get("prompt_audio_speaker1", ""), item.get("prompt_text_speaker1", "")
            pa2, pt2 = item.get("prompt_audio_speaker2", ""), item.get("prompt_text_speaker2", "")
            has1 = (isinstance(pa1, str) and pa1) or isinstance(pa1, tuple)
            has2 = (isinstance(pa2, str) and pa2) or isinstance(pa2, tuple)
            if has1 or has2:
                spk1 = os.path.join(base_path, pa1) if isinstance(pa1, str) and base_path and pa1 else pa1
                spk2 = os.path.join(base_path, pa2) if isinstance(pa2, str) and base_path and pa2 else pa2
                prompt_audio = {"speaker1": spk1, "speaker2": spk2}
                tmp = ""
                if pt1:
                    tmp += f"[S1]{pt1}"
                if pt2:
                    tmp += f"[S2]{pt2}"
                prompt_text = tmp.strip()

        return {"text": text, "prompt_text": prompt_text, "prompt_audio": prompt_audio}

    @staticmethod
    def _normalize_text(text: str) -> str:
        """
        Normalize text by applying various transformations for TTS processing.

        Performs speaker-tag conversion, punctuation normalization, laughter conversion,
        and other text-cleaning operations suitable for speech synthesis.

        Args:
            text: Input text string to normalize.

        Returns:
            Normalized text string.
        """
        text = re.sub(r"\[(\d+)\]", r"[S\1]", text)
        remove_chars = '【】《》()『』「」"-""~~'
        text = re.sub(r"\[(?!S\d+\])([^\]]*)\]", r"\1", text)
        segments = re.split(r"(?=\[S\d+\])", text.replace("\n", " "))
        out = []
        for seg in segments:
            seg = seg.strip()
            if not seg:
                continue
            m = re.match(r"^(\[S\d+\])\s*(.*)", seg)
            tag, content = m.groups() if m else ("", seg)
            content = re.sub(f"[{re.escape(remove_chars)}]", "", content)
            content = re.sub(r"哈{2,}", "(笑)", content)  # runs of Chinese "haha" -> "(laughs)" marker
            content = re.sub(r"\b(ha(\s*ha)+)\b", "(laughs)", content, flags=re.IGNORECASE)
            content = content.replace("——", ",").replace("……", ",")
            trans = str.maketrans(
                {"！": ",", "!": ",", "；": ",", ";": ",", "：": ",", ":": ",", "、": ",", "？": ",", "?": ","}
            )
            content = content.translate(trans).strip()
            if len(content) > 1:
                last = "。" if content[-1] == "，" else ("." if content[-1] == "," else content[-1])
                body = content[:-1].replace("。", ",")
                content = body + last
            out.append(f"{tag}{content}".strip())
        return "".join(out)
```
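A quick way to see what `_normalize_text` does is to call the static method directly; the laughter case below is traced against the regexes above (other cases depend on the punctuation tables, so treat them as illustrative):

```python
sp = MossTTSDSampleProcessor  # static method, no instance needed

print(sp._normalize_text("[1]ha ha ha"))
# "[S1](laughs)" -- "[1]" becomes the speaker tag "[S1]", the laughter
# regex collapses the "ha ha ha" run into a single "(laughs)" marker
```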
```python
    @staticmethod
    def _load_single_audio(audio_input: Union[str, tuple["torch.Tensor", int]]):
        """
        Load audio from a file path or a (tensor, sample_rate) tuple.

        Args:
            audio_input: Either a file path string or a tuple of (tensor, sample_rate).

        Returns:
            Tuple of (audio_tensor, sample_rate).

        Raises:
            ValueError: If the audio input format is unsupported.
        """
        if isinstance(audio_input, tuple) and len(audio_input) == 2:
            return audio_input
        if isinstance(audio_input, str):
            try:
                return torchaudio.load(audio_input)
            except Exception:
                import soundfile as sf  # type: ignore

                data, sr = sf.read(audio_input, always_2d=True)
                data_t = torch.from_numpy(np.transpose(data))  # (C, T)
                return data_t, int(sr)
        raise ValueError(f"Unsupported audio input format: {type(audio_input)}")

    @staticmethod
    def _resample(audio: "torch.Tensor", sr: int, target_sr: int) -> tuple["torch.Tensor", int]:
        """
        Resample audio to the target sample rate and convert to mono if needed.

        Args:
            audio: Input audio tensor with shape (channels, time).
            sr: Current sample rate.
            target_sr: Target sample rate.

        Returns:
            Tuple of (resampled_audio, target_sr) where audio is mono with shape (1, time).
        """
        if sr != target_sr:
            audio = torchaudio.functional.resample(audio, sr, target_sr)
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)
        if audio.ndim == 1:
            audio = audio.unsqueeze(0)
        return audio, target_sr

    @classmethod
    def _load_audio_data(
        cls, audio_input: Union[str, tuple["torch.Tensor", int]], target_sample_rate: int
    ) -> tuple["torch.Tensor", int]:
        """
        Load and resample audio data to the target sample rate.

        Args:
            audio_input: Audio file path or tensor tuple.
            target_sample_rate: Target sample rate for resampling.

        Returns:
            Tuple of (audio_tensor, target_sample_rate).
        """
        audio, sr = cls._load_single_audio(audio_input)
        return cls._resample(audio, sr, target_sample_rate)

    @classmethod
    def _merge_speaker_audios(
        cls,
        wav1: Union[str, tuple["torch.Tensor", int]],
        wav2: Union[str, tuple["torch.Tensor", int]],
        target_sample_rate: int,
    ) -> "torch.Tensor":
        """
        Merge two speaker audio inputs by concatenation along the time axis.

        Args:
            wav1: Audio input for speaker 1.
            wav2: Audio input for speaker 2.
            target_sample_rate: Target sample rate for both audio inputs.

        Returns:
            Concatenated audio tensor.
        """
        a1, _ = cls._load_audio_data(wav1, target_sample_rate)
        a2, _ = cls._load_audio_data(wav2, target_sample_rate)
        return torch.cat([a1, a2], dim=1)

    @classmethod
    def _process_audio_data(
        cls, prompt_audio: Optional[Union[str, dict[str, Any], tuple["torch.Tensor", int]]], target_sample_rate: int
    ) -> Optional["torch.Tensor"]:
        """
        Process audio data from various input formats.

        Handles single audio files, multi-speaker audio dictionaries, or None input.

        Args:
            prompt_audio: Audio input in various formats (path, dict, tensor tuple, or None).
            target_sample_rate: Target sample rate for processing.

        Returns:
            Processed audio tensor, or None if no audio was provided.
        """
        if prompt_audio is None:
            return None
        if isinstance(prompt_audio, dict) and "speaker1" in prompt_audio and "speaker2" in prompt_audio:
            return cls._merge_speaker_audios(prompt_audio["speaker1"], prompt_audio["speaker2"], target_sample_rate)
        wav, _ = cls._load_audio_data(prompt_audio, target_sample_rate)
        return wav
```
```python
    def _build_inputs(
        self,
        text: str,
        audio_data: Optional["torch.Tensor"],
        apply_chat_template: Callable[[str, dict], str],
        silence_duration: float,
        **kwargs,
    ) -> np.ndarray:
        """
        Build the input grid from text and optional audio data.

        Creates a T×C grid where column 0 contains text tokens and columns 1..C-1 contain
        quantized audio codebook tokens. First-codebook audio tokens are mapped into the
        speech token range.

        Args:
            text: Input text string to process.
            audio_data: Optional audio tensor with shape (channels, time).
            apply_chat_template: Function to apply chat template formatting.
            silence_duration: Duration of silence to append for encoder segmentation.
            **kwargs: Additional arguments for the chat template.

        Returns:
            NumPy array with shape (T, max_channels) containing the input grid.
        """
        assert isinstance(text, str), "text must be a string"
        prompt = apply_chat_template(text, kwargs)

        text_ids = np.array(self.tokenizer.encode(prompt, add_special_tokens=False))
        grid = np.full((text_ids.shape[0], self.max_channels), self.audio_pad_token_id, dtype=np.int64)
        grid[:, 0] = text_ids

        if audio_data is not None:
            silence_samples = int(max(0.0, silence_duration) * self.input_sample_rate)
            silence = torch.zeros(audio_data.shape[0], silence_samples, device=audio_data.device)
            wav = torch.cat([audio_data, silence], dim=1)

            feat = self.feature_extractor(
                wav, sampling_rate=self.input_sample_rate, return_attention_mask=True, return_tensors="pt"
            )
            with torch.no_grad():
                enc = self.audio_tokenizer.encode(feat)
            # (time, codebooks)
            audio_codes = enc["audio_codes"][:, 0].permute(1, 0).cpu().numpy()
            # Map the first codebook into the speech token range
            audio_codes[:, 0] = audio_codes[:, 0] + self.speech_token_range[0]
            grid = np.concatenate([grid, audio_codes], axis=0)

            # Trim the appended-silence tokens at the end, based on encoder downsampling
            silence_tokens = silence_duration * self.input_sample_rate / self.encoder_downsample_rate
            cut = math.floor(silence_tokens / 10) * 10
            if cut > 0:
                grid = grid[:-cut]

        return grid

    @staticmethod
    def _shift_inputs(input_ids: np.ndarray, pad_token_id: int, max_channels: int) -> np.ndarray:
        """
        Convert a (T, C) grid to the time-shifted multi-channel layout (preserving the original
        implementation's logic).

        Creates a shifted layout where new_len = T + C - 1 and column j is delayed by j positions,
        so the model processes the codebook channels with a per-channel temporal offset.

        Args:
            input_ids: Input grid with shape (T, C).
            pad_token_id: Padding token ID for text tokens.
            max_channels: Maximum number of channels.

        Returns:
            Shifted array with shape (T + max_channels - 1, max_channels).
        """
        T, _ = input_ids.shape
        new_len = T + max_channels - 1
        shifted = np.full((new_len, max_channels), fill_value=1024, dtype=np.int64)  # 1024 = audio pad
        shifted[:, 0] = np.full(new_len, pad_token_id, dtype=np.int64)
        for j in range(max_channels):
            shifted[j : (T + j), j] = input_ids[:, j]
        return shifted
```
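The delay pattern produced by `_shift_inputs` is easiest to see on a tiny grid; a worked 3-channel example (values invented):

```python
import numpy as np

grid = np.array([[1, 10, 20],
                 [2, 11, 21],
                 [3, 12, 22]])                   # (T=3, C=3)
shifted = MossTTSDSampleProcessor._shift_inputs(grid, pad_token_id=0, max_channels=3)
print(shifted)
# column j starts j rows later; 1024 fills the empty audio slots,
# pad_token_id=0 fills the text column after the real tokens:
# [[   1 1024 1024]
#  [   2   10 1024]
#  [   3   11   20]
#  [   0   12   21]
#  [   0 1024   22]]
```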
```python
class MossTTSDProcessor(ProcessorMixin):
    r"""
    Constructs a MOSS-TTSD processor which wraps a tokenizer, a feature extractor, and an audio tokenizer into a
    single processor. It provides unified text-speech processing capabilities while maintaining backward
    compatibility with previous API versions.

    [`MossTTSDProcessor`] offers all the functionalities of [`AutoTokenizer`], [`AutoFeatureExtractor`] and
    [`XYTokenizer`]. See [`~MossTTSDProcessor.__call__`] and [`~MossTTSDProcessor.decode`] for more information.

    Args:
        tokenizer ([`AutoTokenizer`]):
            An instance of [`AutoTokenizer`]. The tokenizer is a required input.
        feature_extractor ([`AutoFeatureExtractor`]):
            An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
        audio_tokenizer ([`XYTokenizer`]):
            An instance of [`XYTokenizer`]. The audio tokenizer is a required input.
        chat_template (`str`, *optional*):
            A template string for chat formatting when combining text and audio interactions.
        speech_token_range (`List[int]`, *optional*, defaults to `[151665, 152689]`):
            Token range [start, end] for mapping speech tokens.
        audio_bos_token (`str`, *optional*, defaults to `"<|begin_of_speech|>"`):
            Beginning-of-speech token string.
        audio_eos_token (`str`, *optional*, defaults to `"<|end_of_speech|>"`):
            End-of-speech token string.
        audio_pad_token_id (`int`, *optional*, defaults to `1024`):
            Padding token ID for audio channels.
    """

    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"
    audio_tokenizer_class = "PreTrainedModel"

    def __init__(
        self,
        tokenizer,
        feature_extractor,
        audio_tokenizer,
        chat_template: Optional[str] = None,
        speech_token_range: Optional[list[int]] = None,
        audio_bos_token: str = "<|begin_of_speech|>",
        audio_eos_token: str = "<|end_of_speech|>",
        audio_pad_token_id: int = 1024,
        **kwargs,
    ) -> None:
        super().__init__(tokenizer=tokenizer, feature_extractor=feature_extractor, audio_tokenizer=audio_tokenizer, **kwargs)

        self.max_channels = (audio_tokenizer.quantizer.num_quantizers if audio_tokenizer else None) or 8
        self.input_sample_rate = (getattr(audio_tokenizer, "config", None).input_sample_rate if audio_tokenizer else None) or 16000
        self.output_sample_rate = (getattr(audio_tokenizer, "config", None).output_sample_rate if audio_tokenizer else None) or 16000
        self.encoder_downsample_rate = (getattr(audio_tokenizer, "config", None).encoder_downsample_rate if audio_tokenizer else None) or 320

        # Use the tokenizer's built-in chat template as the primary one
        self.chat_template = getattr(tokenizer, "chat_template", None) or chat_template

        # Read the speech token range from the tokenizer, with a fallback
        self.speech_token_range = (
            getattr(tokenizer, "speech_token_range", None) or speech_token_range or [151665, 152689]
        )
        self.audio_bos_token = getattr(tokenizer, "audio_bos_token", None) or audio_bos_token
        self.audio_eos_token = getattr(tokenizer, "audio_eos_token", None) or audio_eos_token
        self.audio_pad_token_id = getattr(tokenizer, "audio_pad_token_id", None) or audio_pad_token_id

        # Sample-level processor
        self.sample_processor = MossTTSDSampleProcessor(
            tokenizer=self.tokenizer,
            feature_extractor=self.feature_extractor,
            audio_tokenizer=self.audio_tokenizer,
            chat_template=self.chat_template,
            speech_token_range=self.speech_token_range,
            audio_bos_token=self.audio_bos_token,
            audio_eos_token=self.audio_eos_token,
            audio_pad_token_id=self.audio_pad_token_id,
            max_channels=self.max_channels,
            input_sample_rate=self.input_sample_rate,
            encoder_downsample_rate=self.encoder_downsample_rate,
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], trust_remote_code=True, **kwargs):
        """
        Instantiate a processor from a pretrained model.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name of or path to the pretrained model.
            **kwargs:
                Additional keyword arguments passed to the respective component loaders.

        Returns:
            [`MossTTSDProcessor`]: A new instance of the processor.
        """
        kwargs.pop("_from_auto", None)  # Absent when called directly rather than via AutoProcessor
        audio_tokenizer_path = kwargs.pop("codec_path", os.path.join(pretrained_model_name_or_path, "XY_Tokenizer"))
        assert isinstance(audio_tokenizer_path, str), f"Unsupported audio_tokenizer_path input format: {type(audio_tokenizer_path)}"

        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        feature_extractor = AutoFeatureExtractor.from_pretrained(audio_tokenizer_path, trust_remote_code=trust_remote_code, **kwargs)
        audio_tokenizer = AutoModel.from_pretrained(audio_tokenizer_path, trust_remote_code=trust_remote_code, **kwargs)

        return cls(
            tokenizer=tokenizer,
            feature_extractor=feature_extractor,
            audio_tokenizer=audio_tokenizer,
            **kwargs,
        )

    @classmethod
    def get_processor_dict(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        proc_dict, rest = super().get_processor_dict(pretrained_model_name_or_path, **kwargs)
        if "audio_tokenizer" in rest:
            proc_dict["audio_tokenizer"] = rest.pop("audio_tokenizer")
        for key in ("speech_token_range", "audio_bos_token", "audio_eos_token", "audio_pad_token_id"):
            if key in rest:
                proc_dict[key] = rest.pop(key)
        return proc_dict, rest
```
```python
    def __call__(
        self,
        data: Union[dict[str, Any], list[dict[str, Any]]],
        **kwargs: Unpack[MossTTSDProcessorKwargs],
    ) -> BatchEncoding:
        """
        Main method to prepare model inputs from structured data.

        This method forwards the `data` and `kwargs` arguments to prepare inputs for the MOSS-TTSD model. Please
        refer to the docstrings of the respective methods for more information.

        Args:
            data (`dict` or `list[dict]`):
                Single dictionary or list of dictionaries containing input data. Expected keys include 'text',
                'prompt_text', 'prompt_audio', etc.
            **kwargs (`MossTTSDProcessorKwargs`):
                Additional processing arguments.

        Returns:
            [`BatchEncoding`]: Processed inputs ready for model consumption.
        """
        if isinstance(data, dict):
            data = [data]

        out_kwargs = self._merge_kwargs(MossTTSDProcessorKwargs, **kwargs)
        text_kwargs = out_kwargs["text_kwargs"]
        audio_kwargs = out_kwargs["audio_kwargs"]
        common_kwargs = out_kwargs["common_kwargs"]

        return_tensors = common_kwargs.get("return_tensors", "pt")
        padding = common_kwargs.get("padding", True)
        use_normalize = common_kwargs.get("use_normalize", False)

        pad_token_id = int(text_kwargs.get("pad_token_id", self.tokenizer.pad_token_id or 0))
        max_channels = int(audio_kwargs.get("max_channels", self.max_channels))
        audio_pad_token_id = int(audio_kwargs.get("audio_pad_token_id", self.audio_pad_token_id))
        silence_duration = float(audio_kwargs.get("silence_duration", 0.0))

        def _apply_chat_template(text: str, extra: dict) -> str:
            return self.apply_chat_template(conversation=None, text=text, **extra)

        samples: list[MossTTSDChatSample] = []
        for item in data:
            sample = self.sample_processor.prepare_sample(
                item,
                apply_chat_template=_apply_chat_template,
                use_normalize=use_normalize,
                silence_duration=silence_duration,
            )
            # Override with the call-time max_channels (may differ from component initialization)
            if sample.input_ids_2d.shape[1] != max_channels:
                # Simplified: when clipping/extending channels, only pad/clip on the right side
                T, C = sample.input_ids_2d.shape
                if C > max_channels:
                    sample.input_ids_2d = sample.input_ids_2d[:, :max_channels]
                else:
                    pad = torch.full((T, max_channels - C), audio_pad_token_id, dtype=torch.long)
                    sample.input_ids_2d = torch.cat([sample.input_ids_2d, pad], dim=1)
            samples.append(sample)

        if not padding:
            raise NotImplementedError("Unpadded batches are not supported yet.")

        batch = self.sample_processor.collate(
            samples,
            pad_token_id=pad_token_id,
            audio_pad_token_id=audio_pad_token_id,
        )
        # Align with HiggsAudioProcessor: explicit dict -> BatchEncoding/Feature
        inputs = asdict(batch)
        inputs = {k: v for k, v in inputs.items() if v is not None}
        return BatchEncoding(inputs, tensor_type=return_tensors)
```
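Because `__call__` routes kwargs through `_merge_kwargs`, per-call overrides of the `_defaults` should be possible; a hedged sketch, assuming Transformers' standard flat-kwarg routing and a loaded `processor`:

```python
# Hypothetical per-call overrides; the keys come from MossTTSDProcessorKwargs._defaults
inputs = processor(
    data,
    use_normalize=True,        # common_kwargs: normalize the script before tokenization
    silence_duration=0.5,      # audio_kwargs: append 0.5 s of silence to the prompt audio
)
```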
```python
    def shifting_outputs(
        self,
        output_ids: "torch.Tensor",
        speech_token_range: list[int],
        max_channels: int = 8,
    ) -> "torch.Tensor":
        """
        Restore the time-shifted layout to a per-timestep C-channel arrangement and reverse-offset the first codebook.

        Converts the time-shifted multi-channel output back to the standard (batch, time, channels) format
        and maps first-codebook tokens back to their original space by subtracting the speech token offset.

        Args:
            output_ids: Time-shifted output tensor.
            speech_token_range: Speech token range for the reverse mapping.
            max_channels: Number of codebook channels.

        Returns:
            Restored tensor with shape (batch, seq_len, max_channels).
        """
        seq_len = output_ids.shape[1] - max_channels + 1
        speech_ids = torch.full(
            (output_ids.shape[0], seq_len, max_channels), 0, dtype=output_ids.dtype, device=output_ids.device
        )
        for j in range(max_channels):
            speech_ids[..., j] = output_ids[:, j : seq_len + j, j]
            if j == 0:
                speech_ids[..., j] = speech_ids[..., j] - speech_token_range[0]
        return speech_ids

    def _find_max_valid_positions(self, data: "torch.Tensor", invalid_value: int = 1024):
        """
        Locate contiguous valid audio segments in each sequence (all non-text channels valid simultaneously).

        Identifies contiguous spans where all audio channels (columns 1+) contain valid tokens,
        i.e. anything other than the invalid_value padding token.

        Args:
            data: Input tensor with shape (batch, time, channels).
            invalid_value: Token ID treated as invalid/padding.

        Returns:
            List of lists containing the valid audio segments for each sequence in the batch.
        """
        mask = torch.all(data[:, :, 1:] != invalid_value, dim=2)
        valid_indices = torch.where(mask)
        result = [[] for _ in range(len(data))]
        if valid_indices[0].numel() == 0:
            return result
        grouped = []
        group_ids = []
        for i, seq_no in enumerate(valid_indices[0]):
            pos = valid_indices[1][i]
            if not group_ids or seq_no > group_ids[-1]:
                group_ids.append(seq_no)
                grouped.append([[pos, pos + 1]])
            elif pos == grouped[-1][-1][-1]:
                grouped[-1][-1][-1] += 1
            else:
                grouped[-1].append([pos, pos + 1])
        for gid, spans in zip(group_ids, grouped):
            for s, e in spans:
                result[gid].append(data[gid, s:e, :])
        return result
```
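`shifting_outputs` undoes the `_shift_inputs` delay pattern; a hypothetical round trip, assuming `processor` is a loaded `MossTTSDProcessor` and using a zero offset so channel 0 passes through unchanged:

```python
import numpy as np
import torch

grid = np.arange(12).reshape(4, 3)                       # (T=4, C=3) toy grid
shifted = MossTTSDSampleProcessor._shift_inputs(grid, pad_token_id=0, max_channels=3)
restored = processor.shifting_outputs(
    torch.tensor(shifted).unsqueeze(0), speech_token_range=[0, 1], max_channels=3
)
assert torch.equal(restored[0], torch.tensor(grid))      # delay pattern undone exactly
```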
```python
    def batch_decode(self, token_ids: "torch.Tensor", *args, **kwargs):
        """
        Decode a batch of token sequences into text and audio outputs.

        This method forwards the `token_ids` and `kwargs` arguments to decode text and audio outputs from the
        model. Please refer to the docstrings of the respective methods for more information.

        Args:
            token_ids (`torch.Tensor`):
                Token tensor with shape (batch, time, channels).
            *args:
                Additional arguments passed to tokenizer.batch_decode.
            **kwargs:
                Additional keyword arguments passed to tokenizer.batch_decode.

        Returns:
            `tuple`: Tuple of (text_list, audio_list) where text_list contains the decoded text strings and
            audio_list contains the decoded audio arrays for each sequence.
        """
        assert token_ids.ndim == 3 and token_ids.shape[2] == self.max_channels
        text = self.tokenizer.batch_decode(token_ids[:, :, 0], *args, **kwargs)
        normal = self.shifting_outputs(token_ids, self.speech_token_range, self.max_channels)
        audio_frags = self._find_max_valid_positions(normal, self.audio_pad_token_id)
        decode_audio = []
        for seq_frags in audio_frags:
            if len(seq_frags):
                frag = torch.cat([f.permute(1, 0).unsqueeze(1) for f in seq_frags], dim=1)
                decode_audio.append(self.audio_tokenizer.decode(frag, overlap_seconds=10)["audio_values"])
            else:
                decode_audio.append([])
        return text, decode_audio

    def decode(self, token_ids: "torch.Tensor", *args, **kwargs) -> MossTTSDResponse:
        """
        Decode a single sequence of token IDs into text and audio.

        This method forwards the `token_ids` and `kwargs` arguments to decode a single sequence. Please refer to
        the docstrings of the respective methods for more information.

        Args:
            token_ids (`torch.Tensor`):
                Token tensor with shape (time, channels).
            *args:
                Additional arguments passed to tokenizer.decode.
            **kwargs:
                Additional keyword arguments passed to tokenizer.decode.

        Returns:
            [`MossTTSDResponse`]: Response object containing the generated text, audio, and sampling rate.
        """
        assert token_ids.ndim == 2 and token_ids.shape[1] == self.max_channels
        text = self.tokenizer.decode(token_ids[:, 0].squeeze(-1), *args, **kwargs)
        normal = self.shifting_outputs(token_ids.unsqueeze(0), self.speech_token_range, self.max_channels)
        audio_frags = self._find_max_valid_positions(normal, self.audio_pad_token_id)[0]
        if len(audio_frags):
            frag = torch.cat([f.permute(1, 0).unsqueeze(1) for f in audio_frags], dim=1)
            audio = self.audio_tokenizer.decode(frag, overlap_seconds=10)["audio_values"]
        else:
            audio = None
        return MossTTSDResponse(
            audio=None if audio is None else audio.detach().cpu().numpy(),
            generated_text=text,
            sampling_rate=self.output_sample_rate,
        )

    def save_audio(self, audios, output_dir="output", prefix="audio"):
        """
        Save multiple audio fragments to files.

        Args:
            audios: List of audio data fragments from batch_decode.
            output_dir (str): Directory in which to save the audio files.
            prefix (str): Prefix for the audio filenames.
        """
        if not is_torchaudio_available():
            raise ImportError("Please install `torchaudio` to save audio files.")

        os.makedirs(output_dir, exist_ok=True)

        for i, data in enumerate(audios):
            for j, fragment in enumerate(data):
                filename = f"{output_dir}/{prefix}_{i}_{j}.wav"
                torchaudio.save(filename, fragment.cpu(), self.output_sample_rate)


__all__ = ["MossTTSDProcessor"]
```
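Given the tuple returned by `batch_decode`, the `save_audio` helper replaces a manual save loop; a short sketch assuming `processor` and `token_ids` come from a prior generation:

```python
_, audios = processor.batch_decode(token_ids)          # audios: list[list[Tensor]]
processor.save_audio(audios, output_dir="outputs", prefix="dialogue")
# writes outputs/dialogue_<seq>_<fragment>.wav at processor.output_sample_rate
```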
processor_config.json
ADDED
@@ -0,0 +1,6 @@
```json
{
  "processor_class": "processing_moss_ttsd.MossTTSDProcessor",
  "auto_map": {
    "AutoProcessor": "processing_moss_ttsd.MossTTSDProcessor"
  }
}
```
tokenizer_config.json
CHANGED
@@ -8451,12 +8451,20 @@
```diff
     "<|video_pad|>"
   ],
   "bos_token": null,
-  "chat_template": "
+  "chat_template": "<|begin_of_style|>{{ system_prompt | default('You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text.') }}<|end_of_style|>\n<|begin_of_text|>{{ text }}<|end_of_text|>\n<|begin_of_speech|>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "errors": "replace",
   "extra_special_tokens": {},
-  "model_max_length": 
+  "model_max_length": 16384,
+  "processor_class": "processing_moss_ttsd.MossTTSDProcessor",
+  "speech_token_range": [
+    151665,
+    152689
+  ],
+  "audio_bos_token": "<|begin_of_speech|>",
+  "audio_eos_token": "<|end_of_speech|>",
+  "audio_pad_token_id": 1024,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
   "split_special_tokens": false,
```
(The removed values of `chat_template` and `model_max_length` are truncated in this view.)