DragonLineageAI
/

Vi-SparkTTS-0.5B

@@ -24,6 +24,7 @@ from typing import Dict, Any, Tuple, Optional, Union
 from transformers import PreTrainedModel, AutoModelForCausalLM, Wav2Vec2FeatureExtractor, Wav2Vec2Model
 from transformers.utils import logging, requires_backends, cached_file
 from transformers.generation.utils import GenerationMixin
 from transformers.configuration_utils import PretrainedConfig
 from safetensors.torch import load_file
@@ -3039,7 +3040,7 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
     """
     config_class = SparkTTSConfig
     base_model_prefix = "spark_tts"
-    _supports_load_fast = False
     def __init__(self, config: SparkTTSConfig, llm=None, wav2vec2_model=None, wav2vec2_processor=None, bicodec=None):
         super().__init__(config)
@@ -3049,9 +3050,8 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
         self.wav2vec2_processor = wav2vec2_processor
         self.bicodec = bicodec
-        # Wav2Vec2 specific config adjustment (needs to happen after loading)
-        if self.wav2vec2_model and hasattr(self.wav2vec2_model.config, 'output_hidden_states'):
-            self.wav2vec2_model.config.output_hidden_states = True
     @classmethod
@@ -3066,124 +3066,182 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
         local_files_only: bool = False,
         token: Optional[Union[str, bool]] = None,
         revision: str = "main",
-        use_safetensors: bool = None,
         **kwargs,
     ):
-        # 1. Load Config
-        if config is None:
             config, model_kwargs = cls.config_class.from_pretrained(
-                pretrained_model_name_or_path,
-                *model_args,
-                cache_dir=cache_dir,
                 force_download=force_download,
                 local_files_only=local_files_only,
                 token=token,
                 revision=revision,
                 return_unused_kwargs=True,
-                **kwargs,
             )
         else:
-             model_kwargs = kwargs
-        # Pop device map info - will handle placement later
-        device_map = model_kwargs.pop("device_map", None)
-        torch_dtype = model_kwargs.pop("torch_dtype", "auto") # Use config's or auto
-        # Check for trust_remote_code - needed for config loading if custom code involved there too
-        trust_remote_code = model_kwargs.pop("trust_remote_code", False) # Important
-        # NEW IMPROVED PATH RESOLUTION
-        from huggingface_hub import snapshot_download
-        import os
-        # Check if it's a local path first
-        if os.path.isdir(pretrained_model_name_or_path):
-            resolved_model_path = Path(pretrained_model_name_or_path)
-        else:
-            # Try to get from Hugging Face Hub
-            try:
-                logger.info(f"Downloading/locating model from Hugging Face Hub: {pretrained_model_name_or_path}")
-                # This will download the model if needed and return the cached path
-                resolved_model_path = Path(snapshot_download(
-                    pretrained_model_name_or_path,
-                    revision=revision,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    local_files_only=local_files_only,
-                    token=token,
-                ))
-            except Exception as e:
-                logger.error(f"Error downloading model: {e}")
-                raise EnvironmentError(f"Failed to find or download model '{pretrained_model_name_or_path}': {e}")
-        if not resolved_model_path.is_dir():
-            raise EnvironmentError(f"Cannot find model directory at {resolved_model_path}")
-        # Helper function to resolve paths relative to the main model directory
-        def _resolve_path(sub_path):
             p = Path(sub_path)
             if p.is_absolute():
                 return str(p)
             else:
-                 # Resolve relative to the potentially cached main model path
-                 return str(resolved_model_path / p)
         # --- Load LLM ---
-        llm_path = _resolve_path(config.llm_model_name_or_path)
         logger.info(f"Loading LLM from resolved path: {llm_path}")
         try:
             llm = AutoModelForCausalLM.from_pretrained(
                 llm_path,
-                torch_dtype=torch_dtype if torch_dtype != "auto" else config.torch_dtype, # Prioritize explicit dtype
                 trust_remote_code=trust_remote_code, # Pass down trust_remote_code
-                **model_kwargs # Pass remaining kwargs
             )
         except Exception as e:
             raise OSError(f"Failed to load LLM from {llm_path}: {e}")
         # --- Load Wav2Vec2 ---
-        w2v_path = _resolve_path(config.wav2vec2_model_name_or_path)
-        logger.info(f"Loading Wav2Vec2 from resolved path: {w2v_path}")
         try:
-            wav2vec2_processor = Wav2Vec2FeatureExtractor.from_pretrained(w2v_path, trust_remote_code=trust_remote_code)
-            wav2vec2_model = Wav2Vec2Model.from_pretrained(w2v_path, trust_remote_code=trust_remote_code)
         except Exception as e:
             raise OSError(f"Failed to load Wav2Vec2 components from {w2v_path}: {e}")
         # --- Load BiCodec ---
-        bicodec_path = _resolve_path(config.bicodec_model_name_or_path)
         logger.info(f"Loading BiCodec from resolved path: {bicodec_path}")
-        # print(f"Loading BiCodec from resolved path: {bicodec_path}, {config}")
         if not config.bicodec_config or "audio_tokenizer" not in config.bicodec_config:
              raise ValueError("BiCodec configuration ('bicodec_config' with 'audio_tokenizer' key) not found in SparkTTSConfig.")
         try:
-            # Assuming BiCodec class is defined above in this file
             bicodec = BiCodec.load_from_config_and_checkpoint(
                  model_dir=Path(bicodec_path),
                  config_dict=config.bicodec_config["audio_tokenizer"]
             )
         except Exception as e:
-             raise OSError(f"Failed to load BiCodec from {bicodec_path}: {e}")
-        # Instantiate the main model wrapper, passing the loaded components
         model = cls(config, llm=llm, wav2vec2_model=wav2vec2_model, wav2vec2_processor=wav2vec2_processor, bicodec=bicodec)
-        # --- Handle device placement ---
-        # Note: device_map is complex; simple .to(device) is easier if not using accelerate
-        # Determine target device
         if torch.cuda.is_available():
-             current_device = torch.cuda.current_device()
-             device = torch.device(f"cuda:{current_device}")
         else:
-             device = torch.device("cpu")
-        logger.info(f"Placing SparkTTSModel and components on device: {device}")
-        model.to(device) # This should move all registered nn.Module attributes
         return model
     # --- Embedding getters/setters (delegate to LLM if loaded) ---
     def get_input_embeddings(self):
         if self.llm:
@@ -3212,11 +3270,13 @@ class SparkTTSModel(PreTrainedModel, GenerationMixin):
     # post_init is less critical now as loading happens in from_pretrained,
     # but can be used for final checks or setup.
     def post_init(self):
-        # Ensure wav2vec2 config has output_hidden_states=True
         if self.wav2vec2_model and hasattr(self.wav2vec2_model.config, 'output_hidden_states'):
-             if not self.wav2vec2_model.config.output_hidden_states:
-                  self.wav2vec2_model.config.output_hidden_states = True
-                  logger.info("Set wav2vec2_model.config.output_hidden_states=True")
     @property
     def device(self) -> torch.device:

 from transformers import PreTrainedModel, AutoModelForCausalLM, Wav2Vec2FeatureExtractor, Wav2Vec2Model
 from transformers.utils import logging, requires_backends, cached_file
+from huggingface_hub import snapshot_download
 from transformers.generation.utils import GenerationMixin
 from transformers.configuration_utils import PretrainedConfig
 from safetensors.torch import load_file
     """
     config_class = SparkTTSConfig
     base_model_prefix = "spark_tts"
+    _supports_load_fast = True
     def __init__(self, config: SparkTTSConfig, llm=None, wav2vec2_model=None, wav2vec2_processor=None, bicodec=None):
         super().__init__(config)
         self.wav2vec2_processor = wav2vec2_processor
         self.bicodec = bicodec
+        # Ensure wav2vec2 config has output_hidden_states=True after loading
+        self.post_init()
     @classmethod
         local_files_only: bool = False,
         token: Optional[Union[str, bool]] = None,
         revision: str = "main",
+        use_safetensors: Optional[bool] = None, # Keep None to let transformers decide
         **kwargs,
     ):
+        # Pop device map and dtype early - handle placement later
+        # Note: device_map is complex with multiple components. Manual .to(device) is simpler here.
+        device_map = kwargs.pop("device_map", None)
+        if device_map:
+            logger.warning("`device_map` is not directly supported for this composite model. Use .to(device) after loading.")
+        torch_dtype = kwargs.pop("torch_dtype", "auto") # Can be "auto", float32, float16, bfloat16
+        trust_remote_code = kwargs.pop("trust_remote_code", False) # CRITICAL for custom code
+        # --- 1. Resolve the main model directory ---
+        # This handles downloading from Hub or using a local path robustly.
+        if pretrained_model_name_or_path is None:
+            raise ValueError("`pretrained_model_name_or_path` must be provided.")
+        model_path = Path(pretrained_model_name_or_path)
+        if not model_path.is_dir():
+            # If it's not a local directory, assume it's a Hub ID and download everything
+            logger.info(f"{pretrained_model_name_or_path} is not a local directory. Assuming Hub ID and downloading.")
+            try:
+                resolved_model_path = snapshot_download(
+                    repo_id=str(pretrained_model_name_or_path),
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    allow_patterns=["*.json", "*.safetensors", "*.bin", "*.yaml", "*.txt", "README.md"], # Be somewhat permissive
+                    # ignore_patterns=["*.git*"], # Optional: ignore git files
+                    # user_agent={"agent": "spark-tts-custom-loader"}, # Optional
+                )
+                resolved_model_path = Path(resolved_model_path)
+                logger.info(f"Model downloaded to cache: {resolved_model_path}")
+            except Exception as e:
+                raise OSError(
+                    f"Failed to download model '{pretrained_model_name_or_path}' from Hugging Face Hub. "
+                    f"Ensure the ID is correct and network is available. Error: {e}"
+                )
+        else:
+            # It's a local directory path
+            resolved_model_path = model_path
+            logger.info(f"Loading model from local directory: {resolved_model_path}")
+        if not resolved_model_path.is_dir():
+             # This should ideally not happen after snapshot_download or initial check
+             raise EnvironmentError(f"Cannot find resolved model directory at {resolved_model_path}")
+        # --- 2. Load the main configuration ---
+        # The config might have been passed explicitly, otherwise load from resolved path
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else resolved_model_path
             config, model_kwargs = cls.config_class.from_pretrained(
+                config_path, # Load from the resolved directory or explicit config path
+                *model_args, # Pass *model_args here if they influence config loading
+                cache_dir=cache_dir, # Pass relevant args down
                 force_download=force_download,
                 local_files_only=local_files_only,
                 token=token,
                 revision=revision,
+                trust_remote_code=trust_remote_code, # Needed if config class itself is remote
                 return_unused_kwargs=True,
+                **kwargs, # Pass remaining kwargs
             )
+            # Update kwargs with unused ones from config loading
+            kwargs.update(model_kwargs)
         else:
+             # Config object was passed directly
+             pass # kwargs remain as they were
+        # --- Determine torch_dtype (use config value if specified and not overridden) ---
+        # Priority: Explicit torch_dtype arg > config.torch_dtype > "auto" (default)
+        final_torch_dtype = torch_dtype # Explicit arg has highest prio
+        if final_torch_dtype == "auto":
+            final_torch_dtype = getattr(config, "torch_dtype", None) # Use config value if present
+        # final_torch_dtype can still be None or "auto" here, handle downstream
+        # --- Helper function to resolve paths relative to the main model directory ---
+        def _resolve_sub_path(sub_path):
             p = Path(sub_path)
             if p.is_absolute():
                 return str(p)
             else:
+                # Resolve relative to the potentially cached main model path
+                return str(resolved_model_path / p)
+        # --- 3. Load Sub-components ---
         # --- Load LLM ---
+        llm_path = _resolve_sub_path(config.llm_model_name_or_path)
         logger.info(f"Loading LLM from resolved path: {llm_path}")
         try:
             llm = AutoModelForCausalLM.from_pretrained(
                 llm_path,
+                torch_dtype=final_torch_dtype if final_torch_dtype != "auto" else None, # Pass resolved dtype or None
                 trust_remote_code=trust_remote_code, # Pass down trust_remote_code
+                # Pass remaining kwargs that might be relevant for AutoModelForCausalLM
+                # Filter kwargs if necessary, but often passing them is fine
+                **kwargs
             )
         except Exception as e:
             raise OSError(f"Failed to load LLM from {llm_path}: {e}")
         # --- Load Wav2Vec2 ---
+        w2v_path = _resolve_sub_path(config.wav2vec2_model_name_or_path)
+        logger.info(f"Loading Wav2Vec2 components from resolved path: {w2v_path}")
         try:
+            # Load feature extractor first
+            wav2vec2_processor = Wav2Vec2FeatureExtractor.from_pretrained(
+                w2v_path,
+                trust_remote_code=trust_remote_code,
+                # Add any relevant kwargs for feature extractor if needed
+            )
+            # Load model
+            wav2vec2_model = Wav2Vec2Model.from_pretrained(
+                w2v_path,
+                trust_remote_code=trust_remote_code,
+                # Add any relevant kwargs for model if needed (e.g., add_adapter=False)
+            )
         except Exception as e:
             raise OSError(f"Failed to load Wav2Vec2 components from {w2v_path}: {e}")
         # --- Load BiCodec ---
+        bicodec_path = _resolve_sub_path(config.bicodec_model_name_or_path)
         logger.info(f"Loading BiCodec from resolved path: {bicodec_path}")
         if not config.bicodec_config or "audio_tokenizer" not in config.bicodec_config:
              raise ValueError("BiCodec configuration ('bicodec_config' with 'audio_tokenizer' key) not found in SparkTTSConfig.")
         try:
+            # Assuming BiCodec class has the custom loading method
+            # Make sure BiCodec class is imported or defined above
             bicodec = BiCodec.load_from_config_and_checkpoint(
                  model_dir=Path(bicodec_path),
                  config_dict=config.bicodec_config["audio_tokenizer"]
             )
+             # Ensure BiCodec is an nn.Module if you want .to(device) to work easily
+            if not isinstance(bicodec, torch.nn.Module):
+                 logger.warning("Loaded BiCodec component is not an instance of torch.nn.Module. Automatic device placement might not work.")
+        except FileNotFoundError as e:
+             raise OSError(f"Failed to load BiCodec: A required file was not found in {bicodec_path}. Original error: {e}")
         except Exception as e:
+             logger.error(f"Raw error loading BiCodec: {type(e).__name__}: {e}")
+             import traceback
+             traceback.print_exc() # Print full traceback for debugging BiCodec loading
+             raise OSError(f"Failed to load BiCodec from {bicodec_path}. Check BiCodec implementation and file paths. Error: {e}")
+        # --- 4. Instantiate the main model wrapper ---
+        # Pass the loaded config and components
         model = cls(config, llm=llm, wav2vec2_model=wav2vec2_model, wav2vec2_processor=wav2vec2_processor, bicodec=bicodec)
+        # --- 5. Handle device placement ---
+        # Move the entire model (including sub-modules if they are nn.Module attributes)
+        # Determine target device based on availability
         if torch.cuda.is_available():
+             final_device = torch.device("cuda")
+             # If multiple GPUs, could select one, e.g., torch.device("cuda:0")
+             # Or rely on CUDA_VISIBLE_DEVICES environment variable
         else:
+             final_device = torch.device("cpu")
+        logger.info(f"Placing SparkTTSModel and components on device: {final_device}")
+        # This should move all registered nn.Module attributes (llm, wav2vec2_model, bicodec if it's an nn.Module)
+        try:
+             model.to(final_device)
+        except Exception as e:
+             logger.error(f"Failed to move model to device {final_device}. Error: {e}")
+             logger.warning("Device placement might be incomplete. Check component types.")
+        # --- 6. Return the loaded and prepared model ---
         return model
     # --- Embedding getters/setters (delegate to LLM if loaded) ---
     def get_input_embeddings(self):
         if self.llm:
     # post_init is less critical now as loading happens in from_pretrained,
     # but can be used for final checks or setup.
     def post_init(self):
         """Make sure the Wav2Vec2 sub-model is configured to emit hidden states.

         Looks at ``self.wav2vec2_model.config.output_hidden_states`` and flips
         it to ``True`` when it is falsy, logging the change. When no Wav2Vec2
         model is attached, or its config has no such attribute, only a
         warning is logged and nothing is modified.

         Returns:
             None. The Wav2Vec2 config is mutated in place.
         """
         encoder = self.wav2vec2_model

         # Guard clause: nothing to adjust without a model whose config
         # exposes the flag.
         if not (encoder and hasattr(encoder.config, 'output_hidden_states')):
             logger.warning("Could not access wav2vec2_model.config to ensure output_hidden_states=True.")
             return

         # Flag already enabled: leave the config untouched and stay silent.
         if encoder.config.output_hidden_states:
             return

         encoder.config.output_hidden_states = True
         logger.info("Set wav2vec2_model.config.output_hidden_states=True")
     @property
     def device(self) -> torch.device: