bytedance-research
/

ChatTS-14B

@@ -33,6 +33,7 @@ def sp_encoding(timeseries: np.ndarray, eots_token: bool = True) -> Tuple[np.nda
         prompt (str): The placeholder string with offset and scaling info.
         metadata (dict): Metadata containing the offset and scaling factor.
     """
     mean = np.mean(timeseries)
     scaled_timeseries = timeseries - mean
     scale_factor = 1.0
@@ -57,12 +58,12 @@ class Qwen2TSProcessor(ProcessorMixin):
     feature_extractor_class = None  # You can add a feature extractor if needed
     tokenizer_class = "AutoTokenizer"
-    def __init__(self, tokenizer=None):
         """
         Args:
             tokenizer: An optional tokenizer to process text prompts.
         """
-        super().__init__(tokenizer=tokenizer)
     def __call__(
         self,
@@ -71,6 +72,7 @@ class Qwen2TSProcessor(ProcessorMixin):
         padding: Union[bool, str, PaddingStrategy] = False,
         padding_side: str = 'left',
         vllm_flag: bool = False,
         **kwargs,
     ) -> BatchFeature:
         """
@@ -150,8 +152,10 @@ class Qwen2TSProcessor(ProcessorMixin):
         # Tokenize the processed prompt
         tokenizer_outputs = {}
-        if self.tokenizer is not None:
             tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
         # Create the final output
         outputs = tokenizer_outputs
@@ -162,6 +166,35 @@ class Qwen2TSProcessor(ProcessorMixin):
         return BatchFeature(data=outputs)
     @property
     def model_input_names(self):
         """
@@ -184,4 +217,4 @@ class Qwen2TSProcessor(ProcessorMixin):
         This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
         the docstring of this method for more information.
         """
-        return self.tokenizer.decode(*args, **kwargs)

         prompt (str): The placeholder string with offset and scaling info.
         metadata (dict): Metadata containing the offset and scaling factor.
     """
+    timeseries = np.array(timeseries)
     mean = np.mean(timeseries)
     scaled_timeseries = timeseries - mean
     scale_factor = 1.0
     feature_extractor_class = None  # You can add a feature extractor if needed
     tokenizer_class = "AutoTokenizer"
+    def __init__(self, tokenizer=None, **kwargs):
         """
         Args:
             tokenizer: An optional tokenizer to process text prompts.
+            **kwargs: Extra keyword arguments, forwarded unchanged to the parent class initializer.
         """
+        super().__init__(tokenizer=tokenizer, **kwargs)
     def __call__(
         self,
         padding: Union[bool, str, PaddingStrategy] = False,
         padding_side: str = 'left',
         vllm_flag: bool = False,
+        tokenize: bool = True,
         **kwargs,
     ) -> BatchFeature:
         """
         # Tokenize the processed prompt
         tokenizer_outputs = {}
+        if tokenize and self.tokenizer is not None:
             tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
+        else:
+            tokenizer_outputs = {"text": reconstructed_prompts}
         # Create the final output
         outputs = tokenizer_outputs
         return BatchFeature(data=outputs)
+    def encode_timeseries(
+        self,
+        timeseries: Optional[List[List[np.ndarray]]] = None,
+    ) -> Optional["torch.Tensor"]:
+        """Encode time series with `sp_encoding` and stack them into one padded tensor.
+
+        Each item is passed through `sp_encoding`; only the first element of the
+        returned triple (the encoded array) is kept. The encoded arrays are
+        right-padded with zeros along the sequence axis to a common length and
+        concatenated along a new leading batch axis.
+
+        Args:
+            timeseries: The time series to encode. `None` is treated like an
+                empty list.
+
+        Returns:
+            A half-precision torch tensor of shape
+            [batch_size, max_length, feature_dim], or `None` when there is
+            nothing to encode.
+        """
+        if timeseries is None:
+            return None
+        # Add a leading axis so every item is [1, seq_len, feature_dim]; the
+        # three-axis pad widths below rely on each encoded array being 2-D.
+        encoded_ts_arrays = [sp_encoding(ts)[0][None, ...] for ts in timeseries]
+        if not encoded_ts_arrays:
+            return None
+        # Pad time series to the same length (zeros appended at the end).
+        max_length = max(ts.shape[1] for ts in encoded_ts_arrays)
+        padded_ts_arrays = [
+            np.pad(ts, ((0, 0), (0, max_length - ts.shape[1]), (0, 0)), mode="constant", constant_values=0.0)
+            for ts in encoded_ts_arrays
+        ]
+        concatenated_ts = np.concatenate(padded_ts_arrays, axis=0)  # Shape: [batch_size, max_length, feature_dim]
+        # Convert to torch
+        return torch.from_numpy(concatenated_ts).half()
     @property
     def model_input_names(self):
         """
         This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
         the docstring of this method for more information.
         """
+        return self.tokenizer.decode(*args, **kwargs)