xiezhe24 committed
Commit fea24f2 · verified · 1 Parent(s): 1e66110

Update processing_qwen2_ts.py
Files changed (1): processing_qwen2_ts.py +37 -4
processing_qwen2_ts.py CHANGED
@@ -33,6 +33,7 @@ def sp_encoding(timeseries: np.ndarray, eots_token: bool = True) -> Tuple[np.nda
         prompt (str): The placeholder string with offset and scaling info.
         metadata (dict): Metadata containing the offset and scaling factor.
     """
+    timeseries = np.array(timeseries)
     mean = np.mean(timeseries)
     scaled_timeseries = timeseries - mean
     scale_factor = 1.0
@@ -57,12 +58,12 @@ class Qwen2TSProcessor(ProcessorMixin):
     feature_extractor_class = None  # You can add a feature extractor if needed
     tokenizer_class = "AutoTokenizer"
 
-    def __init__(self, tokenizer=None):
+    def __init__(self, tokenizer=None, **kwargs):
         """
         Args:
             tokenizer: An optional tokenizer to process text prompts.
         """
-        super().__init__(tokenizer=tokenizer)
+        super().__init__(tokenizer=tokenizer, **kwargs)
 
     def __call__(
         self,
@@ -71,6 +72,7 @@ class Qwen2TSProcessor(ProcessorMixin):
         padding: Union[bool, str, PaddingStrategy] = False,
         padding_side: str = 'left',
         vllm_flag: bool = False,
+        tokenize: bool = True,
         **kwargs,
     ) -> BatchFeature:
         """
@@ -150,8 +152,10 @@
 
         # Tokenize the processed prompt
         tokenizer_outputs = {}
-        if self.tokenizer is not None:
+        if tokenize and self.tokenizer is not None:
             tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
+        else:
+            tokenizer_outputs = {"text": reconstructed_prompts}
 
         # Create the final output
         outputs = tokenizer_outputs
@@ -162,6 +166,35 @@
 
         return BatchFeature(data=outputs)
 
+    def encode_timeseries(
+        self,
+        timeseries: Optional[List[List[np.ndarray]]] = None,
+    ) -> np.ndarray:
+        if timeseries is None:
+            timeseries = []
+
+        concatenated_ts = None
+        encoded_ts_arrays = []
+
+        for i, ts in enumerate(timeseries):
+            encoded_ts, _, _ = sp_encoding(ts)
+            # Ensure time series shape [1, seq_len, feature_dim] for batch concatenation
+            encoded_ts_arrays.append(encoded_ts[None, ...])
+
+        if len(encoded_ts_arrays) > 0:
+            # Pad time series to the same length
+            max_length = max(ts.shape[1] for ts in encoded_ts_arrays)
+            padded_ts_arrays = [
+                np.pad(ts, ((0, 0), (0, max_length - ts.shape[1]), (0, 0)), mode="constant", constant_values=0.0)
+                for ts in encoded_ts_arrays
+            ]
+            concatenated_ts = np.concatenate(padded_ts_arrays, axis=0)  # Shape: [batch_size, max_length, feature_dim]
+
+            # Convert to torch
+            concatenated_ts = torch.from_numpy(concatenated_ts).half()
+
+        return concatenated_ts
+
     @property
     def model_input_names(self):
         """
@@ -184,4 +217,4 @@ class Qwen2TSProcessor(ProcessorMixin):
         This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
         the docstring of this method for more information.
         """
-        return self.tokenizer.decode(*args, **kwargs)
+        return self.tokenizer.decode(*args, **kwargs)
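
Usage sketch: the snippet below exercises the behaviors this commit adds. It is a minimal sketch under stated assumptions: the checkpoint path is hypothetical, and the text/timeseries keyword names and the <ts><ts/> placeholder are assumptions, since the start of the __call__ signature lies outside the hunks shown above.

import numpy as np
from transformers import AutoTokenizer

# Hypothetical checkpoint path; any tokenizer the processor accepts works here.
tokenizer = AutoTokenizer.from_pretrained("path/to/qwen2-ts-checkpoint")
processor = Qwen2TSProcessor(tokenizer=tokenizer)

# sp_encoding() now coerces its input with np.array(), so plain Python lists
# are accepted alongside np.ndarray.
series = [np.sin(np.linspace(0.0, 6.28, 128)), [0.5, 1.5, 2.5]]

# tokenize=False skips the tokenizer and returns the reconstructed prompts as
# raw strings under "text" (useful when tokenization happens downstream, e.g.
# in a vLLM front end). The text/timeseries keyword names are assumed.
features = processor(
    text=["<ts><ts/>Please analyze this time series."] * len(series),
    timeseries=series,
    tokenize=False,
)

# encode_timeseries() runs sp_encoding() on each element (despite the
# List[List[np.ndarray]] annotation, each element is encoded as one series),
# zero-pads the batch to a common length, and returns a float16 torch tensor
# of shape [batch_size, max_length, feature_dim]; an empty batch yields None.
ts_tensor = processor.encode_timeseries(series)

Returning {"text": reconstructed_prompts} on the untokenized path keeps the output dict shape uniform, so callers that tokenize elsewhere can consume the same BatchFeature.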