Update processing_qwen2_ts.py
Browse files- processing_qwen2_ts.py +37 -4
processing_qwen2_ts.py
CHANGED
@@ -33,6 +33,7 @@ def sp_encoding(timeseries: np.ndarray, eots_token: bool = True) -> Tuple[np.nda
|
|
33 |
prompt (str): The placeholder string with offset and scaling info.
|
34 |
metadata (dict): Metadata containing the offset and scaling factor.
|
35 |
"""
|
|
|
36 |
mean = np.mean(timeseries)
|
37 |
scaled_timeseries = timeseries - mean
|
38 |
scale_factor = 1.0
|
@@ -57,12 +58,12 @@ class Qwen2TSProcessor(ProcessorMixin):
|
|
57 |
feature_extractor_class = None # You can add a feature extractor if needed
|
58 |
tokenizer_class = "AutoTokenizer"
|
59 |
|
60 |
-
def __init__(self, tokenizer=None):
|
61 |
"""
|
62 |
Args:
|
63 |
tokenizer: An optional tokenizer to process text prompts.
|
64 |
"""
|
65 |
-
super().__init__(tokenizer=tokenizer)
|
66 |
|
67 |
def __call__(
|
68 |
self,
|
@@ -71,6 +72,7 @@ class Qwen2TSProcessor(ProcessorMixin):
|
|
71 |
padding: Union[bool, str, PaddingStrategy] = False,
|
72 |
padding_side: str = 'left',
|
73 |
vllm_flag: bool = False,
|
|
|
74 |
**kwargs,
|
75 |
) -> BatchFeature:
|
76 |
"""
|
@@ -150,8 +152,10 @@ class Qwen2TSProcessor(ProcessorMixin):
|
|
150 |
|
151 |
# Tokenize the processed prompt
|
152 |
tokenizer_outputs = {}
|
153 |
-
if self.tokenizer is not None:
|
154 |
tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
|
|
|
|
|
155 |
|
156 |
# Create the final output
|
157 |
outputs = tokenizer_outputs
|
@@ -162,6 +166,35 @@ class Qwen2TSProcessor(ProcessorMixin):
|
|
162 |
|
163 |
return BatchFeature(data=outputs)
|
164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
@property
|
166 |
def model_input_names(self):
|
167 |
"""
|
@@ -184,4 +217,4 @@ class Qwen2TSProcessor(ProcessorMixin):
|
|
184 |
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
|
185 |
the docstring of this method for more information.
|
186 |
"""
|
187 |
-
return self.tokenizer.decode(*args, **kwargs)
|
|
|
33 |
prompt (str): The placeholder string with offset and scaling info.
|
34 |
metadata (dict): Metadata containing the offset and scaling factor.
|
35 |
"""
|
36 |
+
timeseries = np.array(timeseries)
|
37 |
mean = np.mean(timeseries)
|
38 |
scaled_timeseries = timeseries - mean
|
39 |
scale_factor = 1.0
|
|
|
58 |
feature_extractor_class = None # You can add a feature extractor if needed
|
59 |
tokenizer_class = "AutoTokenizer"
|
60 |
|
61 |
+
def __init__(self, tokenizer=None, **kwargs):
    """
    Initialize the processor by delegating to the parent class initializer.

    Args:
        tokenizer: An optional tokenizer to process text prompts.
        **kwargs: Additional keyword arguments, forwarded unchanged to the
            parent initializer.
    """
    super().__init__(tokenizer=tokenizer, **kwargs)
|
67 |
|
68 |
def __call__(
|
69 |
self,
|
|
|
72 |
padding: Union[bool, str, PaddingStrategy] = False,
|
73 |
padding_side: str = 'left',
|
74 |
vllm_flag: bool = False,
|
75 |
+
tokenize: bool = True,
|
76 |
**kwargs,
|
77 |
) -> BatchFeature:
|
78 |
"""
|
|
|
152 |
|
153 |
# Tokenize the processed prompt
|
154 |
tokenizer_outputs = {}
|
155 |
+
if tokenize and self.tokenizer is not None:
|
156 |
tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
|
157 |
+
else:
|
158 |
+
tokenizer_outputs = {"text": reconstructed_prompts}
|
159 |
|
160 |
# Create the final output
|
161 |
outputs = tokenizer_outputs
|
|
|
166 |
|
167 |
return BatchFeature(data=outputs)
|
168 |
|
169 |
+
def encode_timeseries(
    self,
    timeseries: Optional[List[List[np.ndarray]]] = None,
) -> Optional[torch.Tensor]:
    """
    Encode a batch of time series into a single zero-padded tensor.

    Every series is passed through `sp_encoding` (only the encoded array is
    kept; the prompt and metadata it also returns are discarded). The encoded
    arrays are right-padded with zeros along the length axis up to the longest
    one, stacked along a new leading batch axis and converted to a
    half-precision torch tensor.

    Args:
        timeseries: The time series to encode. `None` is treated the same as
            an empty collection.

    Returns:
        A `torch.float16` tensor holding the stacked, padded series, or `None`
        when there is nothing to encode.
    """
    # Nothing to encode: return early instead of handing None to
    # torch.from_numpy, which would raise a TypeError.
    if timeseries is None or len(timeseries) == 0:
        return None

    # Add a leading batch axis to each encoded series so they can be
    # concatenated along axis 0 afterwards.
    # NOTE(review): the 3-axis padding below assumes sp_encoding returns a
    # 2-D array of shape [seq_len, feature_dim] - confirm against sp_encoding.
    encoded_ts_arrays = []
    for ts in timeseries:
        encoded_ts, _, _ = sp_encoding(ts)
        encoded_ts_arrays.append(encoded_ts[None, ...])

    # Pad every series with trailing zeros up to the longest length.
    max_length = max(ts.shape[1] for ts in encoded_ts_arrays)
    padded_ts_arrays = [
        np.pad(
            ts,
            ((0, 0), (0, max_length - ts.shape[1]), (0, 0)),
            mode="constant",
            constant_values=0.0,
        )
        for ts in encoded_ts_arrays
    ]
    concatenated_ts = np.concatenate(padded_ts_arrays, axis=0)

    # Convert to a half-precision torch tensor.
    return torch.from_numpy(concatenated_ts).half()
|
197 |
+
|
198 |
@property
|
199 |
def model_input_names(self):
|
200 |
"""
|
|
|
217 |
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
|
218 |
the docstring of this method for more information.
|
219 |
"""
|
220 |
+
return self.tokenizer.decode(*args, **kwargs)
|