{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token length: 726\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 118\u001b[0m\n\u001b[1;32m    116\u001b[0m \u001b[38;5;66;03m#tmp[\"verifier\"][0] = tmp[\"verifier\"][0].replace(\"veri.*\\(\", \"solver(\")\u001b[39;00m\n\u001b[1;32m    117\u001b[0m new_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmessages\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mappend({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrole\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massistant\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(tmp[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mverifier\u001b[39m\u001b[38;5;124m\"\u001b[39m])})\n\u001b[0;32m--> 118\u001b[0m token_len \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_token_length\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnew_dict\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmessages\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    119\u001b[0m token_len_list\u001b[38;5;241m.\u001b[39mappend(token_len)\n\u001b[1;32m    120\u001b[0m data_output\u001b[38;5;241m.\u001b[39mappend(new_dict)\n",
      "Cell \u001b[0;32mIn[1], line 57\u001b[0m, in \u001b[0;36mcheck_token_length\u001b[0;34m(query)\u001b[0m\n\u001b[1;32m     53\u001b[0m \u001b[38;5;66;03m# Load the tokenizer\u001b[39;00m\n\u001b[1;32m     54\u001b[0m \n\u001b[1;32m     55\u001b[0m \u001b[38;5;66;03m# Apply chat template and tokenize\u001b[39;00m\n\u001b[1;32m     56\u001b[0m formatted_prompt \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mapply_chat_template(query, tokenize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m---> 57\u001b[0m tokens \u001b[38;5;241m=\u001b[39m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mformatted_prompt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m     59\u001b[0m \u001b[38;5;66;03m# Get token length\u001b[39;00m\n\u001b[1;32m     60\u001b[0m token_length \u001b[38;5;241m=\u001b[39m tokens\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2788\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, padding_side, return_tensors, **kwargs)\u001b[0m\n\u001b[1;32m   2750\u001b[0m \u001b[38;5;129m@add_end_docstrings\u001b[39m(\n\u001b[1;32m   2751\u001b[0m     ENCODE_KWARGS_DOCSTRING,\n\u001b[1;32m   2752\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   2771\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m   2772\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mint\u001b[39m]:\n\u001b[1;32m   2773\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   2774\u001b[0m \u001b[38;5;124;03m    Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.\u001b[39;00m\n\u001b[1;32m   2775\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   2786\u001b[0m \u001b[38;5;124;03m            method).\u001b[39;00m\n\u001b[1;32m   2787\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2788\u001b[0m     encoded_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2789\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2790\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2791\u001b[0m \u001b[43m        \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2792\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2793\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2794\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2795\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2796\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_side\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2797\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2798\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2799\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2801\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m encoded_inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3207\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m   3197\u001b[0m \u001b[38;5;66;03m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[1;32m   3198\u001b[0m padding_strategy, truncation_strategy, max_length, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_padding_truncation_strategies(\n\u001b[1;32m   3199\u001b[0m     padding\u001b[38;5;241m=\u001b[39mpadding,\n\u001b[1;32m   3200\u001b[0m     truncation\u001b[38;5;241m=\u001b[39mtruncation,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   3204\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m   3205\u001b[0m )\n\u001b[0;32m-> 3207\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   3208\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3209\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3210\u001b[0m \u001b[43m    \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3211\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3212\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3213\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3214\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3215\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3216\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3217\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_side\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3218\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3219\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3220\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3221\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3222\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3223\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3224\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3225\u001b[0m \u001b[43m    \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3226\u001b[0m \u001b[43m    \u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msplit_special_tokens\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3227\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3228\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py:603\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[0m\n\u001b[1;32m    579\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_encode_plus\u001b[39m(\n\u001b[1;32m    580\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    581\u001b[0m     text: Union[TextInput, PreTokenizedInput],\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    600\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m    601\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m BatchEncoding:\n\u001b[1;32m    602\u001b[0m     batched_input \u001b[38;5;241m=\u001b[39m [(text, text_pair)] \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;28;01melse\u001b[39;00m [text]\n\u001b[0;32m--> 603\u001b[0m     batched_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_batch_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    604\u001b[0m \u001b[43m        \u001b[49m\u001b[43mbatched_input\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    605\u001b[0m \u001b[43m        \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    606\u001b[0m \u001b[43m        \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    607\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    608\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    609\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    610\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    611\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    612\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_side\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    613\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    614\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    615\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    616\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    617\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    618\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    619\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    620\u001b[0m \u001b[43m        \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    621\u001b[0m \u001b[43m        \u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    622\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    623\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    625\u001b[0m     \u001b[38;5;66;03m# Return tensor is None, then we can remove the leading batch axis\u001b[39;00m\n\u001b[1;32m    626\u001b[0m     \u001b[38;5;66;03m# Overflowing tokens are returned as a batch of output so we keep them in this case\u001b[39;00m\n\u001b[1;32m    627\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m return_tensors \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m return_overflowing_tokens:\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py:529\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._batch_encode_plus\u001b[0;34m(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens)\u001b[0m\n\u001b[1;32m    526\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tokenizer\u001b[38;5;241m.\u001b[39mencode_special_tokens \u001b[38;5;241m!=\u001b[39m split_special_tokens:\n\u001b[1;32m    527\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tokenizer\u001b[38;5;241m.\u001b[39mencode_special_tokens \u001b[38;5;241m=\u001b[39m split_special_tokens\n\u001b[0;32m--> 529\u001b[0m encodings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_tokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_batch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    530\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    531\u001b[0m \u001b[43m    \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    532\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_pretokenized\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    533\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    535\u001b[0m \u001b[38;5;66;03m# Convert encoding to dict\u001b[39;00m\n\u001b[1;32m    536\u001b[0m \u001b[38;5;66;03m# `Tokens` has type: Tuple[\u001b[39;00m\n\u001b[1;32m    537\u001b[0m \u001b[38;5;66;03m#                       List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],\u001b[39;00m\n\u001b[1;32m    538\u001b[0m \u001b[38;5;66;03m#                       List[EncodingFast]\u001b[39;00m\n\u001b[1;32m    539\u001b[0m \u001b[38;5;66;03m#                    ]\u001b[39;00m\n\u001b[1;32m    540\u001b[0m \u001b[38;5;66;03m# with nested dimensions corresponding to batch, overflows, sequence length\u001b[39;00m\n\u001b[1;32m    541\u001b[0m tokens_and_encodings \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m    542\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_convert_encoding(\n\u001b[1;32m    543\u001b[0m         encoding\u001b[38;5;241m=\u001b[39mencoding,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    552\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m encoding \u001b[38;5;129;01min\u001b[39;00m encodings\n\u001b[1;32m    553\u001b[0m ]\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "import json\n",
    "from transformers import AutoTokenizer\n",
    "from transformers.utils import logging\n",
    "import torch\n",
    "model_name = \"/mnt/data/zifeng.cao/reasoning/arc-agi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct/pt_output_plus_step_output/checkpoint-274\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "system_prompt = '''You are a helpful assistant that can solve reasoning tasks by using a limited set of DSL functions that are implemented in Python. \n",
    "*** Task description ***\n",
    "- Each task consists of around a handful of training examples, where an training example consists of an input grid and an output grid. \n",
    "- For each training example, the output grid is the result of applying the same task-specific transformation to the input grid. \n",
    "- The goal is to infer the transformation from the few training examples.\n",
    "- The transformation is a task-specific grid transformation, which can be decomposed into a sequence of the DSL functions.\n",
    "*** DSL description ***\n",
    "- Types and Constants\n",
    "  - **Types**: Define various data types like `Grid`, `Object`, `Indices`, and more to facilitate grid operations.\n",
    "  - **Constants**: Include color constants (e.g., `ZERO`, `ONE`), boolean constants (`T`, `F`), and directional vectors (e.g., `UP`, `DOWN`).\n",
    "- Primitives\n",
    "  - **Math Operations**: Functions like `add`, `subtract`, `multiply`, and `divide` perform basic arithmetic on integers or tuples.\n",
    "  - **Logical Operations**: Functions such as `even`, `flip`, and `both` handle logical evaluations.\n",
    "  - **Data Operations**: Functions like `identity`, `order`, `merge`, `difference`, and `dedupe` manage data containers.\n",
    "- Grid and Object Manipulation\n",
    "  - **Grid Creation**: `canvas` creates grids with specified dimensions and values.\n",
    "  - **Grid Transformation**: Functions like `rot90`, `hmirror`, `upscale`, and `downscale` transform grids in various ways.\n",
    "  - **Subgrid Operations**: `crop`, `hsplit`, `vsplit`, and `trim` extract or modify parts of grids.\n",
    "  - **Object and Patch Handling**: Functions like `objects`, `normalize`, `shift`, `toindices`, and `recolor` handle grid patches and objects.\n",
    "- Analysis and Filtering\n",
    "  - **Color Analysis**: Functions such as `mostcolor`, `leastcolor`, `colorcount`, and `palette` analyze color distributions.\n",
    "  - **Object Filtering**: `colorfilter` and `sizefilter` filter objects by color or size.\n",
    "  - **Spatial Analysis**: Functions like `center`, `position`, `manhattan`, and `adjacent` analyze spatial relationships.\n",
    "- Connectivity and Bounding\n",
    "  - **Connectivity**: `connect`, `neighbors`, `dneighbors`, and `ineighbors` determine connections between grid indices.\n",
    "  - **Bounding**: Functions like `box`, `inbox`, `outbox`, and `corners` manage bounding areas of patches.\n",
    "- Utils\n",
    "  - **Random Integer Generation**: `unifint` generates random integers within specified bounds and difficulty levels.\n",
    "  - **Grid Validation**: `is_grid` checks if an input is a valid grid.\n",
    "  - **Grid Formatting**: `format_grid` casts lists to the grid type.\n",
    "*** Format of the generated code ***\n",
    "- The only allowed operations are storing the result of a function call in a variable, where all arguments must either be the input grid, some constants such as integers or common vectors indicating directions, or a variable previously computed within the same solver, and each function that is being called must either be a DSL function or a variable previously constructed within the same solver. \n",
    "- This also means that each line of code is enforced to be a single function call.\n",
    "So, you are given a task and a set of examples, you need to generate a code that can solve the task.\n",
    "'''\n",
    "\n",
    "token_length = tokenizer.encode(system_prompt, return_tensors=\"pt\").shape[1]\n",
    "print(f\"Token length: {token_length}\")\n",
    "\n",
    "\n",
    "\n",
    "def check_token_length(query):\n",
    "    # Suppress warnings\n",
    "    logging.set_verbosity_error()\n",
    "\n",
    "    # Load the tokenizer\n",
    "\n",
    "    # Apply chat template and tokenize\n",
    "    formatted_prompt = tokenizer.apply_chat_template(query, tokenize=False)\n",
    "    tokens = tokenizer.encode(formatted_prompt, return_tensors=\"pt\")\n",
    "\n",
    "    # Get token length\n",
    "    token_length = tokens.shape[1]\n",
    "\n",
    "    #print(f\"Token length: {token_length}\")\n",
    "\n",
    "    # Check if it exceeds the model's context length (assuming 4096 for Llama-2)\n",
    "\n",
    "    return token_length\n",
    "def list_of_lists_to_string_with_commas_and_newlines(list_of_lists):\n",
    "    return '\\n'.join(','.join(str(item) for item in sublist) for sublist in list_of_lists)\n",
    "\n",
    "def transform_query(query):\n",
    "    result_str = \"\"\n",
    "    previous_result = \"\"\n",
    "    for i, example in enumerate(query):\n",
    "        try:\n",
    "            r_i, c_i = len(example[\"input\"]), len(example[\"input\"][0])\n",
    "            r_o, c_o = len(example[\"output\"]), len(example[\"output\"][0])\n",
    "        except:\n",
    "            print(example)\n",
    "            return None\n",
    "        # input_str = \"\\n\".join([\"|\".join(map(str, row)) for row in example[\"input\"]])\n",
    "        # output_str = \"\\n\".join([\"|\".join(map(str, row)) for row in example[\"output\"]])\n",
    "        input_str = list_of_lists_to_string_with_commas_and_newlines(example[\"input\"])\n",
    "        output_str = list_of_lists_to_string_with_commas_and_newlines(example[\"output\"])\n",
    "        result_str = previous_result + f\"** Example {i+1} ** \\n input: ({r_i} by {c_i}) Matrix \\n{input_str}\\n output: ({r_o} by {c_o}) Matrix \\n{output_str}\\n\\n\"\n",
    "        previous_result = result_str\n",
    "        if len(result_str) > 14000: #6000\n",
    "            token_length = tokenizer.encode(previous_result, return_tensors=\"pt\").shape[1]\n",
    "            #print(f\"previous Token length: {token_length}\")\n",
    "            if token_length > 14000: #6000\n",
    "                return None\n",
    "            return previous_result\n",
    "    token_length = tokenizer.encode(result_str, return_tensors=\"pt\").shape[1]\n",
    "    #print(i, len(result_str))\n",
    "    #print(f\"Token length: {token_length}\")\n",
    "    \n",
    "    return result_str\n",
    "\n",
    "import re\n",
    "f = open(\"multi_step_verifiers_training.txt\", \"r\")\n",
    "data_output = []\n",
    "token_len_list = []\n",
    "skip = 0\n",
    "total = 0\n",
    "for line in f:\n",
    "    total += 1\n",
    "    tmp = json.loads(line)\n",
    "    new_dict = {\"messages\":[]}\n",
    "    new_dict[\"messages\"].append({\"role\":\"system\", \"content\":system_prompt})\n",
    "    tran = transform_query(tmp[\"example\"])   \n",
    "    if tran == None:\n",
    "        skip += 1\n",
    "        continue\n",
    "\n",
    "    new_dict[\"messages\"].append({\"role\":\"user\", \"content\":tran})\n",
    "    tmp[\"verifier\"][0] =re.sub(r'veri.*?\\(', 'solver(', tmp[\"verifier\"][0])\n",
    "    #tmp[\"verifier\"][0] = tmp[\"verifier\"][0].replace(\"veri.*\\(\", \"solver(\")\n",
    "    new_dict[\"messages\"].append({\"role\":\"assistant\", \"content\":\"\\n\".join(tmp[\"verifier\"])})\n",
    "    token_len = check_token_length(new_dict[\"messages\"])\n",
    "    token_len_list.append(token_len)\n",
    "    data_output.append(new_dict)\n",
    "print(data_output[0][\"messages\"][1][\"content\"])\n",
    "print(\"skip:\", skip)\n",
    "print(\"total:\", total)\n",
    "print(\"Token length max:\", max(token_len_list))\n",
    "print(\"Token length min:\", min(token_len_list))\n",
    "f.close()\n",
    "with open(\"multi_step_verifiers_training.json\", \"w\") as f:\n",
    "    json.dump(data_output, f, indent=4)\n"
   ]
  },
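  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sanity check for the helpers defined above, using two made-up toy grids\n",
    "# (hypothetical examples, not taken from the real training file). It assumes the\n",
    "# previous cell has been run so that `tokenizer`, `system_prompt`, `transform_query`,\n",
    "# and `check_token_length` are all defined.\n",
    "toy_examples = [\n",
    "    {\"input\": [[0, 1], [1, 0]], \"output\": [[1, 0], [0, 1]]},\n",
    "    {\"input\": [[2, 2, 2]], \"output\": [[2]]},\n",
    "]\n",
    "toy_user_content = transform_query(toy_examples)\n",
    "print(toy_user_content)\n",
    "toy_messages = [\n",
    "    {\"role\": \"system\", \"content\": system_prompt},\n",
    "    {\"role\": \"user\", \"content\": toy_user_content},\n",
    "]\n",
    "print(\"Token length:\", check_token_length(toy_messages))"
   ]
  },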
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token length: 726\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      " 72%|███████▏  | 96270/134580 [14:57<05:57, 107.26it/s]\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "File \u001b[0;32m/usr/local/lib/python3.10/multiprocessing/pool.py:856\u001b[0m, in \u001b[0;36mIMapIterator.next\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    855\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 856\u001b[0m     item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_items\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpopleft\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    857\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m:\n",
      "\u001b[0;31mIndexError\u001b[0m: pop from an empty deque",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[2], line 136\u001b[0m\n\u001b[1;32m    133\u001b[0m         json\u001b[38;5;241m.\u001b[39mdump(data_output, f, indent\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m)\n\u001b[1;32m    135\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m--> 136\u001b[0m     \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[2], line 115\u001b[0m, in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m    112\u001b[0m \u001b[38;5;66;03m# Create a pool of workers\u001b[39;00m\n\u001b[1;32m    113\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mp\u001b[38;5;241m.\u001b[39mPool(processes\u001b[38;5;241m=\u001b[39mmp\u001b[38;5;241m.\u001b[39mcpu_count()) \u001b[38;5;28;01mas\u001b[39;00m pool:\n\u001b[1;32m    114\u001b[0m     \u001b[38;5;66;03m# Process lines in parallel with progress bar\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m     results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtqdm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpool\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mimap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_line\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlines\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    117\u001b[0m     \u001b[38;5;66;03m# Collect results\u001b[39;00m\n\u001b[1;32m    118\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m results:\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/site-packages/tqdm/std.py:1181\u001b[0m, in \u001b[0;36mtqdm.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1178\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_time\n\u001b[1;32m   1180\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1181\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[1;32m   1182\u001b[0m         \u001b[38;5;28;01myield\u001b[39;00m obj\n\u001b[1;32m   1183\u001b[0m         \u001b[38;5;66;03m# Update and possibly print the progressbar.\u001b[39;00m\n\u001b[1;32m   1184\u001b[0m         \u001b[38;5;66;03m# Note: does not call self.update(1) for speed optimisation.\u001b[39;00m\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/multiprocessing/pool.py:861\u001b[0m, in \u001b[0;36mIMapIterator.next\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    859\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pool \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    860\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 861\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cond\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    862\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    863\u001b[0m     item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_items\u001b[38;5;241m.\u001b[39mpopleft()\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:    \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m    319\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m         \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    321\u001b[0m         gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m    322\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "import json\n",
    "from transformers import AutoTokenizer\n",
    "from transformers.utils import logging\n",
    "import torch\n",
    "import multiprocessing as mp\n",
    "from tqdm import tqdm\n",
    "\n",
    "model_name = \"/mnt/data/zifeng.cao/reasoning/arc-agi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct/pt_output_plus_step_output/checkpoint-274\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "system_prompt = '''You are a helpful assistant that can solve reasoning tasks by using a limited set of DSL functions that are implemented in Python. \n",
    "*** Task description ***\n",
    "- Each task consists of around a handful of training examples, where an training example consists of an input grid and an output grid. \n",
    "- For each training example, the output grid is the result of applying the same task-specific transformation to the input grid. \n",
    "- The goal is to infer the transformation from the few training examples.\n",
    "- The transformation is a task-specific grid transformation, which can be decomposed into a sequence of the DSL functions.\n",
    "*** DSL description ***\n",
    "- Types and Constants\n",
    "  - **Types**: Define various data types like `Grid`, `Object`, `Indices`, and more to facilitate grid operations.\n",
    "  - **Constants**: Include color constants (e.g., `ZERO`, `ONE`), boolean constants (`T`, `F`), and directional vectors (e.g., `UP`, `DOWN`).\n",
    "- Primitives\n",
    "  - **Math Operations**: Functions like `add`, `subtract`, `multiply`, and `divide` perform basic arithmetic on integers or tuples.\n",
    "  - **Logical Operations**: Functions such as `even`, `flip`, and `both` handle logical evaluations.\n",
    "  - **Data Operations**: Functions like `identity`, `order`, `merge`, `difference`, and `dedupe` manage data containers.\n",
    "- Grid and Object Manipulation\n",
    "  - **Grid Creation**: `canvas` creates grids with specified dimensions and values.\n",
    "  - **Grid Transformation**: Functions like `rot90`, `hmirror`, `upscale`, and `downscale` transform grids in various ways.\n",
    "  - **Subgrid Operations**: `crop`, `hsplit`, `vsplit`, and `trim` extract or modify parts of grids.\n",
    "  - **Object and Patch Handling**: Functions like `objects`, `normalize`, `shift`, `toindices`, and `recolor` handle grid patches and objects.\n",
    "- Analysis and Filtering\n",
    "  - **Color Analysis**: Functions such as `mostcolor`, `leastcolor`, `colorcount`, and `palette` analyze color distributions.\n",
    "  - **Object Filtering**: `colorfilter` and `sizefilter` filter objects by color or size.\n",
    "  - **Spatial Analysis**: Functions like `center`, `position`, `manhattan`, and `adjacent` analyze spatial relationships.\n",
    "- Connectivity and Bounding\n",
    "  - **Connectivity**: `connect`, `neighbors`, `dneighbors`, and `ineighbors` determine connections between grid indices.\n",
    "  - **Bounding**: Functions like `box`, `inbox`, `outbox`, and `corners` manage bounding areas of patches.\n",
    "- Utils\n",
    "  - **Random Integer Generation**: `unifint` generates random integers within specified bounds and difficulty levels.\n",
    "  - **Grid Validation**: `is_grid` checks if an input is a valid grid.\n",
    "  - **Grid Formatting**: `format_grid` casts lists to the grid type.\n",
    "*** Format of the generated code ***\n",
    "- The only allowed operations are storing the result of a function call in a variable, where all arguments must either be the input grid, some constants such as integers or common vectors indicating directions, or a variable previously computed within the same solver, and each function that is being called must either be a DSL function or a variable previously constructed within the same solver. \n",
    "- This also means that each line of code is enforced to be a single function call.\n",
    "So, you are given a task and a set of examples, you need to generate a code that can solve the task.\n",
    "'''\n",
    "\n",
    "token_length = tokenizer.encode(system_prompt, return_tensors=\"pt\").shape[1]\n",
    "print(f\"Token length: {token_length}\")\n",
    "\n",
    "def check_token_length(query):\n",
    "    # Suppress warnings\n",
    "    logging.set_verbosity_error()\n",
    "\n",
    "    # Apply chat template and tokenize\n",
    "    formatted_prompt = tokenizer.apply_chat_template(query, tokenize=False)\n",
    "    tokens = tokenizer.encode(formatted_prompt, return_tensors=\"pt\")\n",
    "\n",
    "    # Get token length\n",
    "    token_length = tokens.shape[1]\n",
    "\n",
    "    return token_length\n",
    "\n",
    "def list_of_lists_to_string_with_commas_and_newlines(list_of_lists):\n",
    "    return '\\n'.join(','.join(str(item) for item in sublist) for sublist in list_of_lists)\n",
    "\n",
    "def transform_query(query):\n",
    "    result_str = \"\"\n",
    "    previous_result = \"\"\n",
    "    for i, example in enumerate(query):\n",
    "        try:\n",
    "            r_i, c_i = len(example[\"input\"]), len(example[\"input\"][0])\n",
    "            r_o, c_o = len(example[\"output\"]), len(example[\"output\"][0])\n",
    "        except:\n",
    "            print(example)\n",
    "            return None\n",
    "        input_str = list_of_lists_to_string_with_commas_and_newlines(example[\"input\"])\n",
    "        output_str = list_of_lists_to_string_with_commas_and_newlines(example[\"output\"])\n",
    "        result_str = previous_result + f\"** Example {i+1} ** \\n input: ({r_i} by {c_i}) Matrix \\n{input_str}\\n output: ({r_o} by {c_o}) Matrix \\n{output_str}\\n\\n\"\n",
    "        previous_result = result_str\n",
    "        if len(result_str) > 14000:\n",
    "            token_length = tokenizer.encode(previous_result, return_tensors=\"pt\").shape[1]\n",
    "            if token_length > 14000:\n",
    "                return None\n",
    "            return previous_result\n",
    "    token_length = tokenizer.encode(result_str, return_tensors=\"pt\").shape[1]\n",
    "    \n",
    "    return result_str\n",
    "\n",
    "def process_line(line):\n",
    "    tmp = json.loads(line)\n",
    "    new_dict = {\"messages\":[]}\n",
    "    new_dict[\"messages\"].append({\"role\":\"system\", \"content\":system_prompt})\n",
    "    tran = transform_query(tmp[\"example\"])   \n",
    "    if tran == None:\n",
    "        return None\n",
    "\n",
    "    new_dict[\"messages\"].append({\"role\":\"user\", \"content\":tran})\n",
    "    tmp[\"verifier\"][0] = re.sub(r'veri.*?\\(', 'solver(', tmp[\"verifier\"][0])\n",
    "    new_dict[\"messages\"].append({\"role\":\"assistant\", \"content\":\"\\n\".join(tmp[\"verifier\"])})\n",
    "    token_len = check_token_length(new_dict[\"messages\"])\n",
    "    return new_dict, token_len\n",
    "\n",
    "def main():\n",
    "    with open(\"multi_step_verifiers_training.txt\", \"r\") as f:\n",
    "        lines = f.readlines()\n",
    "    \n",
    "    total = len(lines)\n",
    "    data_output = []\n",
    "    token_len_list = []\n",
    "    skip = 0\n",
    "    \n",
    "    # Create a pool of workers\n",
    "    with mp.Pool(processes=mp.cpu_count()) as pool:\n",
    "        # Process lines in parallel with progress bar\n",
    "        results = list(tqdm(pool.imap(process_line, lines), total=total))\n",
    "        \n",
    "        # Collect results\n",
    "        for result in results:\n",
    "            if result is None:\n",
    "                skip += 1\n",
    "                continue\n",
    "            new_dict, token_len = result\n",
    "            data_output.append(new_dict)\n",
    "            token_len_list.append(token_len)\n",
    "    \n",
    "    print(data_output[0][\"messages\"][1][\"content\"])\n",
    "    print(\"skip:\", skip)\n",
    "    print(\"total:\", total)\n",
    "    print(\"Token length max:\", max(token_len_list))\n",
    "    print(\"Token length min:\", min(token_len_list))\n",
    "    \n",
    "    with open(\"multi_step_verifiers_training.json\", \"w\") as f:\n",
    "        json.dump(data_output, f, indent=4)\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    main()\n"
   ]
  },
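  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: the stderr output above shows the huggingface/tokenizers warning about\n",
    "# forking after parallelism has already been used. As the warning itself suggests,\n",
    "# setting TOKENIZERS_PARALLELISM before the worker pool is created silences it;\n",
    "# this is a small sketch, and the previous cell needs to be re-run for it to take effect.\n",
    "import os\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
   ]
  },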
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'def verifier_9f6b5f41(I: Grid) -> Grid:'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "def replace_verifier_with_solver(input_string):\n",
    "    return re.sub(r'veri.*?\\(', 'solver(', input_string)\n",
    "\n",
    "original_string = 'def verifier_9f6b5f41(I: Grid) -> Grid:'\n",
    "modified_string = replace_verifier_with_solver(original_string)\n",
    "print(modified_string)"
   ]
  },
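  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Why the non-greedy pattern matters, shown on a hypothetical def line (for\n",
    "# illustration only): a greedy 'veri.*\\(' would match up to the LAST '(' on the\n",
    "# line and mangle the signature, while the non-greedy 'veri.*?\\(' used above\n",
    "# stops at the first '('.\n",
    "import re\n",
    "line = 'def verifier_9f6b5f41(I: Grid, extra=(0, 0)) -> Grid:'\n",
    "print(re.sub(r'veri.*\\(', 'solver(', line))   # greedy: 'def solver(0, 0)) -> Grid:'\n",
    "print(re.sub(r'veri.*?\\(', 'solver(', line))  # non-greedy keeps the full signature"
   ]
  },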
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "# merge two datasets\n",
    "data1 = json.load(open(\"multi_step_verifiers_training.json\", \"r\"))\n",
    "data2 = json.load(open(\"re_arc_v4.json\", \"r\"))\n",
    "data1.extend(data2)\n",
    "with open(\"multi_step_merged_arc_v4.json\", \"w\") as f:\n",
    "    json.dump(data1, f, indent=4)"
   ]
  },
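  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check on the merge (a sketch; it assumes the cell above has just\n",
    "# written multi_step_merged_arc_v4.json and that `json` is still imported).\n",
    "with open(\"multi_step_merged_arc_v4.json\", \"r\") as f:\n",
    "    merged = json.load(f)\n",
    "print(\"merged examples:\", len(merged))\n",
    "print(\"roles in first example:\", [m[\"role\"] for m in merged[0][\"messages\"]])"
   ]
  }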
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}