merve HF Staff commited on
Commit
8b9c168
·
verified ·
1 Parent(s): fe8c001
Files changed (1) hide show
  1. DINOv3_FT.ipynb +1774 -0
DINOv3_FT.ipynb ADDED
@@ -0,0 +1,1774 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "machine_shape": "hm",
8
+ "gpuType": "L4"
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ },
17
+ "accelerator": "GPU",
18
+ "widgets": {
19
+ "application/vnd.jupyter.widget-state+json": {
20
+ "32138245d41348928cc5b5834b07cb7e": {
21
+ "model_module": "@jupyter-widgets/controls",
22
+ "model_name": "HBoxModel",
23
+ "model_module_version": "1.5.0",
24
+ "state": {
25
+ "_dom_classes": [],
26
+ "_model_module": "@jupyter-widgets/controls",
27
+ "_model_module_version": "1.5.0",
28
+ "_model_name": "HBoxModel",
29
+ "_view_count": null,
30
+ "_view_module": "@jupyter-widgets/controls",
31
+ "_view_module_version": "1.5.0",
32
+ "_view_name": "HBoxView",
33
+ "box_style": "",
34
+ "children": [
35
+ "IPY_MODEL_df6de04fdb204d348767dd0b2d0e88f7",
36
+ "IPY_MODEL_63a3800d62dd41d6b4a3f643a8930d95",
37
+ "IPY_MODEL_49d67bd205184874a5cee04d318d91fe"
38
+ ],
39
+ "layout": "IPY_MODEL_f00ace964f96471b9eb839cce48ce378"
40
+ }
41
+ },
42
+ "df6de04fdb204d348767dd0b2d0e88f7": {
43
+ "model_module": "@jupyter-widgets/controls",
44
+ "model_name": "HTMLModel",
45
+ "model_module_version": "1.5.0",
46
+ "state": {
47
+ "_dom_classes": [],
48
+ "_model_module": "@jupyter-widgets/controls",
49
+ "_model_module_version": "1.5.0",
50
+ "_model_name": "HTMLModel",
51
+ "_view_count": null,
52
+ "_view_module": "@jupyter-widgets/controls",
53
+ "_view_module_version": "1.5.0",
54
+ "_view_name": "HTMLView",
55
+ "description": "",
56
+ "description_tooltip": null,
57
+ "layout": "IPY_MODEL_3ad0ac8def244930a3aff41d68a88a65",
58
+ "placeholder": "​",
59
+ "style": "IPY_MODEL_7464841c193d492685bb929b1c0d230c",
60
+ "value": "preprocessor_config.json: 100%"
61
+ }
62
+ },
63
+ "63a3800d62dd41d6b4a3f643a8930d95": {
64
+ "model_module": "@jupyter-widgets/controls",
65
+ "model_name": "FloatProgressModel",
66
+ "model_module_version": "1.5.0",
67
+ "state": {
68
+ "_dom_classes": [],
69
+ "_model_module": "@jupyter-widgets/controls",
70
+ "_model_module_version": "1.5.0",
71
+ "_model_name": "FloatProgressModel",
72
+ "_view_count": null,
73
+ "_view_module": "@jupyter-widgets/controls",
74
+ "_view_module_version": "1.5.0",
75
+ "_view_name": "ProgressView",
76
+ "bar_style": "success",
77
+ "description": "",
78
+ "description_tooltip": null,
79
+ "layout": "IPY_MODEL_5c16553a2ff34a37a2cb62b4a4c42a6f",
80
+ "max": 585,
81
+ "min": 0,
82
+ "orientation": "horizontal",
83
+ "style": "IPY_MODEL_34be83ddb4bf43e58cadbcbac5a606b7",
84
+ "value": 585
85
+ }
86
+ },
87
+ "49d67bd205184874a5cee04d318d91fe": {
88
+ "model_module": "@jupyter-widgets/controls",
89
+ "model_name": "HTMLModel",
90
+ "model_module_version": "1.5.0",
91
+ "state": {
92
+ "_dom_classes": [],
93
+ "_model_module": "@jupyter-widgets/controls",
94
+ "_model_module_version": "1.5.0",
95
+ "_model_name": "HTMLModel",
96
+ "_view_count": null,
97
+ "_view_module": "@jupyter-widgets/controls",
98
+ "_view_module_version": "1.5.0",
99
+ "_view_name": "HTMLView",
100
+ "description": "",
101
+ "description_tooltip": null,
102
+ "layout": "IPY_MODEL_0ce7bd7e52074f29b446ef2d4dd0921a",
103
+ "placeholder": "​",
104
+ "style": "IPY_MODEL_7e2178d696c04d5787e736ace9ab57c0",
105
+ "value": " 585/585 [00:00<00:00, 65.6kB/s]"
106
+ }
107
+ },
108
+ "f00ace964f96471b9eb839cce48ce378": {
109
+ "model_module": "@jupyter-widgets/base",
110
+ "model_name": "LayoutModel",
111
+ "model_module_version": "1.2.0",
112
+ "state": {
113
+ "_model_module": "@jupyter-widgets/base",
114
+ "_model_module_version": "1.2.0",
115
+ "_model_name": "LayoutModel",
116
+ "_view_count": null,
117
+ "_view_module": "@jupyter-widgets/base",
118
+ "_view_module_version": "1.2.0",
119
+ "_view_name": "LayoutView",
120
+ "align_content": null,
121
+ "align_items": null,
122
+ "align_self": null,
123
+ "border": null,
124
+ "bottom": null,
125
+ "display": null,
126
+ "flex": null,
127
+ "flex_flow": null,
128
+ "grid_area": null,
129
+ "grid_auto_columns": null,
130
+ "grid_auto_flow": null,
131
+ "grid_auto_rows": null,
132
+ "grid_column": null,
133
+ "grid_gap": null,
134
+ "grid_row": null,
135
+ "grid_template_areas": null,
136
+ "grid_template_columns": null,
137
+ "grid_template_rows": null,
138
+ "height": null,
139
+ "justify_content": null,
140
+ "justify_items": null,
141
+ "left": null,
142
+ "margin": null,
143
+ "max_height": null,
144
+ "max_width": null,
145
+ "min_height": null,
146
+ "min_width": null,
147
+ "object_fit": null,
148
+ "object_position": null,
149
+ "order": null,
150
+ "overflow": null,
151
+ "overflow_x": null,
152
+ "overflow_y": null,
153
+ "padding": null,
154
+ "right": null,
155
+ "top": null,
156
+ "visibility": null,
157
+ "width": null
158
+ }
159
+ },
160
+ "3ad0ac8def244930a3aff41d68a88a65": {
161
+ "model_module": "@jupyter-widgets/base",
162
+ "model_name": "LayoutModel",
163
+ "model_module_version": "1.2.0",
164
+ "state": {
165
+ "_model_module": "@jupyter-widgets/base",
166
+ "_model_module_version": "1.2.0",
167
+ "_model_name": "LayoutModel",
168
+ "_view_count": null,
169
+ "_view_module": "@jupyter-widgets/base",
170
+ "_view_module_version": "1.2.0",
171
+ "_view_name": "LayoutView",
172
+ "align_content": null,
173
+ "align_items": null,
174
+ "align_self": null,
175
+ "border": null,
176
+ "bottom": null,
177
+ "display": null,
178
+ "flex": null,
179
+ "flex_flow": null,
180
+ "grid_area": null,
181
+ "grid_auto_columns": null,
182
+ "grid_auto_flow": null,
183
+ "grid_auto_rows": null,
184
+ "grid_column": null,
185
+ "grid_gap": null,
186
+ "grid_row": null,
187
+ "grid_template_areas": null,
188
+ "grid_template_columns": null,
189
+ "grid_template_rows": null,
190
+ "height": null,
191
+ "justify_content": null,
192
+ "justify_items": null,
193
+ "left": null,
194
+ "margin": null,
195
+ "max_height": null,
196
+ "max_width": null,
197
+ "min_height": null,
198
+ "min_width": null,
199
+ "object_fit": null,
200
+ "object_position": null,
201
+ "order": null,
202
+ "overflow": null,
203
+ "overflow_x": null,
204
+ "overflow_y": null,
205
+ "padding": null,
206
+ "right": null,
207
+ "top": null,
208
+ "visibility": null,
209
+ "width": null
210
+ }
211
+ },
212
+ "7464841c193d492685bb929b1c0d230c": {
213
+ "model_module": "@jupyter-widgets/controls",
214
+ "model_name": "DescriptionStyleModel",
215
+ "model_module_version": "1.5.0",
216
+ "state": {
217
+ "_model_module": "@jupyter-widgets/controls",
218
+ "_model_module_version": "1.5.0",
219
+ "_model_name": "DescriptionStyleModel",
220
+ "_view_count": null,
221
+ "_view_module": "@jupyter-widgets/base",
222
+ "_view_module_version": "1.2.0",
223
+ "_view_name": "StyleView",
224
+ "description_width": ""
225
+ }
226
+ },
227
+ "5c16553a2ff34a37a2cb62b4a4c42a6f": {
228
+ "model_module": "@jupyter-widgets/base",
229
+ "model_name": "LayoutModel",
230
+ "model_module_version": "1.2.0",
231
+ "state": {
232
+ "_model_module": "@jupyter-widgets/base",
233
+ "_model_module_version": "1.2.0",
234
+ "_model_name": "LayoutModel",
235
+ "_view_count": null,
236
+ "_view_module": "@jupyter-widgets/base",
237
+ "_view_module_version": "1.2.0",
238
+ "_view_name": "LayoutView",
239
+ "align_content": null,
240
+ "align_items": null,
241
+ "align_self": null,
242
+ "border": null,
243
+ "bottom": null,
244
+ "display": null,
245
+ "flex": null,
246
+ "flex_flow": null,
247
+ "grid_area": null,
248
+ "grid_auto_columns": null,
249
+ "grid_auto_flow": null,
250
+ "grid_auto_rows": null,
251
+ "grid_column": null,
252
+ "grid_gap": null,
253
+ "grid_row": null,
254
+ "grid_template_areas": null,
255
+ "grid_template_columns": null,
256
+ "grid_template_rows": null,
257
+ "height": null,
258
+ "justify_content": null,
259
+ "justify_items": null,
260
+ "left": null,
261
+ "margin": null,
262
+ "max_height": null,
263
+ "max_width": null,
264
+ "min_height": null,
265
+ "min_width": null,
266
+ "object_fit": null,
267
+ "object_position": null,
268
+ "order": null,
269
+ "overflow": null,
270
+ "overflow_x": null,
271
+ "overflow_y": null,
272
+ "padding": null,
273
+ "right": null,
274
+ "top": null,
275
+ "visibility": null,
276
+ "width": null
277
+ }
278
+ },
279
+ "34be83ddb4bf43e58cadbcbac5a606b7": {
280
+ "model_module": "@jupyter-widgets/controls",
281
+ "model_name": "ProgressStyleModel",
282
+ "model_module_version": "1.5.0",
283
+ "state": {
284
+ "_model_module": "@jupyter-widgets/controls",
285
+ "_model_module_version": "1.5.0",
286
+ "_model_name": "ProgressStyleModel",
287
+ "_view_count": null,
288
+ "_view_module": "@jupyter-widgets/base",
289
+ "_view_module_version": "1.2.0",
290
+ "_view_name": "StyleView",
291
+ "bar_color": null,
292
+ "description_width": ""
293
+ }
294
+ },
295
+ "0ce7bd7e52074f29b446ef2d4dd0921a": {
296
+ "model_module": "@jupyter-widgets/base",
297
+ "model_name": "LayoutModel",
298
+ "model_module_version": "1.2.0",
299
+ "state": {
300
+ "_model_module": "@jupyter-widgets/base",
301
+ "_model_module_version": "1.2.0",
302
+ "_model_name": "LayoutModel",
303
+ "_view_count": null,
304
+ "_view_module": "@jupyter-widgets/base",
305
+ "_view_module_version": "1.2.0",
306
+ "_view_name": "LayoutView",
307
+ "align_content": null,
308
+ "align_items": null,
309
+ "align_self": null,
310
+ "border": null,
311
+ "bottom": null,
312
+ "display": null,
313
+ "flex": null,
314
+ "flex_flow": null,
315
+ "grid_area": null,
316
+ "grid_auto_columns": null,
317
+ "grid_auto_flow": null,
318
+ "grid_auto_rows": null,
319
+ "grid_column": null,
320
+ "grid_gap": null,
321
+ "grid_row": null,
322
+ "grid_template_areas": null,
323
+ "grid_template_columns": null,
324
+ "grid_template_rows": null,
325
+ "height": null,
326
+ "justify_content": null,
327
+ "justify_items": null,
328
+ "left": null,
329
+ "margin": null,
330
+ "max_height": null,
331
+ "max_width": null,
332
+ "min_height": null,
333
+ "min_width": null,
334
+ "object_fit": null,
335
+ "object_position": null,
336
+ "order": null,
337
+ "overflow": null,
338
+ "overflow_x": null,
339
+ "overflow_y": null,
340
+ "padding": null,
341
+ "right": null,
342
+ "top": null,
343
+ "visibility": null,
344
+ "width": null
345
+ }
346
+ },
347
+ "7e2178d696c04d5787e736ace9ab57c0": {
348
+ "model_module": "@jupyter-widgets/controls",
349
+ "model_name": "DescriptionStyleModel",
350
+ "model_module_version": "1.5.0",
351
+ "state": {
352
+ "_model_module": "@jupyter-widgets/controls",
353
+ "_model_module_version": "1.5.0",
354
+ "_model_name": "DescriptionStyleModel",
355
+ "_view_count": null,
356
+ "_view_module": "@jupyter-widgets/base",
357
+ "_view_module_version": "1.2.0",
358
+ "_view_name": "StyleView",
359
+ "description_width": ""
360
+ }
361
+ },
362
+ "3ff80bc2f64948408757caa8715d0603": {
363
+ "model_module": "@jupyter-widgets/controls",
364
+ "model_name": "HBoxModel",
365
+ "model_module_version": "1.5.0",
366
+ "state": {
367
+ "_dom_classes": [],
368
+ "_model_module": "@jupyter-widgets/controls",
369
+ "_model_module_version": "1.5.0",
370
+ "_model_name": "HBoxModel",
371
+ "_view_count": null,
372
+ "_view_module": "@jupyter-widgets/controls",
373
+ "_view_module_version": "1.5.0",
374
+ "_view_name": "HBoxView",
375
+ "box_style": "",
376
+ "children": [
377
+ "IPY_MODEL_12aa8675bca54f05a6deb7ec7a5def7a",
378
+ "IPY_MODEL_31a74feac76f4744a0f34fbc99433831",
379
+ "IPY_MODEL_bd51d97e739a4e78ad28083043f638d8"
380
+ ],
381
+ "layout": "IPY_MODEL_062d36b5d0c043a597eb9b3ebd35f313"
382
+ }
383
+ },
384
+ "12aa8675bca54f05a6deb7ec7a5def7a": {
385
+ "model_module": "@jupyter-widgets/controls",
386
+ "model_name": "HTMLModel",
387
+ "model_module_version": "1.5.0",
388
+ "state": {
389
+ "_dom_classes": [],
390
+ "_model_module": "@jupyter-widgets/controls",
391
+ "_model_module_version": "1.5.0",
392
+ "_model_name": "HTMLModel",
393
+ "_view_count": null,
394
+ "_view_module": "@jupyter-widgets/controls",
395
+ "_view_module_version": "1.5.0",
396
+ "_view_name": "HTMLView",
397
+ "description": "",
398
+ "description_tooltip": null,
399
+ "layout": "IPY_MODEL_2c2223a6ae3e4ff6be96a5f4e2d2d9b6",
400
+ "placeholder": "​",
401
+ "style": "IPY_MODEL_f2c7be27f90b49a3abe51b5e3003c17d",
402
+ "value": "config.json: 100%"
403
+ }
404
+ },
405
+ "31a74feac76f4744a0f34fbc99433831": {
406
+ "model_module": "@jupyter-widgets/controls",
407
+ "model_name": "FloatProgressModel",
408
+ "model_module_version": "1.5.0",
409
+ "state": {
410
+ "_dom_classes": [],
411
+ "_model_module": "@jupyter-widgets/controls",
412
+ "_model_module_version": "1.5.0",
413
+ "_model_name": "FloatProgressModel",
414
+ "_view_count": null,
415
+ "_view_module": "@jupyter-widgets/controls",
416
+ "_view_module_version": "1.5.0",
417
+ "_view_name": "ProgressView",
418
+ "bar_style": "success",
419
+ "description": "",
420
+ "description_tooltip": null,
421
+ "layout": "IPY_MODEL_76d1f15c857640c3b06d98aef478f234",
422
+ "max": 744,
423
+ "min": 0,
424
+ "orientation": "horizontal",
425
+ "style": "IPY_MODEL_d43089f8240c44339c6881355ff0aee3",
426
+ "value": 744
427
+ }
428
+ },
429
+ "bd51d97e739a4e78ad28083043f638d8": {
430
+ "model_module": "@jupyter-widgets/controls",
431
+ "model_name": "HTMLModel",
432
+ "model_module_version": "1.5.0",
433
+ "state": {
434
+ "_dom_classes": [],
435
+ "_model_module": "@jupyter-widgets/controls",
436
+ "_model_module_version": "1.5.0",
437
+ "_model_name": "HTMLModel",
438
+ "_view_count": null,
439
+ "_view_module": "@jupyter-widgets/controls",
440
+ "_view_module_version": "1.5.0",
441
+ "_view_name": "HTMLView",
442
+ "description": "",
443
+ "description_tooltip": null,
444
+ "layout": "IPY_MODEL_a139b85557a942b9b5d32b9d7def3e50",
445
+ "placeholder": "​",
446
+ "style": "IPY_MODEL_92043bfce97e4629bf9e4b268aa88c11",
447
+ "value": " 744/744 [00:00<00:00, 93.7kB/s]"
448
+ }
449
+ },
450
+ "062d36b5d0c043a597eb9b3ebd35f313": {
451
+ "model_module": "@jupyter-widgets/base",
452
+ "model_name": "LayoutModel",
453
+ "model_module_version": "1.2.0",
454
+ "state": {
455
+ "_model_module": "@jupyter-widgets/base",
456
+ "_model_module_version": "1.2.0",
457
+ "_model_name": "LayoutModel",
458
+ "_view_count": null,
459
+ "_view_module": "@jupyter-widgets/base",
460
+ "_view_module_version": "1.2.0",
461
+ "_view_name": "LayoutView",
462
+ "align_content": null,
463
+ "align_items": null,
464
+ "align_self": null,
465
+ "border": null,
466
+ "bottom": null,
467
+ "display": null,
468
+ "flex": null,
469
+ "flex_flow": null,
470
+ "grid_area": null,
471
+ "grid_auto_columns": null,
472
+ "grid_auto_flow": null,
473
+ "grid_auto_rows": null,
474
+ "grid_column": null,
475
+ "grid_gap": null,
476
+ "grid_row": null,
477
+ "grid_template_areas": null,
478
+ "grid_template_columns": null,
479
+ "grid_template_rows": null,
480
+ "height": null,
481
+ "justify_content": null,
482
+ "justify_items": null,
483
+ "left": null,
484
+ "margin": null,
485
+ "max_height": null,
486
+ "max_width": null,
487
+ "min_height": null,
488
+ "min_width": null,
489
+ "object_fit": null,
490
+ "object_position": null,
491
+ "order": null,
492
+ "overflow": null,
493
+ "overflow_x": null,
494
+ "overflow_y": null,
495
+ "padding": null,
496
+ "right": null,
497
+ "top": null,
498
+ "visibility": null,
499
+ "width": null
500
+ }
501
+ },
502
+ "2c2223a6ae3e4ff6be96a5f4e2d2d9b6": {
503
+ "model_module": "@jupyter-widgets/base",
504
+ "model_name": "LayoutModel",
505
+ "model_module_version": "1.2.0",
506
+ "state": {
507
+ "_model_module": "@jupyter-widgets/base",
508
+ "_model_module_version": "1.2.0",
509
+ "_model_name": "LayoutModel",
510
+ "_view_count": null,
511
+ "_view_module": "@jupyter-widgets/base",
512
+ "_view_module_version": "1.2.0",
513
+ "_view_name": "LayoutView",
514
+ "align_content": null,
515
+ "align_items": null,
516
+ "align_self": null,
517
+ "border": null,
518
+ "bottom": null,
519
+ "display": null,
520
+ "flex": null,
521
+ "flex_flow": null,
522
+ "grid_area": null,
523
+ "grid_auto_columns": null,
524
+ "grid_auto_flow": null,
525
+ "grid_auto_rows": null,
526
+ "grid_column": null,
527
+ "grid_gap": null,
528
+ "grid_row": null,
529
+ "grid_template_areas": null,
530
+ "grid_template_columns": null,
531
+ "grid_template_rows": null,
532
+ "height": null,
533
+ "justify_content": null,
534
+ "justify_items": null,
535
+ "left": null,
536
+ "margin": null,
537
+ "max_height": null,
538
+ "max_width": null,
539
+ "min_height": null,
540
+ "min_width": null,
541
+ "object_fit": null,
542
+ "object_position": null,
543
+ "order": null,
544
+ "overflow": null,
545
+ "overflow_x": null,
546
+ "overflow_y": null,
547
+ "padding": null,
548
+ "right": null,
549
+ "top": null,
550
+ "visibility": null,
551
+ "width": null
552
+ }
553
+ },
554
+ "f2c7be27f90b49a3abe51b5e3003c17d": {
555
+ "model_module": "@jupyter-widgets/controls",
556
+ "model_name": "DescriptionStyleModel",
557
+ "model_module_version": "1.5.0",
558
+ "state": {
559
+ "_model_module": "@jupyter-widgets/controls",
560
+ "_model_module_version": "1.5.0",
561
+ "_model_name": "DescriptionStyleModel",
562
+ "_view_count": null,
563
+ "_view_module": "@jupyter-widgets/base",
564
+ "_view_module_version": "1.2.0",
565
+ "_view_name": "StyleView",
566
+ "description_width": ""
567
+ }
568
+ },
569
+ "76d1f15c857640c3b06d98aef478f234": {
570
+ "model_module": "@jupyter-widgets/base",
571
+ "model_name": "LayoutModel",
572
+ "model_module_version": "1.2.0",
573
+ "state": {
574
+ "_model_module": "@jupyter-widgets/base",
575
+ "_model_module_version": "1.2.0",
576
+ "_model_name": "LayoutModel",
577
+ "_view_count": null,
578
+ "_view_module": "@jupyter-widgets/base",
579
+ "_view_module_version": "1.2.0",
580
+ "_view_name": "LayoutView",
581
+ "align_content": null,
582
+ "align_items": null,
583
+ "align_self": null,
584
+ "border": null,
585
+ "bottom": null,
586
+ "display": null,
587
+ "flex": null,
588
+ "flex_flow": null,
589
+ "grid_area": null,
590
+ "grid_auto_columns": null,
591
+ "grid_auto_flow": null,
592
+ "grid_auto_rows": null,
593
+ "grid_column": null,
594
+ "grid_gap": null,
595
+ "grid_row": null,
596
+ "grid_template_areas": null,
597
+ "grid_template_columns": null,
598
+ "grid_template_rows": null,
599
+ "height": null,
600
+ "justify_content": null,
601
+ "justify_items": null,
602
+ "left": null,
603
+ "margin": null,
604
+ "max_height": null,
605
+ "max_width": null,
606
+ "min_height": null,
607
+ "min_width": null,
608
+ "object_fit": null,
609
+ "object_position": null,
610
+ "order": null,
611
+ "overflow": null,
612
+ "overflow_x": null,
613
+ "overflow_y": null,
614
+ "padding": null,
615
+ "right": null,
616
+ "top": null,
617
+ "visibility": null,
618
+ "width": null
619
+ }
620
+ },
621
+ "d43089f8240c44339c6881355ff0aee3": {
622
+ "model_module": "@jupyter-widgets/controls",
623
+ "model_name": "ProgressStyleModel",
624
+ "model_module_version": "1.5.0",
625
+ "state": {
626
+ "_model_module": "@jupyter-widgets/controls",
627
+ "_model_module_version": "1.5.0",
628
+ "_model_name": "ProgressStyleModel",
629
+ "_view_count": null,
630
+ "_view_module": "@jupyter-widgets/base",
631
+ "_view_module_version": "1.2.0",
632
+ "_view_name": "StyleView",
633
+ "bar_color": null,
634
+ "description_width": ""
635
+ }
636
+ },
637
+ "a139b85557a942b9b5d32b9d7def3e50": {
638
+ "model_module": "@jupyter-widgets/base",
639
+ "model_name": "LayoutModel",
640
+ "model_module_version": "1.2.0",
641
+ "state": {
642
+ "_model_module": "@jupyter-widgets/base",
643
+ "_model_module_version": "1.2.0",
644
+ "_model_name": "LayoutModel",
645
+ "_view_count": null,
646
+ "_view_module": "@jupyter-widgets/base",
647
+ "_view_module_version": "1.2.0",
648
+ "_view_name": "LayoutView",
649
+ "align_content": null,
650
+ "align_items": null,
651
+ "align_self": null,
652
+ "border": null,
653
+ "bottom": null,
654
+ "display": null,
655
+ "flex": null,
656
+ "flex_flow": null,
657
+ "grid_area": null,
658
+ "grid_auto_columns": null,
659
+ "grid_auto_flow": null,
660
+ "grid_auto_rows": null,
661
+ "grid_column": null,
662
+ "grid_gap": null,
663
+ "grid_row": null,
664
+ "grid_template_areas": null,
665
+ "grid_template_columns": null,
666
+ "grid_template_rows": null,
667
+ "height": null,
668
+ "justify_content": null,
669
+ "justify_items": null,
670
+ "left": null,
671
+ "margin": null,
672
+ "max_height": null,
673
+ "max_width": null,
674
+ "min_height": null,
675
+ "min_width": null,
676
+ "object_fit": null,
677
+ "object_position": null,
678
+ "order": null,
679
+ "overflow": null,
680
+ "overflow_x": null,
681
+ "overflow_y": null,
682
+ "padding": null,
683
+ "right": null,
684
+ "top": null,
685
+ "visibility": null,
686
+ "width": null
687
+ }
688
+ },
689
+ "92043bfce97e4629bf9e4b268aa88c11": {
690
+ "model_module": "@jupyter-widgets/controls",
691
+ "model_name": "DescriptionStyleModel",
692
+ "model_module_version": "1.5.0",
693
+ "state": {
694
+ "_model_module": "@jupyter-widgets/controls",
695
+ "_model_module_version": "1.5.0",
696
+ "_model_name": "DescriptionStyleModel",
697
+ "_view_count": null,
698
+ "_view_module": "@jupyter-widgets/base",
699
+ "_view_module_version": "1.2.0",
700
+ "_view_name": "StyleView",
701
+ "description_width": ""
702
+ }
703
+ },
704
+ "f20b3989658642528f4ed91666320097": {
705
+ "model_module": "@jupyter-widgets/controls",
706
+ "model_name": "HBoxModel",
707
+ "model_module_version": "1.5.0",
708
+ "state": {
709
+ "_dom_classes": [],
710
+ "_model_module": "@jupyter-widgets/controls",
711
+ "_model_module_version": "1.5.0",
712
+ "_model_name": "HBoxModel",
713
+ "_view_count": null,
714
+ "_view_module": "@jupyter-widgets/controls",
715
+ "_view_module_version": "1.5.0",
716
+ "_view_name": "HBoxView",
717
+ "box_style": "",
718
+ "children": [
719
+ "IPY_MODEL_3ee9921a635d44ec9b248e2155b5b243",
720
+ "IPY_MODEL_caf0790dbf2544378cb04aa8eb3098c3",
721
+ "IPY_MODEL_3ff0fc5ce62a44b9950dd8575d90bd21"
722
+ ],
723
+ "layout": "IPY_MODEL_77cdafc6dae44107a43a46ae19ed390a"
724
+ }
725
+ },
726
+ "3ee9921a635d44ec9b248e2155b5b243": {
727
+ "model_module": "@jupyter-widgets/controls",
728
+ "model_name": "HTMLModel",
729
+ "model_module_version": "1.5.0",
730
+ "state": {
731
+ "_dom_classes": [],
732
+ "_model_module": "@jupyter-widgets/controls",
733
+ "_model_module_version": "1.5.0",
734
+ "_model_name": "HTMLModel",
735
+ "_view_count": null,
736
+ "_view_module": "@jupyter-widgets/controls",
737
+ "_view_module_version": "1.5.0",
738
+ "_view_name": "HTMLView",
739
+ "description": "",
740
+ "description_tooltip": null,
741
+ "layout": "IPY_MODEL_65d8b73e3bdd46fca8a42b67739e27f9",
742
+ "placeholder": "​",
743
+ "style": "IPY_MODEL_b566321171044b0eb02ea3bd8c0472df",
744
+ "value": "model.safetensors: 100%"
745
+ }
746
+ },
747
+ "caf0790dbf2544378cb04aa8eb3098c3": {
748
+ "model_module": "@jupyter-widgets/controls",
749
+ "model_name": "FloatProgressModel",
750
+ "model_module_version": "1.5.0",
751
+ "state": {
752
+ "_dom_classes": [],
753
+ "_model_module": "@jupyter-widgets/controls",
754
+ "_model_module_version": "1.5.0",
755
+ "_model_name": "FloatProgressModel",
756
+ "_view_count": null,
757
+ "_view_module": "@jupyter-widgets/controls",
758
+ "_view_module_version": "1.5.0",
759
+ "_view_name": "ProgressView",
760
+ "bar_style": "success",
761
+ "description": "",
762
+ "description_tooltip": null,
763
+ "layout": "IPY_MODEL_62535e046f794a28b4002c3f34fe7ff7",
764
+ "max": 3362432800,
765
+ "min": 0,
766
+ "orientation": "horizontal",
767
+ "style": "IPY_MODEL_663aa65fdb4e4349b2815b6bafce4dcd",
768
+ "value": 3362432800
769
+ }
770
+ },
771
+ "3ff0fc5ce62a44b9950dd8575d90bd21": {
772
+ "model_module": "@jupyter-widgets/controls",
773
+ "model_name": "HTMLModel",
774
+ "model_module_version": "1.5.0",
775
+ "state": {
776
+ "_dom_classes": [],
777
+ "_model_module": "@jupyter-widgets/controls",
778
+ "_model_module_version": "1.5.0",
779
+ "_model_name": "HTMLModel",
780
+ "_view_count": null,
781
+ "_view_module": "@jupyter-widgets/controls",
782
+ "_view_module_version": "1.5.0",
783
+ "_view_name": "HTMLView",
784
+ "description": "",
785
+ "description_tooltip": null,
786
+ "layout": "IPY_MODEL_8410c9d15bca4c9f8b3aab2b7d327211",
787
+ "placeholder": "​",
788
+ "style": "IPY_MODEL_fb359d0651a74fe790aaace9a5d0e329",
789
+ "value": " 3.36G/3.36G [00:18<00:00, 296MB/s]"
790
+ }
791
+ },
792
+ "77cdafc6dae44107a43a46ae19ed390a": {
793
+ "model_module": "@jupyter-widgets/base",
794
+ "model_name": "LayoutModel",
795
+ "model_module_version": "1.2.0",
796
+ "state": {
797
+ "_model_module": "@jupyter-widgets/base",
798
+ "_model_module_version": "1.2.0",
799
+ "_model_name": "LayoutModel",
800
+ "_view_count": null,
801
+ "_view_module": "@jupyter-widgets/base",
802
+ "_view_module_version": "1.2.0",
803
+ "_view_name": "LayoutView",
804
+ "align_content": null,
805
+ "align_items": null,
806
+ "align_self": null,
807
+ "border": null,
808
+ "bottom": null,
809
+ "display": null,
810
+ "flex": null,
811
+ "flex_flow": null,
812
+ "grid_area": null,
813
+ "grid_auto_columns": null,
814
+ "grid_auto_flow": null,
815
+ "grid_auto_rows": null,
816
+ "grid_column": null,
817
+ "grid_gap": null,
818
+ "grid_row": null,
819
+ "grid_template_areas": null,
820
+ "grid_template_columns": null,
821
+ "grid_template_rows": null,
822
+ "height": null,
823
+ "justify_content": null,
824
+ "justify_items": null,
825
+ "left": null,
826
+ "margin": null,
827
+ "max_height": null,
828
+ "max_width": null,
829
+ "min_height": null,
830
+ "min_width": null,
831
+ "object_fit": null,
832
+ "object_position": null,
833
+ "order": null,
834
+ "overflow": null,
835
+ "overflow_x": null,
836
+ "overflow_y": null,
837
+ "padding": null,
838
+ "right": null,
839
+ "top": null,
840
+ "visibility": null,
841
+ "width": null
842
+ }
843
+ },
844
+ "65d8b73e3bdd46fca8a42b67739e27f9": {
845
+ "model_module": "@jupyter-widgets/base",
846
+ "model_name": "LayoutModel",
847
+ "model_module_version": "1.2.0",
848
+ "state": {
849
+ "_model_module": "@jupyter-widgets/base",
850
+ "_model_module_version": "1.2.0",
851
+ "_model_name": "LayoutModel",
852
+ "_view_count": null,
853
+ "_view_module": "@jupyter-widgets/base",
854
+ "_view_module_version": "1.2.0",
855
+ "_view_name": "LayoutView",
856
+ "align_content": null,
857
+ "align_items": null,
858
+ "align_self": null,
859
+ "border": null,
860
+ "bottom": null,
861
+ "display": null,
862
+ "flex": null,
863
+ "flex_flow": null,
864
+ "grid_area": null,
865
+ "grid_auto_columns": null,
866
+ "grid_auto_flow": null,
867
+ "grid_auto_rows": null,
868
+ "grid_column": null,
869
+ "grid_gap": null,
870
+ "grid_row": null,
871
+ "grid_template_areas": null,
872
+ "grid_template_columns": null,
873
+ "grid_template_rows": null,
874
+ "height": null,
875
+ "justify_content": null,
876
+ "justify_items": null,
877
+ "left": null,
878
+ "margin": null,
879
+ "max_height": null,
880
+ "max_width": null,
881
+ "min_height": null,
882
+ "min_width": null,
883
+ "object_fit": null,
884
+ "object_position": null,
885
+ "order": null,
886
+ "overflow": null,
887
+ "overflow_x": null,
888
+ "overflow_y": null,
889
+ "padding": null,
890
+ "right": null,
891
+ "top": null,
892
+ "visibility": null,
893
+ "width": null
894
+ }
895
+ },
896
+ "b566321171044b0eb02ea3bd8c0472df": {
897
+ "model_module": "@jupyter-widgets/controls",
898
+ "model_name": "DescriptionStyleModel",
899
+ "model_module_version": "1.5.0",
900
+ "state": {
901
+ "_model_module": "@jupyter-widgets/controls",
902
+ "_model_module_version": "1.5.0",
903
+ "_model_name": "DescriptionStyleModel",
904
+ "_view_count": null,
905
+ "_view_module": "@jupyter-widgets/base",
906
+ "_view_module_version": "1.2.0",
907
+ "_view_name": "StyleView",
908
+ "description_width": ""
909
+ }
910
+ },
911
+ "62535e046f794a28b4002c3f34fe7ff7": {
912
+ "model_module": "@jupyter-widgets/base",
913
+ "model_name": "LayoutModel",
914
+ "model_module_version": "1.2.0",
915
+ "state": {
916
+ "_model_module": "@jupyter-widgets/base",
917
+ "_model_module_version": "1.2.0",
918
+ "_model_name": "LayoutModel",
919
+ "_view_count": null,
920
+ "_view_module": "@jupyter-widgets/base",
921
+ "_view_module_version": "1.2.0",
922
+ "_view_name": "LayoutView",
923
+ "align_content": null,
924
+ "align_items": null,
925
+ "align_self": null,
926
+ "border": null,
927
+ "bottom": null,
928
+ "display": null,
929
+ "flex": null,
930
+ "flex_flow": null,
931
+ "grid_area": null,
932
+ "grid_auto_columns": null,
933
+ "grid_auto_flow": null,
934
+ "grid_auto_rows": null,
935
+ "grid_column": null,
936
+ "grid_gap": null,
937
+ "grid_row": null,
938
+ "grid_template_areas": null,
939
+ "grid_template_columns": null,
940
+ "grid_template_rows": null,
941
+ "height": null,
942
+ "justify_content": null,
943
+ "justify_items": null,
944
+ "left": null,
945
+ "margin": null,
946
+ "max_height": null,
947
+ "max_width": null,
948
+ "min_height": null,
949
+ "min_width": null,
950
+ "object_fit": null,
951
+ "object_position": null,
952
+ "order": null,
953
+ "overflow": null,
954
+ "overflow_x": null,
955
+ "overflow_y": null,
956
+ "padding": null,
957
+ "right": null,
958
+ "top": null,
959
+ "visibility": null,
960
+ "width": null
961
+ }
962
+ },
963
+ "663aa65fdb4e4349b2815b6bafce4dcd": {
964
+ "model_module": "@jupyter-widgets/controls",
965
+ "model_name": "ProgressStyleModel",
966
+ "model_module_version": "1.5.0",
967
+ "state": {
968
+ "_model_module": "@jupyter-widgets/controls",
969
+ "_model_module_version": "1.5.0",
970
+ "_model_name": "ProgressStyleModel",
971
+ "_view_count": null,
972
+ "_view_module": "@jupyter-widgets/base",
973
+ "_view_module_version": "1.2.0",
974
+ "_view_name": "StyleView",
975
+ "bar_color": null,
976
+ "description_width": ""
977
+ }
978
+ },
979
+ "8410c9d15bca4c9f8b3aab2b7d327211": {
980
+ "model_module": "@jupyter-widgets/base",
981
+ "model_name": "LayoutModel",
982
+ "model_module_version": "1.2.0",
983
+ "state": {
984
+ "_model_module": "@jupyter-widgets/base",
985
+ "_model_module_version": "1.2.0",
986
+ "_model_name": "LayoutModel",
987
+ "_view_count": null,
988
+ "_view_module": "@jupyter-widgets/base",
989
+ "_view_module_version": "1.2.0",
990
+ "_view_name": "LayoutView",
991
+ "align_content": null,
992
+ "align_items": null,
993
+ "align_self": null,
994
+ "border": null,
995
+ "bottom": null,
996
+ "display": null,
997
+ "flex": null,
998
+ "flex_flow": null,
999
+ "grid_area": null,
1000
+ "grid_auto_columns": null,
1001
+ "grid_auto_flow": null,
1002
+ "grid_auto_rows": null,
1003
+ "grid_column": null,
1004
+ "grid_gap": null,
1005
+ "grid_row": null,
1006
+ "grid_template_areas": null,
1007
+ "grid_template_columns": null,
1008
+ "grid_template_rows": null,
1009
+ "height": null,
1010
+ "justify_content": null,
1011
+ "justify_items": null,
1012
+ "left": null,
1013
+ "margin": null,
1014
+ "max_height": null,
1015
+ "max_width": null,
1016
+ "min_height": null,
1017
+ "min_width": null,
1018
+ "object_fit": null,
1019
+ "object_position": null,
1020
+ "order": null,
1021
+ "overflow": null,
1022
+ "overflow_x": null,
1023
+ "overflow_y": null,
1024
+ "padding": null,
1025
+ "right": null,
1026
+ "top": null,
1027
+ "visibility": null,
1028
+ "width": null
1029
+ }
1030
+ },
1031
+ "fb359d0651a74fe790aaace9a5d0e329": {
1032
+ "model_module": "@jupyter-widgets/controls",
1033
+ "model_name": "DescriptionStyleModel",
1034
+ "model_module_version": "1.5.0",
1035
+ "state": {
1036
+ "_model_module": "@jupyter-widgets/controls",
1037
+ "_model_module_version": "1.5.0",
1038
+ "_model_name": "DescriptionStyleModel",
1039
+ "_view_count": null,
1040
+ "_view_module": "@jupyter-widgets/base",
1041
+ "_view_module_version": "1.2.0",
1042
+ "_view_name": "StyleView",
1043
+ "description_width": ""
1044
+ }
1045
+ }
1046
+ }
1047
+ }
1048
+ },
1049
+ "cells": [
1050
+ {
1051
+ "cell_type": "markdown",
1052
+ "source": [
1053
+ "## DINOv3 Fine-tuning for Image Classification"
1054
+ ],
1055
+ "metadata": {
1056
+ "id": "BCTUDjwiYn6T"
1057
+ }
1058
+ },
1059
+ {
1060
+ "cell_type": "code",
1061
+ "source": [
1062
+ "!pip install -q trackio git+https://github.com/huggingface/transformers.git"
1063
+ ],
1064
+ "metadata": {
1065
+ "colab": {
1066
+ "base_uri": "https://localhost:8080/"
1067
+ },
1068
+ "id": "Aa1zPoxo_JBf",
1069
+ "outputId": "958700f5-189c-42ec-dfc5-9852e5efe368"
1070
+ },
1071
+ "execution_count": 1,
1072
+ "outputs": [
1073
+ {
1074
+ "output_type": "stream",
1075
+ "name": "stdout",
1076
+ "text": [
1077
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
1078
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
1079
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
1080
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m838.5/838.5 kB\u001b[0m \u001b[31m41.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1081
+ "\u001b[?25h Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
1082
+ ]
1083
+ }
1084
+ ]
1085
+ },
1086
+ {
1087
+ "cell_type": "markdown",
1088
+ "source": [
1089
+ "## Dataset"
1090
+ ],
1091
+ "metadata": {
1092
+ "id": "5AJ3YVCE8S9Y"
1093
+ }
1094
+ },
1095
+ {
1096
+ "cell_type": "markdown",
1097
+ "source": [
1098
+ "We will do a very small run on the Food-101 dataset."
1099
+ ],
1100
+ "metadata": {
1101
+ "id": "s_Aabbb6VBZt"
1102
+ }
1103
+ },
1104
+ {
1105
+ "cell_type": "code",
1106
+ "source": [
1107
+ "from datasets import load_dataset\n",
1108
+ "\n",
1109
+ "ds = load_dataset(\"ethz/food101\")\n",
1110
+ "\n",
1111
+ "train_ds = ds[\"train\"]\n",
1112
+ "train_ds = train_ds.shuffle(seed=42).train_test_split(test_size=0.9, seed=42)[\"train\"]  # fixed seed: same 10% subset on every run\n",
1113
+ "val_ds = ds[\"validation\"].shuffle(seed=42).train_test_split(test_size=0.9, seed=42)[\"train\"]  # keeps val_acc comparable across runs"
1114
+ ],
1115
+ "metadata": {
1116
+ "id": "Cxzbngbq4K31"
1117
+ },
1118
+ "execution_count": 7,
1119
+ "outputs": []
1120
+ },
1121
+ {
1122
+ "cell_type": "code",
1123
+ "source": [
1124
+ "train_ds"
1125
+ ],
1126
+ "metadata": {
1127
+ "colab": {
1128
+ "base_uri": "https://localhost:8080/"
1129
+ },
1130
+ "id": "g1wl86sp8L6C",
1131
+ "outputId": "1b42f43f-df62-4eba-f469-54cabd232cf9"
1132
+ },
1133
+ "execution_count": 8,
1134
+ "outputs": [
1135
+ {
1136
+ "output_type": "execute_result",
1137
+ "data": {
1138
+ "text/plain": [
1139
+ "Dataset({\n",
1140
+ " features: ['image', 'label'],\n",
1141
+ " num_rows: 7575\n",
1142
+ "})"
1143
+ ]
1144
+ },
1145
+ "metadata": {},
1146
+ "execution_count": 8
1147
+ }
1148
+ ]
1149
+ },
1150
+ {
1151
+ "cell_type": "code",
1152
+ "source": [
1153
+ "val_ds"
1154
+ ],
1155
+ "metadata": {
1156
+ "colab": {
1157
+ "base_uri": "https://localhost:8080/"
1158
+ },
1159
+ "id": "Tq5OiKxvVj9k",
1160
+ "outputId": "391489ba-d95f-498a-b4bb-f959e19686b0"
1161
+ },
1162
+ "execution_count": 9,
1163
+ "outputs": [
1164
+ {
1165
+ "output_type": "execute_result",
1166
+ "data": {
1167
+ "text/plain": [
1168
+ "Dataset({\n",
1169
+ " features: ['image', 'label'],\n",
1170
+ " num_rows: 2525\n",
1171
+ "})"
1172
+ ]
1173
+ },
1174
+ "metadata": {},
1175
+ "execution_count": 9
1176
+ }
1177
+ ]
1178
+ },
1179
+ {
1180
+ "cell_type": "code",
1181
+ "source": [
1182
+ "num_classes = train_ds.features[\"label\"].num_classes\n",
1183
+ "id2label = {i: name for i, name in enumerate(train_ds.features[\"label\"].names)}\n",
1184
+ "label2id = {v: k for k, v in id2label.items()}\n",
1185
+ "print(f\"Classes: {num_classes}\")"
1186
+ ],
1187
+ "metadata": {
1188
+ "colab": {
1189
+ "base_uri": "https://localhost:8080/"
1190
+ },
1191
+ "id": "1JcvDPFK8Scd",
1192
+ "outputId": "5c920e23-e96b-4c62-bf3a-7db183c97f48"
1193
+ },
1194
+ "execution_count": 10,
1195
+ "outputs": [
1196
+ {
1197
+ "output_type": "stream",
1198
+ "name": "stdout",
1199
+ "text": [
1200
+ "Classes: 101\n"
1201
+ ]
1202
+ }
1203
+ ]
1204
+ },
1205
+ {
1206
+ "cell_type": "markdown",
1207
+ "source": [
1208
+ "## Load Model\n",
1209
+ "\n",
1210
+ "This checkpoint is a bare backbone without a classification head, so we wrap it in a module that adds a linear head on top of the CLS token."
1211
+ ],
1212
+ "metadata": {
1213
+ "id": "_69A3AmO81c8"
1214
+ }
1215
+ },
1216
+ {
1217
+ "cell_type": "code",
1218
+ "source": [
1219
+ "import torch.nn as nn\n",
1220
+ "import torch\n",
1221
+ "from transformers import AutoImageProcessor, AutoModel, get_cosine_schedule_with_warmup\n",
1222
+ "\n",
1223
+ "MODEL_NAME = \"facebook/dinov3-vith16plus-pretrain-lvd1689m\"\n",
1224
+ "\n",
1225
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
1226
+ "\n",
1227
+ "\n",
1228
+ "image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME)\n",
1229
+ "backbone = AutoModel.from_pretrained(MODEL_NAME)\n",
1230
+ "\n",
1231
+ "hidden_size = getattr(backbone.config, \"hidden_size\", None)\n",
1232
+ "\n",
1233
+ "class DinoV3Linear(nn.Module):\n",
1234
+ " def __init__(self, backbone: AutoModel, hidden_size: int, num_classes: int, freeze_backbone: bool = True):\n",
1235
+ " super().__init__()\n",
1236
+ " self.backbone = backbone\n",
1237
+ " if freeze_backbone:\n",
1238
+ " for p in self.backbone.parameters():\n",
1239
+ " p.requires_grad = False\n",
1240
+ " self.backbone.eval()\n",
1241
+ "\n",
1242
+ " self.head = nn.Linear(hidden_size, num_classes)\n",
1243
+ "\n",
1244
+ " def forward(self, pixel_values):\n",
1245
+ " outputs = self.backbone(pixel_values=pixel_values)\n",
1246
+ " last_hidden = outputs.last_hidden_state\n",
1247
+ " cls = last_hidden[:, 0]\n",
1248
+ " logits = self.head(cls)\n",
1249
+ " return logits\n",
1250
+ "\n",
1251
+ "model = DinoV3Linear(backbone, hidden_size, num_classes, freeze_backbone=True).to(device) # we only train the head"
1252
+ ],
1253
+ "metadata": {
1254
+ "colab": {
1255
+ "base_uri": "https://localhost:8080/",
1256
+ "height": 113,
1257
+ "referenced_widgets": [
1258
+ "32138245d41348928cc5b5834b07cb7e",
1259
+ "df6de04fdb204d348767dd0b2d0e88f7",
1260
+ "63a3800d62dd41d6b4a3f643a8930d95",
1261
+ "49d67bd205184874a5cee04d318d91fe",
1262
+ "f00ace964f96471b9eb839cce48ce378",
1263
+ "3ad0ac8def244930a3aff41d68a88a65",
1264
+ "7464841c193d492685bb929b1c0d230c",
1265
+ "5c16553a2ff34a37a2cb62b4a4c42a6f",
1266
+ "34be83ddb4bf43e58cadbcbac5a606b7",
1267
+ "0ce7bd7e52074f29b446ef2d4dd0921a",
1268
+ "7e2178d696c04d5787e736ace9ab57c0",
1269
+ "3ff80bc2f64948408757caa8715d0603",
1270
+ "12aa8675bca54f05a6deb7ec7a5def7a",
1271
+ "31a74feac76f4744a0f34fbc99433831",
1272
+ "bd51d97e739a4e78ad28083043f638d8",
1273
+ "062d36b5d0c043a597eb9b3ebd35f313",
1274
+ "2c2223a6ae3e4ff6be96a5f4e2d2d9b6",
1275
+ "f2c7be27f90b49a3abe51b5e3003c17d",
1276
+ "76d1f15c857640c3b06d98aef478f234",
1277
+ "d43089f8240c44339c6881355ff0aee3",
1278
+ "a139b85557a942b9b5d32b9d7def3e50",
1279
+ "92043bfce97e4629bf9e4b268aa88c11",
1280
+ "f20b3989658642528f4ed91666320097",
1281
+ "3ee9921a635d44ec9b248e2155b5b243",
1282
+ "caf0790dbf2544378cb04aa8eb3098c3",
1283
+ "3ff0fc5ce62a44b9950dd8575d90bd21",
1284
+ "77cdafc6dae44107a43a46ae19ed390a",
1285
+ "65d8b73e3bdd46fca8a42b67739e27f9",
1286
+ "b566321171044b0eb02ea3bd8c0472df",
1287
+ "62535e046f794a28b4002c3f34fe7ff7",
1288
+ "663aa65fdb4e4349b2815b6bafce4dcd",
1289
+ "8410c9d15bca4c9f8b3aab2b7d327211",
1290
+ "fb359d0651a74fe790aaace9a5d0e329"
1291
+ ]
1292
+ },
1293
+ "id": "_oqXAu_y81H4",
1294
+ "outputId": "7c4a4f6f-2301-4a43-eecb-50f1adb004b9"
1295
+ },
1296
+ "execution_count": 11,
1297
+ "outputs": [
1298
+ {
1299
+ "output_type": "display_data",
1300
+ "data": {
1301
+ "text/plain": [
1302
+ "preprocessor_config.json: 0%| | 0.00/585 [00:00<?, ?B/s]"
1303
+ ],
1304
+ "application/vnd.jupyter.widget-view+json": {
1305
+ "version_major": 2,
1306
+ "version_minor": 0,
1307
+ "model_id": "32138245d41348928cc5b5834b07cb7e"
1308
+ }
1309
+ },
1310
+ "metadata": {}
1311
+ },
1312
+ {
1313
+ "output_type": "display_data",
1314
+ "data": {
1315
+ "text/plain": [
1316
+ "config.json: 0%| | 0.00/744 [00:00<?, ?B/s]"
1317
+ ],
1318
+ "application/vnd.jupyter.widget-view+json": {
1319
+ "version_major": 2,
1320
+ "version_minor": 0,
1321
+ "model_id": "3ff80bc2f64948408757caa8715d0603"
1322
+ }
1323
+ },
1324
+ "metadata": {}
1325
+ },
1326
+ {
1327
+ "output_type": "display_data",
1328
+ "data": {
1329
+ "text/plain": [
1330
+ "model.safetensors: 0%| | 0.00/3.36G [00:00<?, ?B/s]"
1331
+ ],
1332
+ "application/vnd.jupyter.widget-view+json": {
1333
+ "version_major": 2,
1334
+ "version_minor": 0,
1335
+ "model_id": "f20b3989658642528f4ed91666320097"
1336
+ }
1337
+ },
1338
+ "metadata": {}
1339
+ }
1340
+ ]
1341
+ },
1342
+ {
1343
+ "cell_type": "markdown",
1344
+ "source": [
1345
+ "Write a data collator that batches the inputs, then build the dataloaders for training and validation."
1346
+ ],
1347
+ "metadata": {
1348
+ "id": "IfC3TFbw9SlZ"
1349
+ }
1350
+ },
1351
+ {
1352
+ "cell_type": "code",
1353
+ "source": [
1354
+ "from dataclasses import dataclass\n",
1355
+ "from PIL import Image\n",
1356
+ "import numpy as np\n",
1357
+ "import torch\n",
1358
+ "from transformers import AutoImageProcessor\n",
1359
+ "\n",
1360
+ "@dataclass\n",
1361
+ "class Collator:\n",
1362
+ " processor: AutoImageProcessor\n",
1363
+ "\n",
1364
+ "    def __call__(self, batch):\n",
1364
+ "        \"\"\"Turn a list of {'image', 'label'} examples into a dict with 'pixel_values' and 'labels' tensors.\"\"\"\n",
1365
+ "        raw_images = [x[\"image\"] for x in batch]\n",
1366
+ "        labels = torch.tensor([x[\"label\"] for x in batch], dtype=torch.long)\n",
1367
+ "        rgb_images = []\n",
1368
+ "        # some images in the dataset are grayscale; convert every image to 3-channel RGB\n",
1369
+ "        for im in raw_images:\n",
1370
+ "            if not isinstance(im, Image.Image):\n",
1371
+ "                im = Image.fromarray(np.asarray(im))  # never drop an item: images must stay aligned with labels\n",
1372
+ "            rgb_images.append(im.convert(\"RGB\"))\n",
1373
+ "\n",
1374
+ "        inputs = self.processor(images=rgb_images, return_tensors=\"pt\")\n",
1375
+ "        return {\"pixel_values\": inputs[\"pixel_values\"], \"labels\": labels}\n",
1377
+ "\n",
1378
+ "collate_fn = Collator(image_processor)"
1379
+ ],
1380
+ "metadata": {
1381
+ "id": "Wlo3_8qE9SVR"
1382
+ },
1383
+ "execution_count": 12,
1384
+ "outputs": []
1385
+ },
1386
+ {
1387
+ "cell_type": "code",
1388
+ "source": [
1389
+ "from torch.utils.data import DataLoader\n",
1390
+ "import os\n",
1391
+ "\n",
1392
+ "BATCH_SIZE = 8\n",
1393
+ "NUM_WORKERS = min(8, os.cpu_count() or 2)\n",
1394
+ "\n",
1395
+ "train_loader = DataLoader(\n",
1396
+ " train_ds,\n",
1397
+ " batch_size=BATCH_SIZE,\n",
1398
+ " shuffle=True,\n",
1399
+ " num_workers=NUM_WORKERS,\n",
1400
+ " pin_memory=True,\n",
1401
+ " collate_fn=collate_fn,\n",
1402
+ ")\n",
1403
+ "val_loader = DataLoader(\n",
1404
+ " val_ds,\n",
1405
+ " batch_size=BATCH_SIZE,\n",
1406
+ " shuffle=False,\n",
1407
+ " num_workers=NUM_WORKERS,\n",
1408
+ " pin_memory=True,\n",
1409
+ " collate_fn=collate_fn,\n",
1410
+ ")"
1411
+ ],
1412
+ "metadata": {
1413
+ "id": "Nou-Ct_e9zV5"
1414
+ },
1415
+ "execution_count": 13,
1416
+ "outputs": []
1417
+ },
1418
+ {
1419
+ "cell_type": "markdown",
1420
+ "source": [
1421
+ "## Training"
1422
+ ],
1423
+ "metadata": {
1424
+ "id": "RblgS11W-Wuo"
1425
+ }
1426
+ },
1427
+ {
1428
+ "cell_type": "markdown",
1429
+ "source": [
1430
+ "The training configuration (hyperparameters, optimizer, scheduler, and loss) is defined below."
1431
+ ],
1432
+ "metadata": {
1433
+ "id": "25sCxjwG_tPo"
1434
+ }
1435
+ },
1436
+ {
1437
+ "cell_type": "code",
1438
+ "source": [
1439
+ "import math\n",
1440
+ "import random\n",
1441
+ "from typing import List, Dict, Any\n",
1442
+ "\n",
1443
+ "\n",
1444
+ "EPOCHS = 5\n",
1445
+ "LR = 5e-4\n",
1446
+ "WEIGHT_DECAY = 1e-4\n",
1447
+ "WARMUP_RATIO = 0.05\n",
1448
+ "CHECKPOINT_DIR = \"./checkpoints_dinov3_food101\"\n",
1449
+ "EVAL_EVERY_STEPS = 100\n",
1450
+ "\n",
1451
+ "optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LR, weight_decay=WEIGHT_DECAY)\n",
1452
+ "total_steps = EPOCHS * math.ceil(len(train_loader))\n",
1453
+ "warmup_steps = int(WARMUP_RATIO * total_steps)\n",
1454
+ "scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)\n",
1455
+ "criterion = nn.CrossEntropyLoss()\n",
1456
+ "\n",
1457
+ "scaler = torch.amp.GradScaler(\"cuda\", enabled=torch.cuda.is_available())  # torch.cuda.amp.GradScaler(...) is deprecated"
1458
+ ],
1459
+ "metadata": {
1460
+ "colab": {
1461
+ "base_uri": "https://localhost:8080/"
1462
+ },
1463
+ "id": "WWM8KLQD_sya",
1464
+ "outputId": "1672c194-aad2-4af2-a9cf-e61aa0d558b9"
1465
+ },
1466
+ "execution_count": 14,
1467
+ "outputs": [
1468
+ {
1469
+ "output_type": "stream",
1470
+ "name": "stderr",
1471
+ "text": [
1472
+ "/tmp/ipython-input-593493728.py:19: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.\n",
1473
+ " scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())\n"
1474
+ ]
1475
+ }
1476
+ ]
1477
+ },
1478
+ {
1479
+ "cell_type": "code",
1480
+ "source": [
1481
+ "os.makedirs(CHECKPOINT_DIR, exist_ok=True)  # exist_ok: re-running this cell must not raise FileExistsError"
1482
+ ],
1483
+ "metadata": {
1484
+ "id": "OJPRRz09kxFT"
1485
+ },
1486
+ "execution_count": 15,
1487
+ "outputs": []
1488
+ },
1489
+ {
1490
+ "cell_type": "markdown",
1491
+ "source": [
1492
+ "We need to evaluate during training."
1493
+ ],
1494
+ "metadata": {
1495
+ "id": "FHS5DSu1_22g"
1496
+ }
1497
+ },
1498
+ {
1499
+ "cell_type": "code",
1500
+ "source": [
1501
+ "def evaluate() -> Dict[str, float]:\n",
1502
+ " model.eval()\n",
1503
+ " correct, total, loss_sum = 0, 0, 0.0\n",
1504
+ " with torch.no_grad():\n",
1505
+ " for batch in val_loader:\n",
1506
+ " pixel_values = batch[\"pixel_values\"].to(device, non_blocking=True)\n",
1507
+ " labels = batch[\"labels\"].to(device, non_blocking=True)\n",
1508
+ " logits = model(pixel_values)\n",
1509
+ " loss = criterion(logits, labels)\n",
1510
+ " loss_sum += loss.item() * labels.size(0)\n",
1511
+ " preds = logits.argmax(dim=-1)\n",
1512
+ " correct += (preds == labels).sum().item()\n",
1513
+ " total += labels.size(0)\n",
1514
+ " return {\n",
1515
+ " \"val_loss\": loss_sum / max(total, 1),\n",
1516
+ " \"val_acc\": correct / max(total, 1),\n",
1517
+ " }"
1518
+ ],
1519
+ "metadata": {
1520
+ "id": "TSD4tzZr_4i3"
1521
+ },
1522
+ "execution_count": 16,
1523
+ "outputs": []
1524
+ },
1525
+ {
1526
+ "cell_type": "markdown",
1527
+ "source": [
1528
+ "Let's write the training loop. We'll also use trackio for experiment tracking."
1529
+ ],
1530
+ "metadata": {
1531
+ "id": "yakvOUOkAVcR"
1532
+ }
1533
+ },
1534
+ {
1535
+ "cell_type": "code",
1536
+ "execution_count": 17,
1537
+ "metadata": {
1538
+ "colab": {
1539
+ "base_uri": "https://localhost:8080/",
1540
+ "height": 723
1541
+ },
1542
+ "id": "r-WQGd7UyN1s",
1543
+ "outputId": "83e5aa69-4e4d-4c1d-c045-fc8ebad975ff"
1544
+ },
1545
+ "outputs": [
1546
+ {
1547
+ "output_type": "stream",
1548
+ "name": "stdout",
1549
+ "text": [
1550
+ "* Running on public URL: https://3669a91d39321f7f86.gradio.live\n",
1551
+ "* Trackio project initialized: dinov3\n",
1552
+ "* Trackio metrics logged to: /root/.cache/huggingface/trackio\n",
1553
+ "* View dashboard by running in your terminal:\n",
1554
+ "\u001b[1m\u001b[93mtrackio show --project \"dinov3\"\u001b[0m\n",
1555
+ "* or by running in Python: trackio.show(project=\"dinov3\")\n",
1556
+ "[epoch 1 | step 100] train_loss=4.4878 val_loss=4.0990 val_acc=50.77%\n",
1557
+ "[epoch 1 | step 200] train_loss=3.4722 val_loss=2.5605 val_acc=83.72%\n",
1558
+ "[epoch 1 | step 300] train_loss=1.9046 val_loss=1.2049 val_acc=87.09%\n",
1559
+ "[epoch 1 | step 400] train_loss=1.0664 val_loss=0.7385 val_acc=89.78%\n",
1560
+ "[epoch 1 | step 500] train_loss=0.7269 val_loss=0.5500 val_acc=90.30%\n"
1561
+ ]
1562
+ },
1563
+ {
1564
+ "output_type": "stream",
1565
+ "name": "stderr",
1566
+ "text": [
1567
+ "/usr/local/lib/python3.11/dist-packages/PIL/TiffImagePlugin.py:950: UserWarning: Truncated File Read\n",
1568
+ " warnings.warn(str(msg))\n"
1569
+ ]
1570
+ },
1571
+ {
1572
+ "output_type": "stream",
1573
+ "name": "stdout",
1574
+ "text": [
1575
+ "[epoch 1 | step 600] train_loss=0.6400 val_loss=0.4473 val_acc=91.92%\n",
1576
+ "[epoch 1 | step 700] train_loss=0.5444 val_loss=0.3916 val_acc=92.44%\n",
1577
+ "[epoch 1 | step 800] train_loss=0.5084 val_loss=0.3506 val_acc=92.08%\n"
1578
+ ]
1579
+ },
1580
+ {
1581
+ "output_type": "error",
1582
+ "ename": "KeyboardInterrupt",
1583
+ "evalue": "",
1584
+ "traceback": [
1585
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1586
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1587
+ "\u001b[0;32m/tmp/ipython-input-3838929309.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mscaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscale\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0mscaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0mscaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0mscheduler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1588
+ "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/amp/grad_scaler.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, optimizer, *args, **kwargs)\u001b[0m\n\u001b[1;32m 455\u001b[0m ), \"No inf checks were recorded for this optimizer.\"\n\u001b[1;32m 456\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 457\u001b[0;31m \u001b[0mretval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_opt_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer_state\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 458\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0moptimizer_state\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"stage\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOptState\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSTEPPED\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1589
+ "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/amp/grad_scaler.py\u001b[0m in \u001b[0;36m_maybe_opt_step\u001b[0;34m(self, optimizer, optimizer_state, *args, **kwargs)\u001b[0m\n\u001b[1;32m 349\u001b[0m ) -> Optional[float]:\n\u001b[1;32m 350\u001b[0m \u001b[0mretval\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 351\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0moptimizer_state\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"found_inf_per_device\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 352\u001b[0m \u001b[0mretval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mretval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1590
+ "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/amp/grad_scaler.py\u001b[0m in \u001b[0;36m<genexpr>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 349\u001b[0m ) -> Optional[float]:\n\u001b[1;32m 350\u001b[0m \u001b[0mretval\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 351\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0moptimizer_state\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"found_inf_per_device\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 352\u001b[0m \u001b[0mretval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mretval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1591
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
1592
+ ]
1593
+ }
1594
+ ],
1595
+ "source": [
1596
+ "import trackio\n",
1597
+ "\n",
1598
+ "best_acc = 0.0\n",
1599
+ "global_step = 0\n",
1600
+ "\n",
1601
+ "trackio.init(project=\"dinov3\", config={\n",
1602
+ " \"epochs\": EPOCHS,\n",
1603
+ " \"learning_rate\": LR,\n",
1604
+ " \"batch_size\": BATCH_SIZE\n",
1605
+ " })\n",
1606
+ "running_loss = 0.0  # summed over the EVAL_EVERY_STEPS steps between two evaluations\n",
1607
+ "for epoch in range(1, EPOCHS + 1):\n",
1608
+ "    model.train()\n",
1609
+ "    model.backbone.eval() # comment out if you want to train the whole model\n",
1610
+ "\n",
1611
+ "    # running_loss is NOT reset per epoch: evals fire on global_step, so an averaging window can span two epochs\n",
1612
+ " for i, batch in enumerate(train_loader, start=1):\n",
1613
+ " pixel_values = batch[\"pixel_values\"].to(device, non_blocking=True)\n",
1614
+ " labels = batch[\"labels\"].to(device, non_blocking=True)\n",
1615
+ "\n",
1616
+ " optimizer.zero_grad(set_to_none=True)\n",
1617
+ " logits = model(pixel_values)\n",
1618
+ " loss = criterion(logits, labels)\n",
1619
+ "\n",
1620
+ " scaler.scale(loss).backward()\n",
1621
+ " scaler.step(optimizer)\n",
1622
+ " scaler.update()\n",
1623
+ " scheduler.step()\n",
1624
+ "\n",
1625
+ " running_loss += loss.item()\n",
1626
+ " global_step += 1\n",
1627
+ "\n",
1628
+ " if global_step % EVAL_EVERY_STEPS == 0:\n",
1629
+ " metrics = evaluate()\n",
1630
+ " print(\n",
1631
+ " f\"[epoch {epoch} | step {global_step}] \"\n",
1632
+ " f\"train_loss={running_loss / EVAL_EVERY_STEPS:.4f} \"\n",
1633
+ " f\"val_loss={metrics['val_loss']:.4f} val_acc={metrics['val_acc']*100:.2f}%\"\n",
1634
+ " )\n",
1635
+ " running_loss = 0.0\n",
1636
+ "\n",
1637
+ " trackio.log(\n",
1638
+ " {\n",
1639
+ " \"epoch\": epoch,\n",
1640
+ "                    \"val_acc\": metrics[\"val_acc\"],  # log the current accuracy, not the stale best_acc\n",
1641
+ " }\n",
1642
+ " )\n",
1643
+ "\n",
1644
+ " if metrics[\"val_acc\"] > best_acc:\n",
1645
+ " best_acc = metrics[\"val_acc\"]\n",
1646
+ " ckpt_path = os.path.join(CHECKPOINT_DIR, f\"best_acc_{best_acc:.4f}.pt\")\n",
1647
+ " torch.save(\n",
1648
+ " {\n",
1649
+ " \"model_state_dict\": model.state_dict(),\n",
1650
+ " \"optimizer_state_dict\": optimizer.state_dict(),\n",
1651
+ " \"scheduler_state_dict\": scheduler.state_dict(),\n",
1652
+ " \"config\": {\n",
1653
+ " \"model_name\": MODEL_NAME,\n",
1654
+ " \"num_classes\": num_classes,\n",
1655
+ " },\n",
1656
+ " \"step\": global_step,\n",
1657
+ " \"epoch\": epoch,\n",
1658
+ " },\n",
1659
+ " ckpt_path,\n",
1660
+ " )\n",
1661
+ "\n",
1662
+ "\n",
1663
+ " metrics = evaluate()\n",
1664
+ " print(\n",
1665
+ " f\"END EPOCH {epoch}: val_loss={metrics['val_loss']:.4f} val_acc={metrics['val_acc']*100:.2f}% \"\n",
1666
+ " f\"(best_acc={best_acc*100:.2f}%)\"\n",
1667
+ " )\n",
1668
+ "trackio.finish()  # close the run once, after the last epoch (it was indented into the epoch loop)"
1669
+ ]
1670
+ },
1671
+ {
1672
+ "cell_type": "code",
1673
+ "source": [
1674
+ "!trackio show"
1675
+ ],
1676
+ "metadata": {
1677
+ "id": "dX0kEHogATQ_"
1678
+ },
1679
+ "execution_count": null,
1680
+ "outputs": []
1681
+ },
1682
+ {
1683
+ "cell_type": "markdown",
1684
+ "source": [
1685
+ "Let's run inference with the model on a few in-the-wild images."
1686
+ ],
1687
+ "metadata": {
1688
+ "id": "VKpGJ4L7bb2E"
1689
+ }
1690
+ },
1691
+ {
1692
+ "cell_type": "code",
1693
+ "source": [
1694
+ "import torch\n",
1695
+ "from PIL import Image\n",
1696
+ "from typing import List, Dict\n",
1697
+ "\n",
1698
+ "\n",
1699
+ "model.eval()\n",
1700
+ "\n",
1701
+ "images = [\"/content/pizza.jpg\", \"/content/spaghetti.JPG\"]\n",
1702
+ "\n",
1703
+ "pil_images = [Image.open(p).convert(\"RGB\") for p in images]\n",
1704
+ "inputs = image_processor(images=pil_images, return_tensors=\"pt\").to(device)\n",
1705
+ "\n",
1706
+ "with torch.no_grad():\n",
1707
+ " logits = model(inputs[\"pixel_values\"])\n",
1708
+ "\n",
1709
+ "# take top 2 classes\n",
1710
+ "probs = logits.softmax(dim=-1)\n",
1711
+ "scores, indices = probs.topk(2, dim=-1)\n",
1712
+ "\n",
1713
+ "results = []\n",
1714
+ "for path, idxs, scs in zip(images, indices, scores):\n",
1715
+ " preds = [\n",
1716
+ " {\"label_id\": int(i.item()),\n",
1717
+ " \"label\": id2label.get(int(i.item()), f\"class_{int(i)}\"),\n",
1718
+ " \"score\": float(s.item())}\n",
1719
+ " for i, s in zip(idxs, scs)\n",
1720
+ " ]\n",
1721
+ " results.append({\"image\": path, \"topk\": preds})\n"
1722
+ ],
1723
+ "metadata": {
1724
+ "id": "RGZntYQEaVbA"
1725
+ },
1726
+ "execution_count": 19,
1727
+ "outputs": []
1728
+ },
1729
+ {
1730
+ "cell_type": "markdown",
1731
+ "source": [
1732
+ "The model predicts correctly, which is expected: with the strong backbone frozen, training only the head converges very fast. Feel free to try more challenging use cases."
1733
+ ],
1734
+ "metadata": {
1735
+ "id": "bFoB-1Ebcab1"
1736
+ }
1737
+ },
1738
+ {
1739
+ "cell_type": "code",
1740
+ "source": [
1741
+ "results"
1742
+ ],
1743
+ "metadata": {
1744
+ "colab": {
1745
+ "base_uri": "https://localhost:8080/"
1746
+ },
1747
+ "id": "NrgtO2D1cXzj",
1748
+ "outputId": "c972e7d0-ee78-45d3-e91f-7c68521d6a0b"
1749
+ },
1750
+ "execution_count": 20,
1751
+ "outputs": [
1752
+ {
1753
+ "output_type": "execute_result",
1754
+ "data": {
1755
+ "text/plain": [
1756
+ "[{'image': '/content/pizza.jpg',\n",
1757
+ " 'topk': [{'label_id': 76, 'label': 'pizza', 'score': 0.7595003843307495},\n",
1758
+ " {'label_id': 35, 'label': 'escargots', 'score': 0.013227012008428574}]},\n",
1759
+ " {'image': '/content/spaghetti.JPG',\n",
1760
+ " 'topk': [{'label_id': 91,\n",
1761
+ " 'label': 'spaghetti_carbonara',\n",
1762
+ " 'score': 0.6622196435928345},\n",
1763
+ " {'label_id': 90,\n",
1764
+ " 'label': 'spaghetti_bolognese',\n",
1765
+ " 'score': 0.18182380497455597}]}]"
1766
+ ]
1767
+ },
1768
+ "metadata": {},
1769
+ "execution_count": 20
1770
+ }
1771
+ ]
1772
+ }
1773
+ ]
1774
+ }