Training in progress, step 10

Browse files

Files changed (9) hide show

README.md +68 -0
adapter_config.json +2 -2
adapter_model.safetensors +2 -2
chat_template.jinja +7 -0
preprocessor_config.json +11 -3
runs/Jul09_16-26-55_4137bc6253df/events.out.tfevents.1752078434.4137bc6253df.1221.0 +3 -0
tokenizer_config.json +1 -1
training_args.bin +2 -2
video_preprocessor_config.json +43 -0

README.md ADDED Viewed

	@@ -0,0 +1,68 @@

+---
+base_model: Qwen/Qwen2-VL-2B-Instruct
+library_name: transformers
+model_name: Qwen2-VL-2B-Instruct-Thinking
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+# Model Card for Qwen2-VL-2B-Instruct-Thinking
+This model is a fine-tuned version of [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="sergiopaniego/Qwen2-VL-2B-Instruct-Thinking", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 0.20.0.dev0
+- Transformers: 4.53.0
+- PyTorch: 2.6.0+cu124
+- Datasets: 4.0.0
+- Tokenizers: 0.21.2
+## Citations
+Cite GRPO as:
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

adapter_config.json CHANGED Viewed

@@ -25,8 +25,8 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "v_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76b2792cfc903006489ecd64f6481e5eee149daac9fe3922bb908aad28a06b06
-size 4372840

 version https://git-lfs.github.com/spec/v1
+oid sha256:0921cee8b015b9c091d809f1a7fac984690d48de3281f5b4cd1ac01bd5e67180
+size 4374520

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

preprocessor_config.json CHANGED Viewed

@@ -1,4 +1,10 @@
 {
   "do_convert_rgb": true,
   "do_normalize": true,
   "do_rescale": true,
@@ -8,12 +14,13 @@
     0.4578275,
     0.40821073
   ],
-  "image_processor_type": "Qwen2VLImageProcessor",
   "image_std": [
     0.26862954,
     0.26130258,
     0.27577711
   ],
   "max_pixels": 12845056,
   "merge_size": 2,
   "min_pixels": 3136,
@@ -21,9 +28,10 @@
   "processor_class": "Qwen2VLProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
-    "max_pixels": 12845056,
-    "min_pixels": 3136
   },
   "temporal_patch_size": 2
 }

 {
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
   "do_convert_rgb": true,
   "do_normalize": true,
   "do_rescale": true,
     0.4578275,
     0.40821073
   ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
   "image_std": [
     0.26862954,
     0.26130258,
     0.27577711
   ],
+  "input_data_format": null,
   "max_pixels": 12845056,
   "merge_size": 2,
   "min_pixels": 3136,
   "processor_class": "Qwen2VLProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
   "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
   },
   "temporal_patch_size": 2
 }

runs/Jul09_16-26-55_4137bc6253df/events.out.tfevents.1752078434.4137bc6253df.1221.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcbafdd5495767223a924618b236fda5e68a3a9fba3dae13ae2e50d4340ecfde
+size 10057

tokenizer_config.json CHANGED Viewed

@@ -130,10 +130,10 @@
     "<|video_pad|>"
   ],
   "bos_token": null,
-  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "model_max_length": 32768,
   "pad_token": "<|endoftext|>",
   "padding_side": "left",

     "<|video_pad|>"
   ],
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
+  "extra_special_tokens": {},
   "model_max_length": 32768,
   "pad_token": "<|endoftext|>",
   "padding_side": "left",

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6730ac38775f35d048b780843e3840177304a2e68360e603ba1bd9498f309d7d
-size 6072

 version https://git-lfs.github.com/spec/v1
+oid sha256:464a755bbb220bdbbd1b509b3017e935da00021c77af8eab707e79f40656ad33
+size 6520

video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": false,
+  "fps": null,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_frames": 768,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_frames": 4,
+  "min_pixels": 3136,
+  "num_frames": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "size_divisor": null,
+  "temporal_patch_size": 2,
+  "video_metadata": null,
+  "video_processor_type": "Qwen2VLVideoProcessor"
+}