---
library_name: transformers
license: apache-2.0
base_model: Qwen/Qwen3-0.6B-Base
tags:
- axolotl
- generated_from_trainer
datasets:
- open-thoughts/OpenThoughts2-1M
model-index:
- name: base
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->
					
					
						
[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.10.0.dev0`
```yaml
base_model: Qwen/Qwen3-0.6B-Base
hub_model_id: cyberbabooshka/base
wandb_name: base

tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: false

num_processes: 64
dataset_processes: 64
dataset_prepared_path: last_run_prepared

chat_template: jinja
chat_template_jinja: >-
  {%- if tools %}
      {{- '<|im_start|>system\n' }}
      {%- if messages[0].role == 'system' %}
          {{- messages[0].content + '\n\n' }}
      {%- endif %}
      {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
      {%- for tool in tools %}
          {{- "\n" }}
          {{- tool | tojson }}
      {%- endfor %}
      {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
  {%- else %}
      {%- if messages[0].role == 'system' %}
          {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
      {%- endif %}
  {%- endif %}
  {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
  {%- for message in messages[::-1] %}
      {%- set index = (messages|length - 1) - loop.index0 %}
      {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
          {%- set ns.multi_step_tool = false %}
          {%- set ns.last_query_index = index %}
      {%- endif %}
  {%- endfor %}
  {%- for message in messages %}
      {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
          {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
      {%- elif message.role == "assistant" %}
          {%- set content = message.content %}
          {%- set reasoning_content = '' %}
          {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
              {%- set reasoning_content = message.reasoning_content %}
          {%- else %}
              {%- if '</think>' in message.content %}
                  {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
                  {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
              {%- endif %}
          {%- endif %}
          {%- if loop.index0 > ns.last_query_index %}
              {%- if loop.last or (not loop.last and reasoning_content) %}
                  {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
              {%- else %}
                  {{- '<|im_start|>' + message.role + '\n' + content }}
              {%- endif %}
          {%- else %}
              {{- '<|im_start|>' + message.role + '\n' + content }}
          {%- endif %}
          {%- if message.tool_calls %}
              {%- for tool_call in message.tool_calls %}
                  {%- if (loop.first and content) or (not loop.first) %}
                      {{- '\n' }}
                  {%- endif %}
                  {%- if tool_call.function %}
                      {%- set tool_call = tool_call.function %}
                  {%- endif %}
                  {{- '<tool_call>\n{"name": "' }}
                  {{- tool_call.name }}
                  {{- '", "arguments": ' }}
                  {%- if tool_call.arguments is string %}
                      {{- tool_call.arguments }}
                  {%- else %}
                      {{- tool_call.arguments | tojson }}
                  {%- endif %}
                  {{- '}\n</tool_call>' }}
              {%- endfor %}
          {%- endif %}
          {{- '<|im_end|>\n' }}
      {%- elif message.role == "tool" %}
          {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
              {{- '<|im_start|>user' }}
          {%- endif %}
          {{- '\n<tool_response>\n' }}
          {{- message.content }}
          {{- '\n</tool_response>' }}
          {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
              {{- '<|im_end|>\n' }}
          {%- endif %}
      {%- endif %}
  {%- endfor %}
  {%- if add_generation_prompt %}
      {{- '<|im_start|>assistant\n' }}
      {%- if enable_thinking is defined and enable_thinking is false %}
          {{- '<think>\n\n</think>\n\n' }}
      {%- else %}
          {{- '<think>\n' }}
      {%- endif %}
  {%- endif %}
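
# OpenThoughts2-1M stores each turn as {from, value} pairs in its `conversations`
# field; message_property_mappings renames them to the {role, content} keys the
# chat template expects. The first 1% of `train` is held out for evaluation
# (see test_datasets below).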
datasets:
  - path: open-thoughts/OpenThoughts2-1M
    split: train[1%:]
    type: chat_template
    field_messages: conversations
    train_on_eos: turn
    train_on_eot: turn
    message_property_mappings:
      role: from
      content: value
    roles:
      user:
        - user
      assistant:
        - assistant

test_datasets:
  - path: open-thoughts/OpenThoughts2-1M
    split: train[:1%]
    type: chat_template
    field_messages: conversations
    train_on_eos: turn
    train_on_eot: turn
    message_property_mappings:
      role: from
      content: value
    roles:
      user:
        - user
      assistant:
        - assistant

output_dir: ./outputs

sequence_len: 9096
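# batch_flattening removes padding by concatenating each batch into one packed
# sequence for FlashAttention's varlen kernels (attention does not cross
# example boundaries), as an alternative to sample_packing.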
batch_flattening: true
sample_packing: false
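
# NOTE: `adapter` is commented out, so the lora_* settings below are inert
# and this run performs a full-parameter fine-tune.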
# adapter: lora
lora_model_dir:
lora_r: 64
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
  - embed_tokens
lora_target_linear: true
lora_on_cpu: false

wandb_project: mnlp
wandb_entity: aleksandr-dremov-epfl
wandb_watch:
wandb_log_model:

gradient_accumulation_steps: 2
eval_batch_size: 16
micro_batch_size: 4

optimizer: ademamix_8bit
weight_decay: 0.01

learning_rate: 0.00001
warmup_steps: 500
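
# Warmup-stable-decay (WSD) settings consumed by the plugin listed at the end
# of this config: presumably, hold the post-warmup LR constant, then decay with
# a sqrt shape over the final 20% of steps (wsd_fract_decay) down to
# wsd_final_lr_factor * learning_rate = 0.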
wsd_final_lr_factor: 0.0
wsd_init_div_factor: 100
wsd_fract_decay: 0.2
wsd_decay_type: "sqrt"
wsd_sqrt_power: 0.5
wsd_cooldown_start_lr_factor: 1.0

bf16: auto
tf32: false

torch_compile: true
flash_attention: true
gradient_checkpointing: false

resume_from_checkpoint:
auto_resume_from_checkpoints: true

logging_steps: 16
eval_steps: 2000
save_steps: 1000
max_steps: 40000
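# num_epochs is a sentinel (effectively unlimited); max_steps above is what
# actually ends training.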
num_epochs: 20000000
save_total_limit: 2

special_tokens:
  eos_token: "<|im_end|>"
  pad_token: "<|endoftext|>"

eot_tokens:
  - <|im_end|>

plugins:
  - axolotl_wsd.WSDSchedulerPlugin

```

</details><br>
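
To reproduce the run, save the config above to a file and launch it with the Axolotl CLI (for example, `axolotl train config.yaml`). Note that `axolotl_wsd.WSDSchedulerPlugin` under `plugins:` is a custom plugin that must be importable in the training environment.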
					
					
						
# base

This model is a fine-tuned version of [Qwen/Qwen3-0.6B-Base](https://huggingface.co/Qwen/Qwen3-0.6B-Base) on the open-thoughts/OpenThoughts2-1M dataset.
It achieves the following results on the evaluation set:
- Loss: 0.5060
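
A minimal inference sketch, assuming the final checkpoint was pushed to the `hub_model_id` from the config above (`cyberbabooshka/base`); the `<think>` prefix behavior comes from the custom chat template:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "cyberbabooshka/base"  # hub_model_id from the axolotl config
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "What is 7 * 8?"}]
# add_generation_prompt=True appends '<|im_start|>assistant\n<think>\n';
# per the chat_template_jinja above, passing enable_thinking=False would
# instead emit an empty '<think>\n\n</think>' block.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

output = model.generate(input_ids, max_new_tokens=512)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```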
					
					
						
## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 1e-05
- train_batch_size: 4
- eval_batch_size: 16
- seed: 42
- distributed_type: multi-GPU
- num_devices: 4
- gradient_accumulation_steps: 2
- total_train_batch_size: 32 (see the check after this list)
- total_eval_batch_size: 64
- optimizer: ADEMAMIX_8BIT (no additional optimizer arguments)
- lr_scheduler_type: cosine (likely superseded by the custom WSD scheduler plugin from the config above)
- lr_scheduler_warmup_steps: 500
- training_steps: 40000
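
The total train batch size follows directly from the per-device settings; a quick arithmetic check (variable names are illustrative, values are from this card):

```python
micro_batch_size = 4             # per-device train batch size
gradient_accumulation_steps = 2
num_devices = 4

total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
assert total_train_batch_size == 32  # matches total_train_batch_size above
```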
					
					
						
### Training results

| Training Loss | Epoch  | Step  | Validation Loss |
|:-------------:|:------:|:-----:|:---------------:|
| No log        | 0.0000 | 1     | 0.8524          |
| 0.5816        | 0.0671 | 2000  | 0.6038          |
| 0.554         | 0.1342 | 4000  | 0.5775          |
| 0.5746        | 0.2013 | 6000  | 0.5623          |
| 0.5304        | 0.2684 | 8000  | 0.5516          |
| 0.5334        | 0.3355 | 10000 | 0.5434          |
| 0.5378        | 0.4026 | 12000 | 0.5372          |
| 0.5205        | 0.4697 | 14000 | 0.5322          |
| 0.5301        | 0.5368 | 16000 | 0.5284          |
| 0.4979        | 0.6039 | 18000 | 0.5253          |
| 0.514         | 0.6710 | 20000 | 0.5225          |
| 0.5022        | 0.7381 | 22000 | 0.5202          |
| 0.5183        | 0.8052 | 24000 | 0.5187          |
| 0.4987        | 0.8724 | 26000 | 0.5175          |
| 0.5041        | 0.9395 | 28000 | 0.5161          |
| 0.4961        | 1.0066 | 30000 | 0.5159          |
| 0.4882        | 1.0737 | 32000 | 0.5161          |
| 0.5021        | 1.1408 | 34000 | 0.5117          |
| 0.4793        | 1.2079 | 36000 | 0.5093          |
| 0.4854        | 1.2750 | 38000 | 0.5071          |
| 0.4947        | 1.3421 | 40000 | 0.5060          |


### Framework versions

- Transformers 4.51.3
- Pytorch 2.6.0+cu124
- Datasets 3.5.0
- Tokenizers 0.21.1