Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +3 -0
- README.md +58 -0
- added_tokens.json +28 -0
- chat_template.jinja +89 -0
- checkpoint-500/added_tokens.json +28 -0
- checkpoint-500/chat_template.jinja +89 -0
- checkpoint-500/config.json +68 -0
- checkpoint-500/generation_config.json +13 -0
- checkpoint-500/merges.txt +0 -0
- checkpoint-500/model-00001-of-00007.safetensors +3 -0
- checkpoint-500/model-00002-of-00007.safetensors +3 -0
- checkpoint-500/model-00007-of-00007.safetensors +3 -0
- checkpoint-500/model.safetensors.index.json +407 -0
- checkpoint-500/rng_state.pth +3 -0
- checkpoint-500/scheduler.pt +3 -0
- checkpoint-500/special_tokens_map.json +31 -0
- checkpoint-500/tokenizer.json +3 -0
- checkpoint-500/tokenizer_config.json +239 -0
- checkpoint-500/trainer_state.json +484 -0
- checkpoint-500/training_args.bin +3 -0
- checkpoint-500/vocab.json +0 -0
- checkpoint-570/added_tokens.json +28 -0
- checkpoint-570/chat_template.jinja +89 -0
- checkpoint-570/config.json +68 -0
- checkpoint-570/generation_config.json +13 -0
- checkpoint-570/merges.txt +0 -0
- checkpoint-570/model-00001-of-00007.safetensors +3 -0
- checkpoint-570/model-00007-of-00007.safetensors +3 -0
- checkpoint-570/model.safetensors.index.json +407 -0
- checkpoint-570/rng_state.pth +3 -0
- checkpoint-570/scheduler.pt +3 -0
- checkpoint-570/special_tokens_map.json +31 -0
- checkpoint-570/tokenizer.json +3 -0
- checkpoint-570/tokenizer_config.json +239 -0
- checkpoint-570/trainer_state.json +547 -0
- checkpoint-570/training_args.bin +3 -0
- checkpoint-570/vocab.json +0 -0
- config.json +68 -0
- generation_config.json +13 -0
- merges.txt +0 -0
- model-00001-of-00007.safetensors +3 -0
- model-00002-of-00007.safetensors +3 -0
- model-00003-of-00007.safetensors +3 -0
- model-00006-of-00007.safetensors +3 -0
- model-00007-of-00007.safetensors +3 -0
- model.safetensors.index.json +407 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +239 -0
- training_args.bin +3 -0
    	
        .gitattributes
    CHANGED
    
    | @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text | |
| 33 | 
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         | 
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
|  | |
|  | |
|  | 
|  | |
| 33 | 
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         | 
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
| 36 | 
            +
            checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
         | 
| 37 | 
            +
            tokenizer.json filter=lfs diff=lfs merge=lfs -text
         | 
| 38 | 
            +
            checkpoint-570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
         | 
    	
        README.md
    ADDED
    
    | @@ -0,0 +1,58 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            base_model: Qwen/Qwen3-8B
         | 
| 3 | 
            +
            library_name: transformers
         | 
| 4 | 
            +
            model_name: weights_4
         | 
| 5 | 
            +
            tags:
         | 
| 6 | 
            +
            - generated_from_trainer
         | 
| 7 | 
            +
            - sft
         | 
| 8 | 
            +
            - trl
         | 
| 9 | 
            +
            licence: license
         | 
| 10 | 
            +
            ---
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            # Model Card for weights_4
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
         | 
| 15 | 
            +
            It has been trained using [TRL](https://github.com/huggingface/trl).
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            ## Quick start
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            ```python
         | 
| 20 | 
            +
            from transformers import pipeline
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
         | 
| 23 | 
            +
            generator = pipeline("text-generation", model="path/to/weights_4", device="cuda")  # replace the path with this model's Hub repo id or a local checkpoint directory
         | 
| 24 | 
            +
            output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
         | 
| 25 | 
            +
            print(output["generated_text"])
         | 
| 26 | 
            +
            ```
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            ## Training procedure
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            This model was trained with supervised fine-tuning (SFT).
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            ### Framework versions
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            - TRL: 0.19.0
         | 
| 38 | 
            +
            - Transformers: 4.53.0
         | 
| 39 | 
            +
            - PyTorch: 2.7.0
         | 
| 40 | 
            +
            - Datasets: 3.6.0
         | 
| 41 | 
            +
            - Tokenizers: 0.21.2
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            ## Citations
         | 
| 44 | 
            +
             | 
| 45 | 
            +
             | 
| 46 | 
            +
             | 
| 47 | 
            +
            Cite TRL as:
         | 
| 48 | 
            +
                
         | 
| 49 | 
            +
            ```bibtex
         | 
| 50 | 
            +
            @misc{vonwerra2022trl,
         | 
| 51 | 
            +
            	title        = {{TRL: Transformer Reinforcement Learning}},
         | 
| 52 | 
            +
            	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
         | 
| 53 | 
            +
            	year         = 2020,
         | 
| 54 | 
            +
            	journal      = {GitHub repository},
         | 
| 55 | 
            +
            	publisher    = {GitHub},
         | 
| 56 | 
            +
            	howpublished = {\url{https://github.com/huggingface/trl}}
         | 
| 57 | 
            +
            }
         | 
| 58 | 
            +
            ```
         | 
    	
        added_tokens.json
    ADDED
    
    | @@ -0,0 +1,28 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "</think>": 151668,
         | 
| 3 | 
            +
              "</tool_call>": 151658,
         | 
| 4 | 
            +
              "</tool_response>": 151666,
         | 
| 5 | 
            +
              "<think>": 151667,
         | 
| 6 | 
            +
              "<tool_call>": 151657,
         | 
| 7 | 
            +
              "<tool_response>": 151665,
         | 
| 8 | 
            +
              "<|box_end|>": 151649,
         | 
| 9 | 
            +
              "<|box_start|>": 151648,
         | 
| 10 | 
            +
              "<|endoftext|>": 151643,
         | 
| 11 | 
            +
              "<|file_sep|>": 151664,
         | 
| 12 | 
            +
              "<|fim_middle|>": 151660,
         | 
| 13 | 
            +
              "<|fim_pad|>": 151662,
         | 
| 14 | 
            +
              "<|fim_prefix|>": 151659,
         | 
| 15 | 
            +
              "<|fim_suffix|>": 151661,
         | 
| 16 | 
            +
              "<|im_end|>": 151645,
         | 
| 17 | 
            +
              "<|im_start|>": 151644,
         | 
| 18 | 
            +
              "<|image_pad|>": 151655,
         | 
| 19 | 
            +
              "<|object_ref_end|>": 151647,
         | 
| 20 | 
            +
              "<|object_ref_start|>": 151646,
         | 
| 21 | 
            +
              "<|quad_end|>": 151651,
         | 
| 22 | 
            +
              "<|quad_start|>": 151650,
         | 
| 23 | 
            +
              "<|repo_name|>": 151663,
         | 
| 24 | 
            +
              "<|video_pad|>": 151656,
         | 
| 25 | 
            +
              "<|vision_end|>": 151653,
         | 
| 26 | 
            +
              "<|vision_pad|>": 151654,
         | 
| 27 | 
            +
              "<|vision_start|>": 151652
         | 
| 28 | 
            +
            }
         | 
    	
        chat_template.jinja
    ADDED
    
    | @@ -0,0 +1,89 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {%- if tools %}
         | 
| 2 | 
            +
                {{- '<|im_start|>system\n' }}
         | 
| 3 | 
            +
                {%- if messages[0].role == 'system' %}
         | 
| 4 | 
            +
                    {{- messages[0].content + '\n\n' }}
         | 
| 5 | 
            +
                {%- endif %}
         | 
| 6 | 
            +
                {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
         | 
| 7 | 
            +
                {%- for tool in tools %}
         | 
| 8 | 
            +
                    {{- "\n" }}
         | 
| 9 | 
            +
                    {{- tool | tojson }}
         | 
| 10 | 
            +
                {%- endfor %}
         | 
| 11 | 
            +
                {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
         | 
| 12 | 
            +
            {%- else %}
         | 
| 13 | 
            +
                {%- if messages[0].role == 'system' %}
         | 
| 14 | 
            +
                    {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
         | 
| 15 | 
            +
                {%- endif %}
         | 
| 16 | 
            +
            {%- endif %}
         | 
| 17 | 
            +
            {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
         | 
| 18 | 
            +
            {%- for message in messages[::-1] %}
         | 
| 19 | 
            +
                {%- set index = (messages|length - 1) - loop.index0 %}
         | 
| 20 | 
            +
                {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
         | 
| 21 | 
            +
                    {%- set ns.multi_step_tool = false %}
         | 
| 22 | 
            +
                    {%- set ns.last_query_index = index %}
         | 
| 23 | 
            +
                {%- endif %}
         | 
| 24 | 
            +
            {%- endfor %}
         | 
| 25 | 
            +
            {%- for message in messages %}
         | 
| 26 | 
            +
                {%- if message.content is string %}
         | 
| 27 | 
            +
                    {%- set content = message.content %}
         | 
| 28 | 
            +
                {%- else %}
         | 
| 29 | 
            +
                    {%- set content = '' %}
         | 
| 30 | 
            +
                {%- endif %}
         | 
| 31 | 
            +
                {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
         | 
| 32 | 
            +
                    {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
         | 
| 33 | 
            +
                {%- elif message.role == "assistant" %}
         | 
| 34 | 
            +
                    {%- set reasoning_content = '' %}
         | 
| 35 | 
            +
                    {%- if message.reasoning_content is string %}
         | 
| 36 | 
            +
                        {%- set reasoning_content = message.reasoning_content %}
         | 
| 37 | 
            +
                    {%- else %}
         | 
| 38 | 
            +
                        {%- if '</think>' in content %}
         | 
| 39 | 
            +
                            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
         | 
| 40 | 
            +
                            {%- set content = content.split('</think>')[-1].lstrip('\n') %}
         | 
| 41 | 
            +
                        {%- endif %}
         | 
| 42 | 
            +
                    {%- endif %}
         | 
| 43 | 
            +
                    {%- if loop.index0 > ns.last_query_index %}
         | 
| 44 | 
            +
                        {%- if loop.last or (not loop.last and reasoning_content) %}
         | 
| 45 | 
            +
                            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
         | 
| 46 | 
            +
                        {%- else %}
         | 
| 47 | 
            +
                            {{- '<|im_start|>' + message.role + '\n' + content }}
         | 
| 48 | 
            +
                        {%- endif %}
         | 
| 49 | 
            +
                    {%- else %}
         | 
| 50 | 
            +
                        {{- '<|im_start|>' + message.role + '\n' + content }}
         | 
| 51 | 
            +
                    {%- endif %}
         | 
| 52 | 
            +
                    {%- if message.tool_calls %}
         | 
| 53 | 
            +
                        {%- for tool_call in message.tool_calls %}
         | 
| 54 | 
            +
                            {%- if (loop.first and content) or (not loop.first) %}
         | 
| 55 | 
            +
                                {{- '\n' }}
         | 
| 56 | 
            +
                            {%- endif %}
         | 
| 57 | 
            +
                            {%- if tool_call.function %}
         | 
| 58 | 
            +
                                {%- set tool_call = tool_call.function %}
         | 
| 59 | 
            +
                            {%- endif %}
         | 
| 60 | 
            +
                            {{- '<tool_call>\n{"name": "' }}
         | 
| 61 | 
            +
                            {{- tool_call.name }}
         | 
| 62 | 
            +
                            {{- '", "arguments": ' }}
         | 
| 63 | 
            +
                            {%- if tool_call.arguments is string %}
         | 
| 64 | 
            +
                                {{- tool_call.arguments }}
         | 
| 65 | 
            +
                            {%- else %}
         | 
| 66 | 
            +
                                {{- tool_call.arguments | tojson }}
         | 
| 67 | 
            +
                            {%- endif %}
         | 
| 68 | 
            +
                            {{- '}\n</tool_call>' }}
         | 
| 69 | 
            +
                        {%- endfor %}
         | 
| 70 | 
            +
                    {%- endif %}
         | 
| 71 | 
            +
                    {{- '<|im_end|>\n' }}
         | 
| 72 | 
            +
                {%- elif message.role == "tool" %}
         | 
| 73 | 
            +
                    {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
         | 
| 74 | 
            +
                        {{- '<|im_start|>user' }}
         | 
| 75 | 
            +
                    {%- endif %}
         | 
| 76 | 
            +
                    {{- '\n<tool_response>\n' }}
         | 
| 77 | 
            +
                    {{- content }}
         | 
| 78 | 
            +
                    {{- '\n</tool_response>' }}
         | 
| 79 | 
            +
                    {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
         | 
| 80 | 
            +
                        {{- '<|im_end|>\n' }}
         | 
| 81 | 
            +
                    {%- endif %}
         | 
| 82 | 
            +
                {%- endif %}
         | 
| 83 | 
            +
            {%- endfor %}
         | 
| 84 | 
            +
            {%- if add_generation_prompt %}
         | 
| 85 | 
            +
                {{- '<|im_start|>assistant\n' }}
         | 
| 86 | 
            +
                {%- if enable_thinking is defined and enable_thinking is false %}
         | 
| 87 | 
            +
                    {{- '<think>\n\n</think>\n\n' }}
         | 
| 88 | 
            +
                {%- endif %}
         | 
| 89 | 
            +
            {%- endif %}
         | 
    	
        checkpoint-500/added_tokens.json
    ADDED
    
    | @@ -0,0 +1,28 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "</think>": 151668,
         | 
| 3 | 
            +
              "</tool_call>": 151658,
         | 
| 4 | 
            +
              "</tool_response>": 151666,
         | 
| 5 | 
            +
              "<think>": 151667,
         | 
| 6 | 
            +
              "<tool_call>": 151657,
         | 
| 7 | 
            +
              "<tool_response>": 151665,
         | 
| 8 | 
            +
              "<|box_end|>": 151649,
         | 
| 9 | 
            +
              "<|box_start|>": 151648,
         | 
| 10 | 
            +
              "<|endoftext|>": 151643,
         | 
| 11 | 
            +
              "<|file_sep|>": 151664,
         | 
| 12 | 
            +
              "<|fim_middle|>": 151660,
         | 
| 13 | 
            +
              "<|fim_pad|>": 151662,
         | 
| 14 | 
            +
              "<|fim_prefix|>": 151659,
         | 
| 15 | 
            +
              "<|fim_suffix|>": 151661,
         | 
| 16 | 
            +
              "<|im_end|>": 151645,
         | 
| 17 | 
            +
              "<|im_start|>": 151644,
         | 
| 18 | 
            +
              "<|image_pad|>": 151655,
         | 
| 19 | 
            +
              "<|object_ref_end|>": 151647,
         | 
| 20 | 
            +
              "<|object_ref_start|>": 151646,
         | 
| 21 | 
            +
              "<|quad_end|>": 151651,
         | 
| 22 | 
            +
              "<|quad_start|>": 151650,
         | 
| 23 | 
            +
              "<|repo_name|>": 151663,
         | 
| 24 | 
            +
              "<|video_pad|>": 151656,
         | 
| 25 | 
            +
              "<|vision_end|>": 151653,
         | 
| 26 | 
            +
              "<|vision_pad|>": 151654,
         | 
| 27 | 
            +
              "<|vision_start|>": 151652
         | 
| 28 | 
            +
            }
         | 
    	
        checkpoint-500/chat_template.jinja
    ADDED
    
    | @@ -0,0 +1,89 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {%- if tools %}
         | 
| 2 | 
            +
                {{- '<|im_start|>system\n' }}
         | 
| 3 | 
            +
                {%- if messages[0].role == 'system' %}
         | 
| 4 | 
            +
                    {{- messages[0].content + '\n\n' }}
         | 
| 5 | 
            +
                {%- endif %}
         | 
| 6 | 
            +
                {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
         | 
| 7 | 
            +
                {%- for tool in tools %}
         | 
| 8 | 
            +
                    {{- "\n" }}
         | 
| 9 | 
            +
                    {{- tool | tojson }}
         | 
| 10 | 
            +
                {%- endfor %}
         | 
| 11 | 
            +
                {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
         | 
| 12 | 
            +
            {%- else %}
         | 
| 13 | 
            +
                {%- if messages[0].role == 'system' %}
         | 
| 14 | 
            +
                    {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
         | 
| 15 | 
            +
                {%- endif %}
         | 
| 16 | 
            +
            {%- endif %}
         | 
| 17 | 
            +
            {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
         | 
| 18 | 
            +
            {%- for message in messages[::-1] %}
         | 
| 19 | 
            +
                {%- set index = (messages|length - 1) - loop.index0 %}
         | 
| 20 | 
            +
                {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
         | 
| 21 | 
            +
                    {%- set ns.multi_step_tool = false %}
         | 
| 22 | 
            +
                    {%- set ns.last_query_index = index %}
         | 
| 23 | 
            +
                {%- endif %}
         | 
| 24 | 
            +
            {%- endfor %}
         | 
| 25 | 
            +
            {%- for message in messages %}
         | 
| 26 | 
            +
                {%- if message.content is string %}
         | 
| 27 | 
            +
                    {%- set content = message.content %}
         | 
| 28 | 
            +
                {%- else %}
         | 
| 29 | 
            +
                    {%- set content = '' %}
         | 
| 30 | 
            +
                {%- endif %}
         | 
| 31 | 
            +
                {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
         | 
| 32 | 
            +
                    {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
         | 
| 33 | 
            +
                {%- elif message.role == "assistant" %}
         | 
| 34 | 
            +
                    {%- set reasoning_content = '' %}
         | 
| 35 | 
            +
                    {%- if message.reasoning_content is string %}
         | 
| 36 | 
            +
                        {%- set reasoning_content = message.reasoning_content %}
         | 
| 37 | 
            +
                    {%- else %}
         | 
| 38 | 
            +
                        {%- if '</think>' in content %}
         | 
| 39 | 
            +
                            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
         | 
| 40 | 
            +
                            {%- set content = content.split('</think>')[-1].lstrip('\n') %}
         | 
| 41 | 
            +
                        {%- endif %}
         | 
| 42 | 
            +
                    {%- endif %}
         | 
| 43 | 
            +
                    {%- if loop.index0 > ns.last_query_index %}
         | 
| 44 | 
            +
                        {%- if loop.last or (not loop.last and reasoning_content) %}
         | 
| 45 | 
            +
                            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
         | 
| 46 | 
            +
                        {%- else %}
         | 
| 47 | 
            +
                            {{- '<|im_start|>' + message.role + '\n' + content }}
         | 
| 48 | 
            +
                        {%- endif %}
         | 
| 49 | 
            +
                    {%- else %}
         | 
| 50 | 
            +
                        {{- '<|im_start|>' + message.role + '\n' + content }}
         | 
| 51 | 
            +
                    {%- endif %}
         | 
| 52 | 
            +
                    {%- if message.tool_calls %}
         | 
| 53 | 
            +
                        {%- for tool_call in message.tool_calls %}
         | 
| 54 | 
            +
                            {%- if (loop.first and content) or (not loop.first) %}
         | 
| 55 | 
            +
                                {{- '\n' }}
         | 
| 56 | 
            +
                            {%- endif %}
         | 
| 57 | 
            +
                            {%- if tool_call.function %}
         | 
| 58 | 
            +
                                {%- set tool_call = tool_call.function %}
         | 
| 59 | 
            +
                            {%- endif %}
         | 
| 60 | 
            +
                            {{- '<tool_call>\n{"name": "' }}
         | 
| 61 | 
            +
                            {{- tool_call.name }}
         | 
| 62 | 
            +
                            {{- '", "arguments": ' }}
         | 
| 63 | 
            +
                            {%- if tool_call.arguments is string %}
         | 
| 64 | 
            +
                                {{- tool_call.arguments }}
         | 
| 65 | 
            +
                            {%- else %}
         | 
| 66 | 
            +
                                {{- tool_call.arguments | tojson }}
         | 
| 67 | 
            +
                            {%- endif %}
         | 
| 68 | 
            +
                            {{- '}\n</tool_call>' }}
         | 
| 69 | 
            +
                        {%- endfor %}
         | 
| 70 | 
            +
                    {%- endif %}
         | 
| 71 | 
            +
                    {{- '<|im_end|>\n' }}
         | 
| 72 | 
            +
                {%- elif message.role == "tool" %}
         | 
| 73 | 
            +
                    {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
         | 
| 74 | 
            +
                        {{- '<|im_start|>user' }}
         | 
| 75 | 
            +
                    {%- endif %}
         | 
| 76 | 
            +
                    {{- '\n<tool_response>\n' }}
         | 
| 77 | 
            +
                    {{- content }}
         | 
| 78 | 
            +
                    {{- '\n</tool_response>' }}
         | 
| 79 | 
            +
                    {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
         | 
| 80 | 
            +
                        {{- '<|im_end|>\n' }}
         | 
| 81 | 
            +
                    {%- endif %}
         | 
| 82 | 
            +
                {%- endif %}
         | 
| 83 | 
            +
            {%- endfor %}
         | 
| 84 | 
            +
            {%- if add_generation_prompt %}
         | 
| 85 | 
            +
                {{- '<|im_start|>assistant\n' }}
         | 
| 86 | 
            +
                {%- if enable_thinking is defined and enable_thinking is false %}
         | 
| 87 | 
            +
                    {{- '<think>\n\n</think>\n\n' }}
         | 
| 88 | 
            +
                {%- endif %}
         | 
| 89 | 
            +
            {%- endif %}
         | 
    	
        checkpoint-500/config.json
    ADDED
    
    | @@ -0,0 +1,68 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "architectures": [
         | 
| 3 | 
            +
                "Qwen3ForCausalLM"
         | 
| 4 | 
            +
              ],
         | 
| 5 | 
            +
              "attention_bias": false,
         | 
| 6 | 
            +
              "attention_dropout": 0.0,
         | 
| 7 | 
            +
              "bos_token_id": 151643,
         | 
| 8 | 
            +
              "eos_token_id": 151645,
         | 
| 9 | 
            +
              "head_dim": 128,
         | 
| 10 | 
            +
              "hidden_act": "silu",
         | 
| 11 | 
            +
              "hidden_size": 4096,
         | 
| 12 | 
            +
              "initializer_range": 0.02,
         | 
| 13 | 
            +
              "intermediate_size": 12288,
         | 
| 14 | 
            +
              "layer_types": [
         | 
| 15 | 
            +
                "full_attention",
         | 
| 16 | 
            +
                "full_attention",
         | 
| 17 | 
            +
                "full_attention",
         | 
| 18 | 
            +
                "full_attention",
         | 
| 19 | 
            +
                "full_attention",
         | 
| 20 | 
            +
                "full_attention",
         | 
| 21 | 
            +
                "full_attention",
         | 
| 22 | 
            +
                "full_attention",
         | 
| 23 | 
            +
                "full_attention",
         | 
| 24 | 
            +
                "full_attention",
         | 
| 25 | 
            +
                "full_attention",
         | 
| 26 | 
            +
                "full_attention",
         | 
| 27 | 
            +
                "full_attention",
         | 
| 28 | 
            +
                "full_attention",
         | 
| 29 | 
            +
                "full_attention",
         | 
| 30 | 
            +
                "full_attention",
         | 
| 31 | 
            +
                "full_attention",
         | 
| 32 | 
            +
                "full_attention",
         | 
| 33 | 
            +
                "full_attention",
         | 
| 34 | 
            +
                "full_attention",
         | 
| 35 | 
            +
                "full_attention",
         | 
| 36 | 
            +
                "full_attention",
         | 
| 37 | 
            +
                "full_attention",
         | 
| 38 | 
            +
                "full_attention",
         | 
| 39 | 
            +
                "full_attention",
         | 
| 40 | 
            +
                "full_attention",
         | 
| 41 | 
            +
                "full_attention",
         | 
| 42 | 
            +
                "full_attention",
         | 
| 43 | 
            +
                "full_attention",
         | 
| 44 | 
            +
                "full_attention",
         | 
| 45 | 
            +
                "full_attention",
         | 
| 46 | 
            +
                "full_attention",
         | 
| 47 | 
            +
                "full_attention",
         | 
| 48 | 
            +
                "full_attention",
         | 
| 49 | 
            +
                "full_attention",
         | 
| 50 | 
            +
                "full_attention"
         | 
| 51 | 
            +
              ],
         | 
| 52 | 
            +
              "max_position_embeddings": 40960,
         | 
| 53 | 
            +
              "max_window_layers": 36,
         | 
| 54 | 
            +
              "model_type": "qwen3",
         | 
| 55 | 
            +
              "num_attention_heads": 32,
         | 
| 56 | 
            +
              "num_hidden_layers": 36,
         | 
| 57 | 
            +
              "num_key_value_heads": 8,
         | 
| 58 | 
            +
              "rms_norm_eps": 1e-06,
         | 
| 59 | 
            +
              "rope_scaling": null,
         | 
| 60 | 
            +
              "rope_theta": 1000000,
         | 
| 61 | 
            +
              "sliding_window": null,
         | 
| 62 | 
            +
              "tie_word_embeddings": false,
         | 
| 63 | 
            +
              "torch_dtype": "float32",
         | 
| 64 | 
            +
              "transformers_version": "4.53.0",
         | 
| 65 | 
            +
              "use_cache": true,
         | 
| 66 | 
            +
              "use_sliding_window": false,
         | 
| 67 | 
            +
              "vocab_size": 151936
         | 
| 68 | 
            +
            }
         | 
    	
        checkpoint-500/generation_config.json
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "bos_token_id": 151643,
         | 
| 3 | 
            +
              "do_sample": true,
         | 
| 4 | 
            +
              "eos_token_id": [
         | 
| 5 | 
            +
                151645,
         | 
| 6 | 
            +
                151643
         | 
| 7 | 
            +
              ],
         | 
| 8 | 
            +
              "pad_token_id": 151643,
         | 
| 9 | 
            +
              "temperature": 0.6,
         | 
| 10 | 
            +
              "top_k": 20,
         | 
| 11 | 
            +
              "top_p": 0.95,
         | 
| 12 | 
            +
              "transformers_version": "4.53.0"
         | 
| 13 | 
            +
            }
         | 
    	
        checkpoint-500/merges.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        checkpoint-500/model-00001-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:7a2b025eb9ec6d742f73d319d387f54b91476a5d5bb00a4588d1fd2dcb87d369
         | 
| 3 | 
            +
            size 4972454376
         | 
    	
        checkpoint-500/model-00002-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:f79d1d181a0e6ceda9c2ee8ab6e605c18485f6b741dbc5789e2f4036c026025c
         | 
| 3 | 
            +
            size 4832048608
         | 
    	
        checkpoint-500/model-00007-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a65a580d9ec401aa47414315a6ab7e40ac5b4a78f91c0707991b13c7cb9d9e03
         | 
| 3 | 
            +
            size 3462482728
         | 
    	
        checkpoint-500/model.safetensors.index.json
    ADDED
    
    | @@ -0,0 +1,407 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "metadata": {
         | 
| 3 | 
            +
                "total_parameters": 8190735360,
         | 
| 4 | 
            +
                "total_size": 32762941440
         | 
| 5 | 
            +
              },
         | 
| 6 | 
            +
              "weight_map": {
         | 
| 7 | 
            +
                "lm_head.weight": "model-00007-of-00007.safetensors",
         | 
| 8 | 
            +
                "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
         | 
| 9 | 
            +
                "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 10 | 
            +
                "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 11 | 
            +
                "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 12 | 
            +
                "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 13 | 
            +
                "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 14 | 
            +
                "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 15 | 
            +
                "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 16 | 
            +
                "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 17 | 
            +
                "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 18 | 
            +
                "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 19 | 
            +
                "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 20 | 
            +
                "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 21 | 
            +
                "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 22 | 
            +
                "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 23 | 
            +
                "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 24 | 
            +
                "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 25 | 
            +
                "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 26 | 
            +
                "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 27 | 
            +
                "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 28 | 
            +
                "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 29 | 
            +
                "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 30 | 
            +
                "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 31 | 
            +
                "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 32 | 
            +
                "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 33 | 
            +
                "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 34 | 
            +
                "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 35 | 
            +
                "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 36 | 
            +
                "model.layers.10.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 37 | 
            +
                "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 38 | 
            +
                "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 39 | 
            +
                "model.layers.10.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 40 | 
            +
                "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 41 | 
            +
                "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 42 | 
            +
                "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 43 | 
            +
                "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 44 | 
            +
                "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 45 | 
            +
                "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 46 | 
            +
                "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 47 | 
            +
                "model.layers.11.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 48 | 
            +
                "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 49 | 
            +
                "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 50 | 
            +
                "model.layers.11.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 51 | 
            +
                "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 52 | 
            +
                "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 53 | 
            +
                "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 54 | 
            +
                "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 55 | 
            +
                "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 56 | 
            +
                "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 57 | 
            +
                "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 58 | 
            +
                "model.layers.12.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 59 | 
            +
                "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 60 | 
            +
                "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 61 | 
            +
                "model.layers.12.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 62 | 
            +
                "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 63 | 
            +
                "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 64 | 
            +
                "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 65 | 
            +
                "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 66 | 
            +
                "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 67 | 
            +
                "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 68 | 
            +
                "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 69 | 
            +
                "model.layers.13.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 70 | 
            +
                "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 71 | 
            +
                "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 72 | 
            +
                "model.layers.13.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 73 | 
            +
                "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 74 | 
            +
                "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 75 | 
            +
                "model.layers.14.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 76 | 
            +
                "model.layers.14.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 77 | 
            +
                "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 78 | 
            +
                "model.layers.14.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 79 | 
            +
                "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 80 | 
            +
                "model.layers.14.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 81 | 
            +
                "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 82 | 
            +
                "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 83 | 
            +
                "model.layers.14.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 84 | 
            +
                "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 85 | 
            +
                "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 86 | 
            +
                "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 87 | 
            +
                "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 88 | 
            +
                "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 89 | 
            +
                "model.layers.15.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 90 | 
            +
                "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 91 | 
            +
                "model.layers.15.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 92 | 
            +
                "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 93 | 
            +
                "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 94 | 
            +
                "model.layers.15.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 95 | 
            +
                "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 96 | 
            +
                "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 97 | 
            +
                "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 98 | 
            +
                "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 99 | 
            +
                "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 100 | 
            +
                "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 101 | 
            +
                "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 102 | 
            +
                "model.layers.16.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 103 | 
            +
                "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 104 | 
            +
                "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 105 | 
            +
                "model.layers.16.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 106 | 
            +
                "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 107 | 
            +
                "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 108 | 
            +
                "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 109 | 
            +
                "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 110 | 
            +
                "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 111 | 
            +
                "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 112 | 
            +
                "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 113 | 
            +
                "model.layers.17.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 114 | 
            +
                "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 115 | 
            +
                "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 116 | 
            +
                "model.layers.17.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 117 | 
            +
                "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 118 | 
            +
                "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 119 | 
            +
                "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 120 | 
            +
                "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 121 | 
            +
                "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 122 | 
            +
                "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 123 | 
            +
                "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 124 | 
            +
                "model.layers.18.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 125 | 
            +
                "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 126 | 
            +
                "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 127 | 
            +
                "model.layers.18.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 128 | 
            +
                "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 129 | 
            +
                "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 130 | 
            +
                "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 131 | 
            +
                "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 132 | 
            +
                "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 133 | 
            +
                "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 134 | 
            +
                "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 135 | 
            +
                "model.layers.19.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 136 | 
            +
                "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 137 | 
            +
                "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 138 | 
            +
                "model.layers.19.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 139 | 
            +
                "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 140 | 
            +
                "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 141 | 
            +
                "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 142 | 
            +
                "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 143 | 
            +
                "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 144 | 
            +
                "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 145 | 
            +
                "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 146 | 
            +
                "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 147 | 
            +
                "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 148 | 
            +
                "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 149 | 
            +
                "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 150 | 
            +
                "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 151 | 
            +
                "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 152 | 
            +
                "model.layers.20.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 153 | 
            +
                "model.layers.20.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 154 | 
            +
                "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 155 | 
            +
                "model.layers.20.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 156 | 
            +
                "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 157 | 
            +
                "model.layers.20.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 158 | 
            +
                "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 159 | 
            +
                "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 160 | 
            +
                "model.layers.20.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 161 | 
            +
                "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 162 | 
            +
                "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 163 | 
            +
                "model.layers.21.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 164 | 
            +
                "model.layers.21.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 165 | 
            +
                "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 166 | 
            +
                "model.layers.21.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 167 | 
            +
                "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 168 | 
            +
                "model.layers.21.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 169 | 
            +
                "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 170 | 
            +
                "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 171 | 
            +
                "model.layers.21.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 172 | 
            +
                "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 173 | 
            +
                "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 174 | 
            +
                "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 175 | 
            +
                "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 176 | 
            +
                "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 177 | 
            +
                "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 178 | 
            +
                "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 179 | 
            +
                "model.layers.22.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 180 | 
            +
                "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 181 | 
            +
                "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 182 | 
            +
                "model.layers.22.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 183 | 
            +
                "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 184 | 
            +
                "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 185 | 
            +
                "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 186 | 
            +
                "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 187 | 
            +
                "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 188 | 
            +
                "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 189 | 
            +
                "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 190 | 
            +
                "model.layers.23.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 191 | 
            +
                "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 192 | 
            +
                "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 193 | 
            +
                "model.layers.23.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 194 | 
            +
                "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 195 | 
            +
                "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 196 | 
            +
                "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 197 | 
            +
                "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 198 | 
            +
                "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 199 | 
            +
                "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 200 | 
            +
                "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 201 | 
            +
                "model.layers.24.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 202 | 
            +
                "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 203 | 
            +
                "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 204 | 
            +
                "model.layers.24.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 205 | 
            +
                "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 206 | 
            +
                "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 207 | 
            +
                "model.layers.25.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 208 | 
            +
                "model.layers.25.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 209 | 
            +
                "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 210 | 
            +
                "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 211 | 
            +
                "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 212 | 
            +
                "model.layers.25.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 213 | 
            +
                "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 214 | 
            +
                "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 215 | 
            +
                "model.layers.25.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 216 | 
            +
                "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 217 | 
            +
                "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 218 | 
            +
                "model.layers.26.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 219 | 
            +
                "model.layers.26.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 220 | 
            +
                "model.layers.26.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 221 | 
            +
                "model.layers.26.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 222 | 
            +
                "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 223 | 
            +
                "model.layers.26.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 224 | 
            +
                "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 225 | 
            +
                "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 226 | 
            +
                "model.layers.26.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 227 | 
            +
                "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 228 | 
            +
                "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 229 | 
            +
                "model.layers.27.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 230 | 
            +
                "model.layers.27.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 231 | 
            +
                "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 232 | 
            +
                "model.layers.27.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 233 | 
            +
                "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 234 | 
            +
                "model.layers.27.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 235 | 
            +
                "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 236 | 
            +
                "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 237 | 
            +
                "model.layers.27.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 238 | 
            +
                "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 239 | 
            +
                "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 240 | 
            +
                "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 241 | 
            +
                "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 242 | 
            +
                "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 243 | 
            +
                "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 244 | 
            +
                "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 245 | 
            +
                "model.layers.28.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 246 | 
            +
                "model.layers.28.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 247 | 
            +
                "model.layers.28.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 248 | 
            +
                "model.layers.28.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 249 | 
            +
                "model.layers.28.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 250 | 
            +
                "model.layers.28.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 251 | 
            +
                "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 252 | 
            +
                "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 253 | 
            +
                "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 254 | 
            +
                "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 255 | 
            +
                "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 256 | 
            +
                "model.layers.29.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 257 | 
            +
                "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 258 | 
            +
                "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 259 | 
            +
                "model.layers.29.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 260 | 
            +
                "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 261 | 
            +
                "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 262 | 
            +
                "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 263 | 
            +
                "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 264 | 
            +
                "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 265 | 
            +
                "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 266 | 
            +
                "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 267 | 
            +
                "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 268 | 
            +
                "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 269 | 
            +
                "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 270 | 
            +
                "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 271 | 
            +
                "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 272 | 
            +
                "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 273 | 
            +
                "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 274 | 
            +
                "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 275 | 
            +
                "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 276 | 
            +
                "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 277 | 
            +
                "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 278 | 
            +
                "model.layers.30.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 279 | 
            +
                "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 280 | 
            +
                "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 281 | 
            +
                "model.layers.30.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 282 | 
            +
                "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 283 | 
            +
                "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 284 | 
            +
                "model.layers.31.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 285 | 
            +
                "model.layers.31.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 286 | 
            +
                "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 287 | 
            +
                "model.layers.31.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 288 | 
            +
                "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 289 | 
            +
                "model.layers.31.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 290 | 
            +
                "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 291 | 
            +
                "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 292 | 
            +
                "model.layers.31.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 293 | 
            +
                "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 294 | 
            +
                "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 295 | 
            +
                "model.layers.32.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 296 | 
            +
                "model.layers.32.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 297 | 
            +
                "model.layers.32.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 298 | 
            +
                "model.layers.32.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 299 | 
            +
                "model.layers.32.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 300 | 
            +
                "model.layers.32.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 301 | 
            +
                "model.layers.32.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 302 | 
            +
                "model.layers.32.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 303 | 
            +
                "model.layers.32.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 304 | 
            +
                "model.layers.32.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 305 | 
            +
                "model.layers.32.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 306 | 
            +
                "model.layers.33.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 307 | 
            +
                "model.layers.33.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 308 | 
            +
                "model.layers.33.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 309 | 
            +
                "model.layers.33.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 310 | 
            +
                "model.layers.33.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 311 | 
            +
                "model.layers.33.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 312 | 
            +
                "model.layers.33.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 313 | 
            +
                "model.layers.33.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 314 | 
            +
                "model.layers.33.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 315 | 
            +
                "model.layers.33.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 316 | 
            +
                "model.layers.33.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 317 | 
            +
                "model.layers.34.input_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 318 | 
            +
                "model.layers.34.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 319 | 
            +
                "model.layers.34.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 320 | 
            +
                "model.layers.34.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 321 | 
            +
                "model.layers.34.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 322 | 
            +
                "model.layers.34.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 323 | 
            +
                "model.layers.34.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 324 | 
            +
                "model.layers.34.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 325 | 
            +
                "model.layers.34.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 326 | 
            +
                "model.layers.34.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 327 | 
            +
                "model.layers.34.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 328 | 
            +
                "model.layers.35.input_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 329 | 
            +
                "model.layers.35.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 330 | 
            +
                "model.layers.35.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 331 | 
            +
                "model.layers.35.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 332 | 
            +
                "model.layers.35.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 333 | 
            +
                "model.layers.35.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
         | 
| 334 | 
            +
                "model.layers.35.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 335 | 
            +
                "model.layers.35.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 336 | 
            +
                "model.layers.35.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
         | 
| 337 | 
            +
                "model.layers.35.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 338 | 
            +
                "model.layers.35.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 339 | 
            +
                "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 340 | 
            +
                "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 341 | 
            +
                "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 342 | 
            +
                "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 343 | 
            +
                "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 344 | 
            +
                "model.layers.4.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 345 | 
            +
                "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 346 | 
            +
                "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 347 | 
            +
                "model.layers.4.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 348 | 
            +
                "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 349 | 
            +
                "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 350 | 
            +
                "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 351 | 
            +
                "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 352 | 
            +
                "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 353 | 
            +
                "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 354 | 
            +
                "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 355 | 
            +
                "model.layers.5.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 356 | 
            +
                "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 357 | 
            +
                "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 358 | 
            +
                "model.layers.5.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 359 | 
            +
                "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 360 | 
            +
                "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 361 | 
            +
                "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 362 | 
            +
                "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 363 | 
            +
                "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 364 | 
            +
                "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 365 | 
            +
                "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 366 | 
            +
                "model.layers.6.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 367 | 
            +
                "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 368 | 
            +
                "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 369 | 
            +
                "model.layers.6.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 370 | 
            +
                "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 371 | 
            +
                "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 372 | 
            +
                "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 373 | 
            +
                "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 374 | 
            +
                "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 375 | 
            +
                "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 376 | 
            +
                "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 377 | 
            +
                "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 378 | 
            +
                "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 379 | 
            +
                "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 380 | 
            +
                "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 381 | 
            +
                "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 382 | 
            +
                "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 383 | 
            +
                "model.layers.8.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 384 | 
            +
                "model.layers.8.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 385 | 
            +
                "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 386 | 
            +
                "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 387 | 
            +
                "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 388 | 
            +
                "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 389 | 
            +
                "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 390 | 
            +
                "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 391 | 
            +
                "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 392 | 
            +
                "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 393 | 
            +
                "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 394 | 
            +
                "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 395 | 
            +
                "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 396 | 
            +
                "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 397 | 
            +
                "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 398 | 
            +
                "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 399 | 
            +
                "model.layers.9.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 400 | 
            +
                "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 401 | 
            +
                "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 402 | 
            +
                "model.layers.9.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 403 | 
            +
                "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 404 | 
            +
                "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 405 | 
            +
                "model.norm.weight": "model-00007-of-00007.safetensors"
         | 
| 406 | 
            +
              }
         | 
| 407 | 
            +
            }
         | 
    	
        checkpoint-500/rng_state.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
         | 
| 3 | 
            +
            size 14645
         | 
    	
        checkpoint-500/scheduler.pt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:dae9e2440856d0a3ab8ef9d108eae199f62a18cdec8eedf480cefde2044df26a
         | 
| 3 | 
            +
            size 1465
         | 
    	
        checkpoint-500/special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,31 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "additional_special_tokens": [
         | 
| 3 | 
            +
                "<|im_start|>",
         | 
| 4 | 
            +
                "<|im_end|>",
         | 
| 5 | 
            +
                "<|object_ref_start|>",
         | 
| 6 | 
            +
                "<|object_ref_end|>",
         | 
| 7 | 
            +
                "<|box_start|>",
         | 
| 8 | 
            +
                "<|box_end|>",
         | 
| 9 | 
            +
                "<|quad_start|>",
         | 
| 10 | 
            +
                "<|quad_end|>",
         | 
| 11 | 
            +
                "<|vision_start|>",
         | 
| 12 | 
            +
                "<|vision_end|>",
         | 
| 13 | 
            +
                "<|vision_pad|>",
         | 
| 14 | 
            +
                "<|image_pad|>",
         | 
| 15 | 
            +
                "<|video_pad|>"
         | 
| 16 | 
            +
              ],
         | 
| 17 | 
            +
              "eos_token": {
         | 
| 18 | 
            +
                "content": "<|im_end|>",
         | 
| 19 | 
            +
                "lstrip": false,
         | 
| 20 | 
            +
                "normalized": false,
         | 
| 21 | 
            +
                "rstrip": false,
         | 
| 22 | 
            +
                "single_word": false
         | 
| 23 | 
            +
              },
         | 
| 24 | 
            +
              "pad_token": {
         | 
| 25 | 
            +
                "content": "<|endoftext|>",
         | 
| 26 | 
            +
                "lstrip": false,
         | 
| 27 | 
            +
                "normalized": false,
         | 
| 28 | 
            +
                "rstrip": false,
         | 
| 29 | 
            +
                "single_word": false
         | 
| 30 | 
            +
              }
         | 
| 31 | 
            +
            }
         | 
    	
        checkpoint-500/tokenizer.json
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
         | 
| 3 | 
            +
            size 11422654
         | 
    	
        checkpoint-500/tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,239 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "add_bos_token": false,
         | 
| 3 | 
            +
              "add_prefix_space": false,
         | 
| 4 | 
            +
              "added_tokens_decoder": {
         | 
| 5 | 
            +
                "151643": {
         | 
| 6 | 
            +
                  "content": "<|endoftext|>",
         | 
| 7 | 
            +
                  "lstrip": false,
         | 
| 8 | 
            +
                  "normalized": false,
         | 
| 9 | 
            +
                  "rstrip": false,
         | 
| 10 | 
            +
                  "single_word": false,
         | 
| 11 | 
            +
                  "special": true
         | 
| 12 | 
            +
                },
         | 
| 13 | 
            +
                "151644": {
         | 
| 14 | 
            +
                  "content": "<|im_start|>",
         | 
| 15 | 
            +
                  "lstrip": false,
         | 
| 16 | 
            +
                  "normalized": false,
         | 
| 17 | 
            +
                  "rstrip": false,
         | 
| 18 | 
            +
                  "single_word": false,
         | 
| 19 | 
            +
                  "special": true
         | 
| 20 | 
            +
                },
         | 
| 21 | 
            +
                "151645": {
         | 
| 22 | 
            +
                  "content": "<|im_end|>",
         | 
| 23 | 
            +
                  "lstrip": false,
         | 
| 24 | 
            +
                  "normalized": false,
         | 
| 25 | 
            +
                  "rstrip": false,
         | 
| 26 | 
            +
                  "single_word": false,
         | 
| 27 | 
            +
                  "special": true
         | 
| 28 | 
            +
                },
         | 
| 29 | 
            +
                "151646": {
         | 
| 30 | 
            +
                  "content": "<|object_ref_start|>",
         | 
| 31 | 
            +
                  "lstrip": false,
         | 
| 32 | 
            +
                  "normalized": false,
         | 
| 33 | 
            +
                  "rstrip": false,
         | 
| 34 | 
            +
                  "single_word": false,
         | 
| 35 | 
            +
                  "special": true
         | 
| 36 | 
            +
                },
         | 
| 37 | 
            +
                "151647": {
         | 
| 38 | 
            +
                  "content": "<|object_ref_end|>",
         | 
| 39 | 
            +
                  "lstrip": false,
         | 
| 40 | 
            +
                  "normalized": false,
         | 
| 41 | 
            +
                  "rstrip": false,
         | 
| 42 | 
            +
                  "single_word": false,
         | 
| 43 | 
            +
                  "special": true
         | 
| 44 | 
            +
                },
         | 
| 45 | 
            +
                "151648": {
         | 
| 46 | 
            +
                  "content": "<|box_start|>",
         | 
| 47 | 
            +
                  "lstrip": false,
         | 
| 48 | 
            +
                  "normalized": false,
         | 
| 49 | 
            +
                  "rstrip": false,
         | 
| 50 | 
            +
                  "single_word": false,
         | 
| 51 | 
            +
                  "special": true
         | 
| 52 | 
            +
                },
         | 
| 53 | 
            +
                "151649": {
         | 
| 54 | 
            +
                  "content": "<|box_end|>",
         | 
| 55 | 
            +
                  "lstrip": false,
         | 
| 56 | 
            +
                  "normalized": false,
         | 
| 57 | 
            +
                  "rstrip": false,
         | 
| 58 | 
            +
                  "single_word": false,
         | 
| 59 | 
            +
                  "special": true
         | 
| 60 | 
            +
                },
         | 
| 61 | 
            +
                "151650": {
         | 
| 62 | 
            +
                  "content": "<|quad_start|>",
         | 
| 63 | 
            +
                  "lstrip": false,
         | 
| 64 | 
            +
                  "normalized": false,
         | 
| 65 | 
            +
                  "rstrip": false,
         | 
| 66 | 
            +
                  "single_word": false,
         | 
| 67 | 
            +
                  "special": true
         | 
| 68 | 
            +
                },
         | 
| 69 | 
            +
                "151651": {
         | 
| 70 | 
            +
                  "content": "<|quad_end|>",
         | 
| 71 | 
            +
                  "lstrip": false,
         | 
| 72 | 
            +
                  "normalized": false,
         | 
| 73 | 
            +
                  "rstrip": false,
         | 
| 74 | 
            +
                  "single_word": false,
         | 
| 75 | 
            +
                  "special": true
         | 
| 76 | 
            +
                },
         | 
| 77 | 
            +
                "151652": {
         | 
| 78 | 
            +
                  "content": "<|vision_start|>",
         | 
| 79 | 
            +
                  "lstrip": false,
         | 
| 80 | 
            +
                  "normalized": false,
         | 
| 81 | 
            +
                  "rstrip": false,
         | 
| 82 | 
            +
                  "single_word": false,
         | 
| 83 | 
            +
                  "special": true
         | 
| 84 | 
            +
                },
         | 
| 85 | 
            +
                "151653": {
         | 
| 86 | 
            +
                  "content": "<|vision_end|>",
         | 
| 87 | 
            +
                  "lstrip": false,
         | 
| 88 | 
            +
                  "normalized": false,
         | 
| 89 | 
            +
                  "rstrip": false,
         | 
| 90 | 
            +
                  "single_word": false,
         | 
| 91 | 
            +
                  "special": true
         | 
| 92 | 
            +
                },
         | 
| 93 | 
            +
                "151654": {
         | 
| 94 | 
            +
                  "content": "<|vision_pad|>",
         | 
| 95 | 
            +
                  "lstrip": false,
         | 
| 96 | 
            +
                  "normalized": false,
         | 
| 97 | 
            +
                  "rstrip": false,
         | 
| 98 | 
            +
                  "single_word": false,
         | 
| 99 | 
            +
                  "special": true
         | 
| 100 | 
            +
                },
         | 
| 101 | 
            +
                "151655": {
         | 
| 102 | 
            +
                  "content": "<|image_pad|>",
         | 
| 103 | 
            +
                  "lstrip": false,
         | 
| 104 | 
            +
                  "normalized": false,
         | 
| 105 | 
            +
                  "rstrip": false,
         | 
| 106 | 
            +
                  "single_word": false,
         | 
| 107 | 
            +
                  "special": true
         | 
| 108 | 
            +
                },
         | 
| 109 | 
            +
                "151656": {
         | 
| 110 | 
            +
                  "content": "<|video_pad|>",
         | 
| 111 | 
            +
                  "lstrip": false,
         | 
| 112 | 
            +
                  "normalized": false,
         | 
| 113 | 
            +
                  "rstrip": false,
         | 
| 114 | 
            +
                  "single_word": false,
         | 
| 115 | 
            +
                  "special": true
         | 
| 116 | 
            +
                },
         | 
| 117 | 
            +
                "151657": {
         | 
| 118 | 
            +
                  "content": "<tool_call>",
         | 
| 119 | 
            +
                  "lstrip": false,
         | 
| 120 | 
            +
                  "normalized": false,
         | 
| 121 | 
            +
                  "rstrip": false,
         | 
| 122 | 
            +
                  "single_word": false,
         | 
| 123 | 
            +
                  "special": false
         | 
| 124 | 
            +
                },
         | 
| 125 | 
            +
                "151658": {
         | 
| 126 | 
            +
                  "content": "</tool_call>",
         | 
| 127 | 
            +
                  "lstrip": false,
         | 
| 128 | 
            +
                  "normalized": false,
         | 
| 129 | 
            +
                  "rstrip": false,
         | 
| 130 | 
            +
                  "single_word": false,
         | 
| 131 | 
            +
                  "special": false
         | 
| 132 | 
            +
                },
         | 
| 133 | 
            +
                "151659": {
         | 
| 134 | 
            +
                  "content": "<|fim_prefix|>",
         | 
| 135 | 
            +
                  "lstrip": false,
         | 
| 136 | 
            +
                  "normalized": false,
         | 
| 137 | 
            +
                  "rstrip": false,
         | 
| 138 | 
            +
                  "single_word": false,
         | 
| 139 | 
            +
                  "special": false
         | 
| 140 | 
            +
                },
         | 
| 141 | 
            +
                "151660": {
         | 
| 142 | 
            +
                  "content": "<|fim_middle|>",
         | 
| 143 | 
            +
                  "lstrip": false,
         | 
| 144 | 
            +
                  "normalized": false,
         | 
| 145 | 
            +
                  "rstrip": false,
         | 
| 146 | 
            +
                  "single_word": false,
         | 
| 147 | 
            +
                  "special": false
         | 
| 148 | 
            +
                },
         | 
| 149 | 
            +
                "151661": {
         | 
| 150 | 
            +
                  "content": "<|fim_suffix|>",
         | 
| 151 | 
            +
                  "lstrip": false,
         | 
| 152 | 
            +
                  "normalized": false,
         | 
| 153 | 
            +
                  "rstrip": false,
         | 
| 154 | 
            +
                  "single_word": false,
         | 
| 155 | 
            +
                  "special": false
         | 
| 156 | 
            +
                },
         | 
| 157 | 
            +
                "151662": {
         | 
| 158 | 
            +
                  "content": "<|fim_pad|>",
         | 
| 159 | 
            +
                  "lstrip": false,
         | 
| 160 | 
            +
                  "normalized": false,
         | 
| 161 | 
            +
                  "rstrip": false,
         | 
| 162 | 
            +
                  "single_word": false,
         | 
| 163 | 
            +
                  "special": false
         | 
| 164 | 
            +
                },
         | 
| 165 | 
            +
                "151663": {
         | 
| 166 | 
            +
                  "content": "<|repo_name|>",
         | 
| 167 | 
            +
                  "lstrip": false,
         | 
| 168 | 
            +
                  "normalized": false,
         | 
| 169 | 
            +
                  "rstrip": false,
         | 
| 170 | 
            +
                  "single_word": false,
         | 
| 171 | 
            +
                  "special": false
         | 
| 172 | 
            +
                },
         | 
| 173 | 
            +
                "151664": {
         | 
| 174 | 
            +
                  "content": "<|file_sep|>",
         | 
| 175 | 
            +
                  "lstrip": false,
         | 
| 176 | 
            +
                  "normalized": false,
         | 
| 177 | 
            +
                  "rstrip": false,
         | 
| 178 | 
            +
                  "single_word": false,
         | 
| 179 | 
            +
                  "special": false
         | 
| 180 | 
            +
                },
         | 
| 181 | 
            +
                "151665": {
         | 
| 182 | 
            +
                  "content": "<tool_response>",
         | 
| 183 | 
            +
                  "lstrip": false,
         | 
| 184 | 
            +
                  "normalized": false,
         | 
| 185 | 
            +
                  "rstrip": false,
         | 
| 186 | 
            +
                  "single_word": false,
         | 
| 187 | 
            +
                  "special": false
         | 
| 188 | 
            +
                },
         | 
| 189 | 
            +
                "151666": {
         | 
| 190 | 
            +
                  "content": "</tool_response>",
         | 
| 191 | 
            +
                  "lstrip": false,
         | 
| 192 | 
            +
                  "normalized": false,
         | 
| 193 | 
            +
                  "rstrip": false,
         | 
| 194 | 
            +
                  "single_word": false,
         | 
| 195 | 
            +
                  "special": false
         | 
| 196 | 
            +
                },
         | 
| 197 | 
            +
                "151667": {
         | 
| 198 | 
            +
                  "content": "<think>",
         | 
| 199 | 
            +
                  "lstrip": false,
         | 
| 200 | 
            +
                  "normalized": false,
         | 
| 201 | 
            +
                  "rstrip": false,
         | 
| 202 | 
            +
                  "single_word": false,
         | 
| 203 | 
            +
                  "special": false
         | 
| 204 | 
            +
                },
         | 
| 205 | 
            +
                "151668": {
         | 
| 206 | 
            +
                  "content": "</think>",
         | 
| 207 | 
            +
                  "lstrip": false,
         | 
| 208 | 
            +
                  "normalized": false,
         | 
| 209 | 
            +
                  "rstrip": false,
         | 
| 210 | 
            +
                  "single_word": false,
         | 
| 211 | 
            +
                  "special": false
         | 
| 212 | 
            +
                }
         | 
| 213 | 
            +
              },
         | 
| 214 | 
            +
              "additional_special_tokens": [
         | 
| 215 | 
            +
                "<|im_start|>",
         | 
| 216 | 
            +
                "<|im_end|>",
         | 
| 217 | 
            +
                "<|object_ref_start|>",
         | 
| 218 | 
            +
                "<|object_ref_end|>",
         | 
| 219 | 
            +
                "<|box_start|>",
         | 
| 220 | 
            +
                "<|box_end|>",
         | 
| 221 | 
            +
                "<|quad_start|>",
         | 
| 222 | 
            +
                "<|quad_end|>",
         | 
| 223 | 
            +
                "<|vision_start|>",
         | 
| 224 | 
            +
                "<|vision_end|>",
         | 
| 225 | 
            +
                "<|vision_pad|>",
         | 
| 226 | 
            +
                "<|image_pad|>",
         | 
| 227 | 
            +
                "<|video_pad|>"
         | 
| 228 | 
            +
              ],
         | 
| 229 | 
            +
              "bos_token": null,
         | 
| 230 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 231 | 
            +
              "eos_token": "<|im_end|>",
         | 
| 232 | 
            +
              "errors": "replace",
         | 
| 233 | 
            +
              "extra_special_tokens": {},
         | 
| 234 | 
            +
              "model_max_length": 131072,
         | 
| 235 | 
            +
              "pad_token": "<|endoftext|>",
         | 
| 236 | 
            +
              "split_special_tokens": false,
         | 
| 237 | 
            +
              "tokenizer_class": "Qwen2Tokenizer",
         | 
| 238 | 
            +
              "unk_token": null
         | 
| 239 | 
            +
            }
         | 
    	
        checkpoint-500/trainer_state.json
    ADDED
    
    | @@ -0,0 +1,484 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "best_global_step": null,
         | 
| 3 | 
            +
              "best_metric": null,
         | 
| 4 | 
            +
              "best_model_checkpoint": null,
         | 
| 5 | 
            +
              "epoch": 2.6315789473684212,
         | 
| 6 | 
            +
              "eval_steps": 500,
         | 
| 7 | 
            +
              "global_step": 500,
         | 
| 8 | 
            +
              "is_hyper_param_search": false,
         | 
| 9 | 
            +
              "is_local_process_zero": true,
         | 
| 10 | 
            +
              "is_world_process_zero": true,
         | 
| 11 | 
            +
              "log_history": [
         | 
| 12 | 
            +
                {
         | 
| 13 | 
            +
                  "epoch": 0.05263157894736842,
         | 
| 14 | 
            +
                  "grad_norm": 5.827399730682373,
         | 
| 15 | 
            +
                  "learning_rate": 1.968421052631579e-05,
         | 
| 16 | 
            +
                  "loss": 1.8455,
         | 
| 17 | 
            +
                  "mean_token_accuracy": 0.5674484133720398,
         | 
| 18 | 
            +
                  "num_tokens": 11541.0,
         | 
| 19 | 
            +
                  "step": 10
         | 
| 20 | 
            +
                },
         | 
| 21 | 
            +
                {
         | 
| 22 | 
            +
                  "epoch": 0.10526315789473684,
         | 
| 23 | 
            +
                  "grad_norm": 4.475362300872803,
         | 
| 24 | 
            +
                  "learning_rate": 1.9333333333333333e-05,
         | 
| 25 | 
            +
                  "loss": 1.5323,
         | 
| 26 | 
            +
                  "mean_token_accuracy": 0.6094174444675445,
         | 
| 27 | 
            +
                  "num_tokens": 22649.0,
         | 
| 28 | 
            +
                  "step": 20
         | 
| 29 | 
            +
                },
         | 
| 30 | 
            +
                {
         | 
| 31 | 
            +
                  "epoch": 0.15789473684210525,
         | 
| 32 | 
            +
                  "grad_norm": 4.522463321685791,
         | 
| 33 | 
            +
                  "learning_rate": 1.898245614035088e-05,
         | 
| 34 | 
            +
                  "loss": 1.5459,
         | 
| 35 | 
            +
                  "mean_token_accuracy": 0.6038810610771179,
         | 
| 36 | 
            +
                  "num_tokens": 33775.0,
         | 
| 37 | 
            +
                  "step": 30
         | 
| 38 | 
            +
                },
         | 
| 39 | 
            +
                {
         | 
| 40 | 
            +
                  "epoch": 0.21052631578947367,
         | 
| 41 | 
            +
                  "grad_norm": 4.1712775230407715,
         | 
| 42 | 
            +
                  "learning_rate": 1.8631578947368424e-05,
         | 
| 43 | 
            +
                  "loss": 1.5167,
         | 
| 44 | 
            +
                  "mean_token_accuracy": 0.611203521490097,
         | 
| 45 | 
            +
                  "num_tokens": 45182.0,
         | 
| 46 | 
            +
                  "step": 40
         | 
| 47 | 
            +
                },
         | 
| 48 | 
            +
                {
         | 
| 49 | 
            +
                  "epoch": 0.2631578947368421,
         | 
| 50 | 
            +
                  "grad_norm": 4.325103759765625,
         | 
| 51 | 
            +
                  "learning_rate": 1.8280701754385966e-05,
         | 
| 52 | 
            +
                  "loss": 1.5465,
         | 
| 53 | 
            +
                  "mean_token_accuracy": 0.6018322885036469,
         | 
| 54 | 
            +
                  "num_tokens": 56080.0,
         | 
| 55 | 
            +
                  "step": 50
         | 
| 56 | 
            +
                },
         | 
| 57 | 
            +
                {
         | 
| 58 | 
            +
                  "epoch": 0.3157894736842105,
         | 
| 59 | 
            +
                  "grad_norm": 3.4913244247436523,
         | 
| 60 | 
            +
                  "learning_rate": 1.792982456140351e-05,
         | 
| 61 | 
            +
                  "loss": 1.4975,
         | 
| 62 | 
            +
                  "mean_token_accuracy": 0.6165669798851013,
         | 
| 63 | 
            +
                  "num_tokens": 67393.0,
         | 
| 64 | 
            +
                  "step": 60
         | 
| 65 | 
            +
                },
         | 
| 66 | 
            +
                {
         | 
| 67 | 
            +
                  "epoch": 0.3684210526315789,
         | 
| 68 | 
            +
                  "grad_norm": 3.657413959503174,
         | 
| 69 | 
            +
                  "learning_rate": 1.7578947368421054e-05,
         | 
| 70 | 
            +
                  "loss": 1.4742,
         | 
| 71 | 
            +
                  "mean_token_accuracy": 0.6143243193626404,
         | 
| 72 | 
            +
                  "num_tokens": 78542.0,
         | 
| 73 | 
            +
                  "step": 70
         | 
| 74 | 
            +
                },
         | 
| 75 | 
            +
                {
         | 
| 76 | 
            +
                  "epoch": 0.42105263157894735,
         | 
| 77 | 
            +
                  "grad_norm": 3.5924668312072754,
         | 
| 78 | 
            +
                  "learning_rate": 1.72280701754386e-05,
         | 
| 79 | 
            +
                  "loss": 1.5084,
         | 
| 80 | 
            +
                  "mean_token_accuracy": 0.608461570739746,
         | 
| 81 | 
            +
                  "num_tokens": 89963.0,
         | 
| 82 | 
            +
                  "step": 80
         | 
| 83 | 
            +
                },
         | 
| 84 | 
            +
                {
         | 
| 85 | 
            +
                  "epoch": 0.47368421052631576,
         | 
| 86 | 
            +
                  "grad_norm": 3.244269609451294,
         | 
| 87 | 
            +
                  "learning_rate": 1.687719298245614e-05,
         | 
| 88 | 
            +
                  "loss": 1.4668,
         | 
| 89 | 
            +
                  "mean_token_accuracy": 0.6215168476104737,
         | 
| 90 | 
            +
                  "num_tokens": 101761.0,
         | 
| 91 | 
            +
                  "step": 90
         | 
| 92 | 
            +
                },
         | 
| 93 | 
            +
                {
         | 
| 94 | 
            +
                  "epoch": 0.5263157894736842,
         | 
| 95 | 
            +
                  "grad_norm": 3.2805733680725098,
         | 
| 96 | 
            +
                  "learning_rate": 1.6526315789473686e-05,
         | 
| 97 | 
            +
                  "loss": 1.5016,
         | 
| 98 | 
            +
                  "mean_token_accuracy": 0.6094469010829926,
         | 
| 99 | 
            +
                  "num_tokens": 112833.0,
         | 
| 100 | 
            +
                  "step": 100
         | 
| 101 | 
            +
                },
         | 
| 102 | 
            +
                {
         | 
| 103 | 
            +
                  "epoch": 0.5789473684210527,
         | 
| 104 | 
            +
                  "grad_norm": 3.5279650688171387,
         | 
| 105 | 
            +
                  "learning_rate": 1.6175438596491228e-05,
         | 
| 106 | 
            +
                  "loss": 1.4591,
         | 
| 107 | 
            +
                  "mean_token_accuracy": 0.6197056353092194,
         | 
| 108 | 
            +
                  "num_tokens": 123982.0,
         | 
| 109 | 
            +
                  "step": 110
         | 
| 110 | 
            +
                },
         | 
| 111 | 
            +
                {
         | 
| 112 | 
            +
                  "epoch": 0.631578947368421,
         | 
| 113 | 
            +
                  "grad_norm": 3.697514057159424,
         | 
| 114 | 
            +
                  "learning_rate": 1.5824561403508774e-05,
         | 
| 115 | 
            +
                  "loss": 1.4903,
         | 
| 116 | 
            +
                  "mean_token_accuracy": 0.6157862544059753,
         | 
| 117 | 
            +
                  "num_tokens": 135281.0,
         | 
| 118 | 
            +
                  "step": 120
         | 
| 119 | 
            +
                },
         | 
| 120 | 
            +
                {
         | 
| 121 | 
            +
                  "epoch": 0.6842105263157895,
         | 
| 122 | 
            +
                  "grad_norm": 3.5749077796936035,
         | 
| 123 | 
            +
                  "learning_rate": 1.5473684210526316e-05,
         | 
| 124 | 
            +
                  "loss": 1.4905,
         | 
| 125 | 
            +
                  "mean_token_accuracy": 0.6141646564006805,
         | 
| 126 | 
            +
                  "num_tokens": 146813.0,
         | 
| 127 | 
            +
                  "step": 130
         | 
| 128 | 
            +
                },
         | 
| 129 | 
            +
                {
         | 
| 130 | 
            +
                  "epoch": 0.7368421052631579,
         | 
| 131 | 
            +
                  "grad_norm": 3.9152824878692627,
         | 
| 132 | 
            +
                  "learning_rate": 1.5122807017543861e-05,
         | 
| 133 | 
            +
                  "loss": 1.4288,
         | 
| 134 | 
            +
                  "mean_token_accuracy": 0.6200086832046509,
         | 
| 135 | 
            +
                  "num_tokens": 157970.0,
         | 
| 136 | 
            +
                  "step": 140
         | 
| 137 | 
            +
                },
         | 
| 138 | 
            +
                {
         | 
| 139 | 
            +
                  "epoch": 0.7894736842105263,
         | 
| 140 | 
            +
                  "grad_norm": 3.2869362831115723,
         | 
| 141 | 
            +
                  "learning_rate": 1.4771929824561405e-05,
         | 
| 142 | 
            +
                  "loss": 1.4484,
         | 
| 143 | 
            +
                  "mean_token_accuracy": 0.6214825510978699,
         | 
| 144 | 
            +
                  "num_tokens": 169217.0,
         | 
| 145 | 
            +
                  "step": 150
         | 
| 146 | 
            +
                },
         | 
| 147 | 
            +
                {
         | 
| 148 | 
            +
                  "epoch": 0.8421052631578947,
         | 
| 149 | 
            +
                  "grad_norm": 3.530872106552124,
         | 
| 150 | 
            +
                  "learning_rate": 1.4421052631578948e-05,
         | 
| 151 | 
            +
                  "loss": 1.4638,
         | 
| 152 | 
            +
                  "mean_token_accuracy": 0.6164202988147736,
         | 
| 153 | 
            +
                  "num_tokens": 180220.0,
         | 
| 154 | 
            +
                  "step": 160
         | 
| 155 | 
            +
                },
         | 
| 156 | 
            +
                {
         | 
| 157 | 
            +
                  "epoch": 0.8947368421052632,
         | 
| 158 | 
            +
                  "grad_norm": 3.1449363231658936,
         | 
| 159 | 
            +
                  "learning_rate": 1.4070175438596492e-05,
         | 
| 160 | 
            +
                  "loss": 1.4679,
         | 
| 161 | 
            +
                  "mean_token_accuracy": 0.6190443515777588,
         | 
| 162 | 
            +
                  "num_tokens": 192002.0,
         | 
| 163 | 
            +
                  "step": 170
         | 
| 164 | 
            +
                },
         | 
| 165 | 
            +
                {
         | 
| 166 | 
            +
                  "epoch": 0.9473684210526315,
         | 
| 167 | 
            +
                  "grad_norm": 3.275167942047119,
         | 
| 168 | 
            +
                  "learning_rate": 1.3719298245614037e-05,
         | 
| 169 | 
            +
                  "loss": 1.421,
         | 
| 170 | 
            +
                  "mean_token_accuracy": 0.6280941605567932,
         | 
| 171 | 
            +
                  "num_tokens": 203462.0,
         | 
| 172 | 
            +
                  "step": 180
         | 
| 173 | 
            +
                },
         | 
| 174 | 
            +
                {
         | 
| 175 | 
            +
                  "epoch": 1.0,
         | 
| 176 | 
            +
                  "grad_norm": 4.7444939613342285,
         | 
| 177 | 
            +
                  "learning_rate": 1.336842105263158e-05,
         | 
| 178 | 
            +
                  "loss": 1.4532,
         | 
| 179 | 
            +
                  "mean_token_accuracy": 0.6205313920974731,
         | 
| 180 | 
            +
                  "num_tokens": 214070.0,
         | 
| 181 | 
            +
                  "step": 190
         | 
| 182 | 
            +
                },
         | 
| 183 | 
            +
                {
         | 
| 184 | 
            +
                  "epoch": 1.0526315789473684,
         | 
| 185 | 
            +
                  "grad_norm": 3.492197275161743,
         | 
| 186 | 
            +
                  "learning_rate": 1.3017543859649125e-05,
         | 
| 187 | 
            +
                  "loss": 0.8224,
         | 
| 188 | 
            +
                  "mean_token_accuracy": 0.7620033979415893,
         | 
| 189 | 
            +
                  "num_tokens": 225485.0,
         | 
| 190 | 
            +
                  "step": 200
         | 
| 191 | 
            +
                },
         | 
| 192 | 
            +
                {
         | 
| 193 | 
            +
                  "epoch": 1.1052631578947367,
         | 
| 194 | 
            +
                  "grad_norm": 2.8766427040100098,
         | 
| 195 | 
            +
                  "learning_rate": 1.2666666666666667e-05,
         | 
| 196 | 
            +
                  "loss": 0.7886,
         | 
| 197 | 
            +
                  "mean_token_accuracy": 0.7641067266464233,
         | 
| 198 | 
            +
                  "num_tokens": 236739.0,
         | 
| 199 | 
            +
                  "step": 210
         | 
| 200 | 
            +
                },
         | 
| 201 | 
            +
                {
         | 
| 202 | 
            +
                  "epoch": 1.1578947368421053,
         | 
| 203 | 
            +
                  "grad_norm": 3.0320887565612793,
         | 
| 204 | 
            +
                  "learning_rate": 1.2315789473684212e-05,
         | 
| 205 | 
            +
                  "loss": 0.7586,
         | 
| 206 | 
            +
                  "mean_token_accuracy": 0.7756024956703186,
         | 
| 207 | 
            +
                  "num_tokens": 248217.0,
         | 
| 208 | 
            +
                  "step": 220
         | 
| 209 | 
            +
                },
         | 
| 210 | 
            +
                {
         | 
| 211 | 
            +
                  "epoch": 1.2105263157894737,
         | 
| 212 | 
            +
                  "grad_norm": 3.0771596431732178,
         | 
| 213 | 
            +
                  "learning_rate": 1.1964912280701754e-05,
         | 
| 214 | 
            +
                  "loss": 0.8016,
         | 
| 215 | 
            +
                  "mean_token_accuracy": 0.7638419806957245,
         | 
| 216 | 
            +
                  "num_tokens": 259404.0,
         | 
| 217 | 
            +
                  "step": 230
         | 
| 218 | 
            +
                },
         | 
| 219 | 
            +
                {
         | 
| 220 | 
            +
                  "epoch": 1.263157894736842,
         | 
| 221 | 
            +
                  "grad_norm": 3.5975608825683594,
         | 
| 222 | 
            +
                  "learning_rate": 1.16140350877193e-05,
         | 
| 223 | 
            +
                  "loss": 0.7888,
         | 
| 224 | 
            +
                  "mean_token_accuracy": 0.7639639139175415,
         | 
| 225 | 
            +
                  "num_tokens": 270555.0,
         | 
| 226 | 
            +
                  "step": 240
         | 
| 227 | 
            +
                },
         | 
| 228 | 
            +
                {
         | 
| 229 | 
            +
                  "epoch": 1.3157894736842106,
         | 
| 230 | 
            +
                  "grad_norm": 3.4842450618743896,
         | 
| 231 | 
            +
                  "learning_rate": 1.1263157894736843e-05,
         | 
| 232 | 
            +
                  "loss": 0.7854,
         | 
| 233 | 
            +
                  "mean_token_accuracy": 0.7633719563484191,
         | 
| 234 | 
            +
                  "num_tokens": 281976.0,
         | 
| 235 | 
            +
                  "step": 250
         | 
| 236 | 
            +
                },
         | 
| 237 | 
            +
                {
         | 
| 238 | 
            +
                  "epoch": 1.368421052631579,
         | 
| 239 | 
            +
                  "grad_norm": 3.1366794109344482,
         | 
| 240 | 
            +
                  "learning_rate": 1.0912280701754387e-05,
         | 
| 241 | 
            +
                  "loss": 0.7904,
         | 
| 242 | 
            +
                  "mean_token_accuracy": 0.7618038773536682,
         | 
| 243 | 
            +
                  "num_tokens": 293356.0,
         | 
| 244 | 
            +
                  "step": 260
         | 
| 245 | 
            +
                },
         | 
| 246 | 
            +
                {
         | 
| 247 | 
            +
                  "epoch": 1.4210526315789473,
         | 
| 248 | 
            +
                  "grad_norm": 3.246885299682617,
         | 
| 249 | 
            +
                  "learning_rate": 1.056140350877193e-05,
         | 
| 250 | 
            +
                  "loss": 0.7745,
         | 
| 251 | 
            +
                  "mean_token_accuracy": 0.7625904023647309,
         | 
| 252 | 
            +
                  "num_tokens": 304686.0,
         | 
| 253 | 
            +
                  "step": 270
         | 
| 254 | 
            +
                },
         | 
| 255 | 
            +
                {
         | 
| 256 | 
            +
                  "epoch": 1.4736842105263157,
         | 
| 257 | 
            +
                  "grad_norm": 3.398383140563965,
         | 
| 258 | 
            +
                  "learning_rate": 1.0210526315789476e-05,
         | 
| 259 | 
            +
                  "loss": 0.8135,
         | 
| 260 | 
            +
                  "mean_token_accuracy": 0.7576355338096619,
         | 
| 261 | 
            +
                  "num_tokens": 315735.0,
         | 
| 262 | 
            +
                  "step": 280
         | 
| 263 | 
            +
                },
         | 
| 264 | 
            +
                {
         | 
| 265 | 
            +
                  "epoch": 1.526315789473684,
         | 
| 266 | 
            +
                  "grad_norm": 3.1334409713745117,
         | 
| 267 | 
            +
                  "learning_rate": 9.859649122807018e-06,
         | 
| 268 | 
            +
                  "loss": 0.8007,
         | 
| 269 | 
            +
                  "mean_token_accuracy": 0.7634194076061249,
         | 
| 270 | 
            +
                  "num_tokens": 326982.0,
         | 
| 271 | 
            +
                  "step": 290
         | 
| 272 | 
            +
                },
         | 
| 273 | 
            +
                {
         | 
| 274 | 
            +
                  "epoch": 1.5789473684210527,
         | 
| 275 | 
            +
                  "grad_norm": 3.123152017593384,
         | 
| 276 | 
            +
                  "learning_rate": 9.508771929824562e-06,
         | 
| 277 | 
            +
                  "loss": 0.7878,
         | 
| 278 | 
            +
                  "mean_token_accuracy": 0.7639511227607727,
         | 
| 279 | 
            +
                  "num_tokens": 338160.0,
         | 
| 280 | 
            +
                  "step": 300
         | 
| 281 | 
            +
                },
         | 
| 282 | 
            +
                {
         | 
| 283 | 
            +
                  "epoch": 1.631578947368421,
         | 
| 284 | 
            +
                  "grad_norm": 3.388254404067993,
         | 
| 285 | 
            +
                  "learning_rate": 9.157894736842105e-06,
         | 
| 286 | 
            +
                  "loss": 0.7935,
         | 
| 287 | 
            +
                  "mean_token_accuracy": 0.7603401899337768,
         | 
| 288 | 
            +
                  "num_tokens": 349306.0,
         | 
| 289 | 
            +
                  "step": 310
         | 
| 290 | 
            +
                },
         | 
| 291 | 
            +
                {
         | 
| 292 | 
            +
                  "epoch": 1.6842105263157894,
         | 
| 293 | 
            +
                  "grad_norm": 3.4574532508850098,
         | 
| 294 | 
            +
                  "learning_rate": 8.807017543859649e-06,
         | 
| 295 | 
            +
                  "loss": 0.7912,
         | 
| 296 | 
            +
                  "mean_token_accuracy": 0.759734469652176,
         | 
| 297 | 
            +
                  "num_tokens": 360346.0,
         | 
| 298 | 
            +
                  "step": 320
         | 
| 299 | 
            +
                },
         | 
| 300 | 
            +
                {
         | 
| 301 | 
            +
                  "epoch": 1.736842105263158,
         | 
| 302 | 
            +
                  "grad_norm": 2.8878493309020996,
         | 
| 303 | 
            +
                  "learning_rate": 8.456140350877193e-06,
         | 
| 304 | 
            +
                  "loss": 0.7664,
         | 
| 305 | 
            +
                  "mean_token_accuracy": 0.7666481971740723,
         | 
| 306 | 
            +
                  "num_tokens": 371823.0,
         | 
| 307 | 
            +
                  "step": 330
         | 
| 308 | 
            +
                },
         | 
| 309 | 
            +
                {
         | 
| 310 | 
            +
                  "epoch": 1.7894736842105263,
         | 
| 311 | 
            +
                  "grad_norm": 3.632908582687378,
         | 
| 312 | 
            +
                  "learning_rate": 8.105263157894736e-06,
         | 
| 313 | 
            +
                  "loss": 0.7792,
         | 
| 314 | 
            +
                  "mean_token_accuracy": 0.7634695649147034,
         | 
| 315 | 
            +
                  "num_tokens": 383113.0,
         | 
| 316 | 
            +
                  "step": 340
         | 
| 317 | 
            +
                },
         | 
| 318 | 
            +
                {
         | 
| 319 | 
            +
                  "epoch": 1.8421052631578947,
         | 
| 320 | 
            +
                  "grad_norm": 3.0787177085876465,
         | 
| 321 | 
            +
                  "learning_rate": 7.754385964912282e-06,
         | 
| 322 | 
            +
                  "loss": 0.7973,
         | 
| 323 | 
            +
                  "mean_token_accuracy": 0.759224247932434,
         | 
| 324 | 
            +
                  "num_tokens": 394349.0,
         | 
| 325 | 
            +
                  "step": 350
         | 
| 326 | 
            +
                },
         | 
| 327 | 
            +
                {
         | 
| 328 | 
            +
                  "epoch": 1.8947368421052633,
         | 
| 329 | 
            +
                  "grad_norm": 3.434131622314453,
         | 
| 330 | 
            +
                  "learning_rate": 7.4035087719298246e-06,
         | 
| 331 | 
            +
                  "loss": 0.7968,
         | 
| 332 | 
            +
                  "mean_token_accuracy": 0.7612445116043091,
         | 
| 333 | 
            +
                  "num_tokens": 405857.0,
         | 
| 334 | 
            +
                  "step": 360
         | 
| 335 | 
            +
                },
         | 
| 336 | 
            +
                {
         | 
| 337 | 
            +
                  "epoch": 1.9473684210526314,
         | 
| 338 | 
            +
                  "grad_norm": 4.3988871574401855,
         | 
| 339 | 
            +
                  "learning_rate": 7.052631578947369e-06,
         | 
| 340 | 
            +
                  "loss": 0.7902,
         | 
| 341 | 
            +
                  "mean_token_accuracy": 0.7641420245170594,
         | 
| 342 | 
            +
                  "num_tokens": 416942.0,
         | 
| 343 | 
            +
                  "step": 370
         | 
| 344 | 
            +
                },
         | 
| 345 | 
            +
                {
         | 
| 346 | 
            +
                  "epoch": 2.0,
         | 
| 347 | 
            +
                  "grad_norm": 4.871621131896973,
         | 
| 348 | 
            +
                  "learning_rate": 6.701754385964913e-06,
         | 
| 349 | 
            +
                  "loss": 0.7826,
         | 
| 350 | 
            +
                  "mean_token_accuracy": 0.763578999042511,
         | 
| 351 | 
            +
                  "num_tokens": 428140.0,
         | 
| 352 | 
            +
                  "step": 380
         | 
| 353 | 
            +
                },
         | 
| 354 | 
            +
                {
         | 
| 355 | 
            +
                  "epoch": 2.0526315789473686,
         | 
| 356 | 
            +
                  "grad_norm": 3.33593487739563,
         | 
| 357 | 
            +
                  "learning_rate": 6.3508771929824565e-06,
         | 
| 358 | 
            +
                  "loss": 0.3547,
         | 
| 359 | 
            +
                  "mean_token_accuracy": 0.8930943846702576,
         | 
| 360 | 
            +
                  "num_tokens": 439280.0,
         | 
| 361 | 
            +
                  "step": 390
         | 
| 362 | 
            +
                },
         | 
| 363 | 
            +
                {
         | 
| 364 | 
            +
                  "epoch": 2.1052631578947367,
         | 
| 365 | 
            +
                  "grad_norm": 3.4659924507141113,
         | 
| 366 | 
            +
                  "learning_rate": 6e-06,
         | 
| 367 | 
            +
                  "loss": 0.3274,
         | 
| 368 | 
            +
                  "mean_token_accuracy": 0.8942540287971497,
         | 
| 369 | 
            +
                  "num_tokens": 450443.0,
         | 
| 370 | 
            +
                  "step": 400
         | 
| 371 | 
            +
                },
         | 
| 372 | 
            +
                {
         | 
| 373 | 
            +
                  "epoch": 2.1578947368421053,
         | 
| 374 | 
            +
                  "grad_norm": 3.334516763687134,
         | 
| 375 | 
            +
                  "learning_rate": 5.649122807017544e-06,
         | 
| 376 | 
            +
                  "loss": 0.2965,
         | 
| 377 | 
            +
                  "mean_token_accuracy": 0.9032998859882355,
         | 
| 378 | 
            +
                  "num_tokens": 461774.0,
         | 
| 379 | 
            +
                  "step": 410
         | 
| 380 | 
            +
                },
         | 
| 381 | 
            +
                {
         | 
| 382 | 
            +
                  "epoch": 2.2105263157894735,
         | 
| 383 | 
            +
                  "grad_norm": 3.0437450408935547,
         | 
| 384 | 
            +
                  "learning_rate": 5.298245614035088e-06,
         | 
| 385 | 
            +
                  "loss": 0.2991,
         | 
| 386 | 
            +
                  "mean_token_accuracy": 0.9052271246910095,
         | 
| 387 | 
            +
                  "num_tokens": 473068.0,
         | 
| 388 | 
            +
                  "step": 420
         | 
| 389 | 
            +
                },
         | 
| 390 | 
            +
                {
         | 
| 391 | 
            +
                  "epoch": 2.263157894736842,
         | 
| 392 | 
            +
                  "grad_norm": 3.4685380458831787,
         | 
| 393 | 
            +
                  "learning_rate": 4.947368421052632e-06,
         | 
| 394 | 
            +
                  "loss": 0.3096,
         | 
| 395 | 
            +
                  "mean_token_accuracy": 0.900404566526413,
         | 
| 396 | 
            +
                  "num_tokens": 484122.0,
         | 
| 397 | 
            +
                  "step": 430
         | 
| 398 | 
            +
                },
         | 
| 399 | 
            +
                {
         | 
| 400 | 
            +
                  "epoch": 2.3157894736842106,
         | 
| 401 | 
            +
                  "grad_norm": 3.8948161602020264,
         | 
| 402 | 
            +
                  "learning_rate": 4.596491228070176e-06,
         | 
| 403 | 
            +
                  "loss": 0.2997,
         | 
| 404 | 
            +
                  "mean_token_accuracy": 0.901084166765213,
         | 
| 405 | 
            +
                  "num_tokens": 495492.0,
         | 
| 406 | 
            +
                  "step": 440
         | 
| 407 | 
            +
                },
         | 
| 408 | 
            +
                {
         | 
| 409 | 
            +
                  "epoch": 2.3684210526315788,
         | 
| 410 | 
            +
                  "grad_norm": 3.75742506980896,
         | 
| 411 | 
            +
                  "learning_rate": 4.245614035087719e-06,
         | 
| 412 | 
            +
                  "loss": 0.3066,
         | 
| 413 | 
            +
                  "mean_token_accuracy": 0.901238763332367,
         | 
| 414 | 
            +
                  "num_tokens": 507138.0,
         | 
| 415 | 
            +
                  "step": 450
         | 
| 416 | 
            +
                },
         | 
| 417 | 
            +
                {
         | 
| 418 | 
            +
                  "epoch": 2.4210526315789473,
         | 
| 419 | 
            +
                  "grad_norm": 3.1433157920837402,
         | 
| 420 | 
            +
                  "learning_rate": 3.894736842105263e-06,
         | 
| 421 | 
            +
                  "loss": 0.3153,
         | 
| 422 | 
            +
                  "mean_token_accuracy": 0.899943882226944,
         | 
| 423 | 
            +
                  "num_tokens": 518246.0,
         | 
| 424 | 
            +
                  "step": 460
         | 
| 425 | 
            +
                },
         | 
| 426 | 
            +
                {
         | 
| 427 | 
            +
                  "epoch": 2.473684210526316,
         | 
| 428 | 
            +
                  "grad_norm": 3.025437831878662,
         | 
| 429 | 
            +
                  "learning_rate": 3.543859649122807e-06,
         | 
| 430 | 
            +
                  "loss": 0.2975,
         | 
| 431 | 
            +
                  "mean_token_accuracy": 0.9012304186820984,
         | 
| 432 | 
            +
                  "num_tokens": 529617.0,
         | 
| 433 | 
            +
                  "step": 470
         | 
| 434 | 
            +
                },
         | 
| 435 | 
            +
                {
         | 
| 436 | 
            +
                  "epoch": 2.526315789473684,
         | 
| 437 | 
            +
                  "grad_norm": 3.873894453048706,
         | 
| 438 | 
            +
                  "learning_rate": 3.192982456140351e-06,
         | 
| 439 | 
            +
                  "loss": 0.2858,
         | 
| 440 | 
            +
                  "mean_token_accuracy": 0.9065197288990021,
         | 
| 441 | 
            +
                  "num_tokens": 540576.0,
         | 
| 442 | 
            +
                  "step": 480
         | 
| 443 | 
            +
                },
         | 
| 444 | 
            +
                {
         | 
| 445 | 
            +
                  "epoch": 2.5789473684210527,
         | 
| 446 | 
            +
                  "grad_norm": 3.2669241428375244,
         | 
| 447 | 
            +
                  "learning_rate": 2.842105263157895e-06,
         | 
| 448 | 
            +
                  "loss": 0.2911,
         | 
| 449 | 
            +
                  "mean_token_accuracy": 0.9069616377353669,
         | 
| 450 | 
            +
                  "num_tokens": 552038.0,
         | 
| 451 | 
            +
                  "step": 490
         | 
| 452 | 
            +
                },
         | 
| 453 | 
            +
                {
         | 
| 454 | 
            +
                  "epoch": 2.6315789473684212,
         | 
| 455 | 
            +
                  "grad_norm": 3.070552110671997,
         | 
| 456 | 
            +
                  "learning_rate": 2.4912280701754387e-06,
         | 
| 457 | 
            +
                  "loss": 0.2875,
         | 
| 458 | 
            +
                  "mean_token_accuracy": 0.9050689458847045,
         | 
| 459 | 
            +
                  "num_tokens": 563396.0,
         | 
| 460 | 
            +
                  "step": 500
         | 
| 461 | 
            +
                }
         | 
| 462 | 
            +
              ],
         | 
| 463 | 
            +
              "logging_steps": 10,
         | 
| 464 | 
            +
              "max_steps": 570,
         | 
| 465 | 
            +
              "num_input_tokens_seen": 0,
         | 
| 466 | 
            +
              "num_train_epochs": 3,
         | 
| 467 | 
            +
              "save_steps": 500,
         | 
| 468 | 
            +
              "stateful_callbacks": {
         | 
| 469 | 
            +
                "TrainerControl": {
         | 
| 470 | 
            +
                  "args": {
         | 
| 471 | 
            +
                    "should_epoch_stop": false,
         | 
| 472 | 
            +
                    "should_evaluate": false,
         | 
| 473 | 
            +
                    "should_log": false,
         | 
| 474 | 
            +
                    "should_save": true,
         | 
| 475 | 
            +
                    "should_training_stop": false
         | 
| 476 | 
            +
                  },
         | 
| 477 | 
            +
                  "attributes": {}
         | 
| 478 | 
            +
                }
         | 
| 479 | 
            +
              },
         | 
| 480 | 
            +
              "total_flos": 3.394575181929677e+16,
         | 
| 481 | 
            +
              "train_batch_size": 8,
         | 
| 482 | 
            +
              "trial_name": null,
         | 
| 483 | 
            +
              "trial_params": null
         | 
| 484 | 
            +
            }
         | 
    	
        checkpoint-500/training_args.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:fffaee687bcf434da66f399553b6e3a647d1ffa26a027aa9caf4841c6cfe086c
         | 
| 3 | 
            +
            size 6161
         | 
    	
        checkpoint-500/vocab.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        checkpoint-570/added_tokens.json
    ADDED
    
    | @@ -0,0 +1,28 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "</think>": 151668,
         | 
| 3 | 
            +
              "</tool_call>": 151658,
         | 
| 4 | 
            +
              "</tool_response>": 151666,
         | 
| 5 | 
            +
              "<think>": 151667,
         | 
| 6 | 
            +
              "<tool_call>": 151657,
         | 
| 7 | 
            +
              "<tool_response>": 151665,
         | 
| 8 | 
            +
              "<|box_end|>": 151649,
         | 
| 9 | 
            +
              "<|box_start|>": 151648,
         | 
| 10 | 
            +
              "<|endoftext|>": 151643,
         | 
| 11 | 
            +
              "<|file_sep|>": 151664,
         | 
| 12 | 
            +
              "<|fim_middle|>": 151660,
         | 
| 13 | 
            +
              "<|fim_pad|>": 151662,
         | 
| 14 | 
            +
              "<|fim_prefix|>": 151659,
         | 
| 15 | 
            +
              "<|fim_suffix|>": 151661,
         | 
| 16 | 
            +
              "<|im_end|>": 151645,
         | 
| 17 | 
            +
              "<|im_start|>": 151644,
         | 
| 18 | 
            +
              "<|image_pad|>": 151655,
         | 
| 19 | 
            +
              "<|object_ref_end|>": 151647,
         | 
| 20 | 
            +
              "<|object_ref_start|>": 151646,
         | 
| 21 | 
            +
              "<|quad_end|>": 151651,
         | 
| 22 | 
            +
              "<|quad_start|>": 151650,
         | 
| 23 | 
            +
              "<|repo_name|>": 151663,
         | 
| 24 | 
            +
              "<|video_pad|>": 151656,
         | 
| 25 | 
            +
              "<|vision_end|>": 151653,
         | 
| 26 | 
            +
              "<|vision_pad|>": 151654,
         | 
| 27 | 
            +
              "<|vision_start|>": 151652
         | 
| 28 | 
            +
            }
         | 
    	
        checkpoint-570/chat_template.jinja
    ADDED
    
    | @@ -0,0 +1,89 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {%- if tools %}
         | 
| 2 | 
            +
                {{- '<|im_start|>system\n' }}
         | 
| 3 | 
            +
                {%- if messages[0].role == 'system' %}
         | 
| 4 | 
            +
                    {{- messages[0].content + '\n\n' }}
         | 
| 5 | 
            +
                {%- endif %}
         | 
| 6 | 
            +
                {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
         | 
| 7 | 
            +
                {%- for tool in tools %}
         | 
| 8 | 
            +
                    {{- "\n" }}
         | 
| 9 | 
            +
                    {{- tool | tojson }}
         | 
| 10 | 
            +
                {%- endfor %}
         | 
| 11 | 
            +
                {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
         | 
| 12 | 
            +
            {%- else %}
         | 
| 13 | 
            +
                {%- if messages[0].role == 'system' %}
         | 
| 14 | 
            +
                    {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
         | 
| 15 | 
            +
                {%- endif %}
         | 
| 16 | 
            +
            {%- endif %}
         | 
| 17 | 
            +
            {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
         | 
| 18 | 
            +
            {%- for message in messages[::-1] %}
         | 
| 19 | 
            +
                {%- set index = (messages|length - 1) - loop.index0 %}
         | 
| 20 | 
            +
                {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
         | 
| 21 | 
            +
                    {%- set ns.multi_step_tool = false %}
         | 
| 22 | 
            +
                    {%- set ns.last_query_index = index %}
         | 
| 23 | 
            +
                {%- endif %}
         | 
| 24 | 
            +
            {%- endfor %}
         | 
| 25 | 
            +
            {%- for message in messages %}
         | 
| 26 | 
            +
                {%- if message.content is string %}
         | 
| 27 | 
            +
                    {%- set content = message.content %}
         | 
| 28 | 
            +
                {%- else %}
         | 
| 29 | 
            +
                    {%- set content = '' %}
         | 
| 30 | 
            +
                {%- endif %}
         | 
| 31 | 
            +
                {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
         | 
| 32 | 
            +
                    {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
         | 
| 33 | 
            +
                {%- elif message.role == "assistant" %}
         | 
| 34 | 
            +
                    {%- set reasoning_content = '' %}
         | 
| 35 | 
            +
                    {%- if message.reasoning_content is string %}
         | 
| 36 | 
            +
                        {%- set reasoning_content = message.reasoning_content %}
         | 
| 37 | 
            +
                    {%- else %}
         | 
| 38 | 
            +
                        {%- if '</think>' in content %}
         | 
| 39 | 
            +
                            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
         | 
| 40 | 
            +
                            {%- set content = content.split('</think>')[-1].lstrip('\n') %}
         | 
| 41 | 
            +
                        {%- endif %}
         | 
| 42 | 
            +
                    {%- endif %}
         | 
| 43 | 
            +
                    {%- if loop.index0 > ns.last_query_index %}
         | 
| 44 | 
            +
                        {%- if loop.last or (not loop.last and reasoning_content) %}
         | 
| 45 | 
            +
                            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
         | 
| 46 | 
            +
                        {%- else %}
         | 
| 47 | 
            +
                            {{- '<|im_start|>' + message.role + '\n' + content }}
         | 
| 48 | 
            +
                        {%- endif %}
         | 
| 49 | 
            +
                    {%- else %}
         | 
| 50 | 
            +
                        {{- '<|im_start|>' + message.role + '\n' + content }}
         | 
| 51 | 
            +
                    {%- endif %}
         | 
| 52 | 
            +
                    {%- if message.tool_calls %}
         | 
| 53 | 
            +
                        {%- for tool_call in message.tool_calls %}
         | 
| 54 | 
            +
                            {%- if (loop.first and content) or (not loop.first) %}
         | 
| 55 | 
            +
                                {{- '\n' }}
         | 
| 56 | 
            +
                            {%- endif %}
         | 
| 57 | 
            +
                            {%- if tool_call.function %}
         | 
| 58 | 
            +
                                {%- set tool_call = tool_call.function %}
         | 
| 59 | 
            +
                            {%- endif %}
         | 
| 60 | 
            +
                            {{- '<tool_call>\n{"name": "' }}
         | 
| 61 | 
            +
                            {{- tool_call.name }}
         | 
| 62 | 
            +
                            {{- '", "arguments": ' }}
         | 
| 63 | 
            +
                            {%- if tool_call.arguments is string %}
         | 
| 64 | 
            +
                                {{- tool_call.arguments }}
         | 
| 65 | 
            +
                            {%- else %}
         | 
| 66 | 
            +
                                {{- tool_call.arguments | tojson }}
         | 
| 67 | 
            +
                            {%- endif %}
         | 
| 68 | 
            +
                            {{- '}\n</tool_call>' }}
         | 
| 69 | 
            +
                        {%- endfor %}
         | 
| 70 | 
            +
                    {%- endif %}
         | 
| 71 | 
            +
                    {{- '<|im_end|>\n' }}
         | 
| 72 | 
            +
                {%- elif message.role == "tool" %}
         | 
| 73 | 
            +
                    {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
         | 
| 74 | 
            +
                        {{- '<|im_start|>user' }}
         | 
| 75 | 
            +
                    {%- endif %}
         | 
| 76 | 
            +
                    {{- '\n<tool_response>\n' }}
         | 
| 77 | 
            +
                    {{- content }}
         | 
| 78 | 
            +
                    {{- '\n</tool_response>' }}
         | 
| 79 | 
            +
                    {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
         | 
| 80 | 
            +
                        {{- '<|im_end|>\n' }}
         | 
| 81 | 
            +
                    {%- endif %}
         | 
| 82 | 
            +
                {%- endif %}
         | 
| 83 | 
            +
            {%- endfor %}
         | 
| 84 | 
            +
            {%- if add_generation_prompt %}
         | 
| 85 | 
            +
                {{- '<|im_start|>assistant\n' }}
         | 
| 86 | 
            +
                {%- if enable_thinking is defined and enable_thinking is false %}
         | 
| 87 | 
            +
                    {{- '<think>\n\n</think>\n\n' }}
         | 
| 88 | 
            +
                {%- endif %}
         | 
| 89 | 
            +
            {%- endif %}
         | 
    	
        checkpoint-570/config.json
    ADDED
    
    | @@ -0,0 +1,68 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "architectures": [
         | 
| 3 | 
            +
                "Qwen3ForCausalLM"
         | 
| 4 | 
            +
              ],
         | 
| 5 | 
            +
              "attention_bias": false,
         | 
| 6 | 
            +
              "attention_dropout": 0.0,
         | 
| 7 | 
            +
              "bos_token_id": 151643,
         | 
| 8 | 
            +
              "eos_token_id": 151645,
         | 
| 9 | 
            +
              "head_dim": 128,
         | 
| 10 | 
            +
              "hidden_act": "silu",
         | 
| 11 | 
            +
              "hidden_size": 4096,
         | 
| 12 | 
            +
              "initializer_range": 0.02,
         | 
| 13 | 
            +
              "intermediate_size": 12288,
         | 
| 14 | 
            +
              "layer_types": [
         | 
| 15 | 
            +
                "full_attention",
         | 
| 16 | 
            +
                "full_attention",
         | 
| 17 | 
            +
                "full_attention",
         | 
| 18 | 
            +
                "full_attention",
         | 
| 19 | 
            +
                "full_attention",
         | 
| 20 | 
            +
                "full_attention",
         | 
| 21 | 
            +
                "full_attention",
         | 
| 22 | 
            +
                "full_attention",
         | 
| 23 | 
            +
                "full_attention",
         | 
| 24 | 
            +
                "full_attention",
         | 
| 25 | 
            +
                "full_attention",
         | 
| 26 | 
            +
                "full_attention",
         | 
| 27 | 
            +
                "full_attention",
         | 
| 28 | 
            +
                "full_attention",
         | 
| 29 | 
            +
                "full_attention",
         | 
| 30 | 
            +
                "full_attention",
         | 
| 31 | 
            +
                "full_attention",
         | 
| 32 | 
            +
                "full_attention",
         | 
| 33 | 
            +
                "full_attention",
         | 
| 34 | 
            +
                "full_attention",
         | 
| 35 | 
            +
                "full_attention",
         | 
| 36 | 
            +
                "full_attention",
         | 
| 37 | 
            +
                "full_attention",
         | 
| 38 | 
            +
                "full_attention",
         | 
| 39 | 
            +
                "full_attention",
         | 
| 40 | 
            +
                "full_attention",
         | 
| 41 | 
            +
                "full_attention",
         | 
| 42 | 
            +
                "full_attention",
         | 
| 43 | 
            +
                "full_attention",
         | 
| 44 | 
            +
                "full_attention",
         | 
| 45 | 
            +
                "full_attention",
         | 
| 46 | 
            +
                "full_attention",
         | 
| 47 | 
            +
                "full_attention",
         | 
| 48 | 
            +
                "full_attention",
         | 
| 49 | 
            +
                "full_attention",
         | 
| 50 | 
            +
                "full_attention"
         | 
| 51 | 
            +
              ],
         | 
| 52 | 
            +
              "max_position_embeddings": 40960,
         | 
| 53 | 
            +
              "max_window_layers": 36,
         | 
| 54 | 
            +
              "model_type": "qwen3",
         | 
| 55 | 
            +
              "num_attention_heads": 32,
         | 
| 56 | 
            +
              "num_hidden_layers": 36,
         | 
| 57 | 
            +
              "num_key_value_heads": 8,
         | 
| 58 | 
            +
              "rms_norm_eps": 1e-06,
         | 
| 59 | 
            +
              "rope_scaling": null,
         | 
| 60 | 
            +
              "rope_theta": 1000000,
         | 
| 61 | 
            +
              "sliding_window": null,
         | 
| 62 | 
            +
              "tie_word_embeddings": false,
         | 
| 63 | 
            +
              "torch_dtype": "float32",
         | 
| 64 | 
            +
              "transformers_version": "4.53.0",
         | 
| 65 | 
            +
              "use_cache": true,
         | 
| 66 | 
            +
              "use_sliding_window": false,
         | 
| 67 | 
            +
              "vocab_size": 151936
         | 
| 68 | 
            +
            }
         | 
    	
        checkpoint-570/generation_config.json
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "bos_token_id": 151643,
         | 
| 3 | 
            +
              "do_sample": true,
         | 
| 4 | 
            +
              "eos_token_id": [
         | 
| 5 | 
            +
                151645,
         | 
| 6 | 
            +
                151643
         | 
| 7 | 
            +
              ],
         | 
| 8 | 
            +
              "pad_token_id": 151643,
         | 
| 9 | 
            +
              "temperature": 0.6,
         | 
| 10 | 
            +
              "top_k": 20,
         | 
| 11 | 
            +
              "top_p": 0.95,
         | 
| 12 | 
            +
              "transformers_version": "4.53.0"
         | 
| 13 | 
            +
            }
         | 
    	
        checkpoint-570/merges.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        checkpoint-570/model-00001-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2377228dac033fa3ab97988d1c941a87c54a9374043c8325613c46dee547ab60
         | 
| 3 | 
            +
            size 4972454376
         | 
    	
        checkpoint-570/model-00007-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:81c183a54b2f94bab76c992036ec9d20e8edfcd7752711f5ae889cc69f5f36ba
         | 
| 3 | 
            +
            size 3462482728
         | 
    	
        checkpoint-570/model.safetensors.index.json
    ADDED
    
    | @@ -0,0 +1,407 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "metadata": {
         | 
| 3 | 
            +
                "total_parameters": 8190735360,
         | 
| 4 | 
            +
                "total_size": 32762941440
         | 
| 5 | 
            +
              },
         | 
| 6 | 
            +
              "weight_map": {
         | 
| 7 | 
            +
                "lm_head.weight": "model-00007-of-00007.safetensors",
         | 
| 8 | 
            +
                "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
         | 
| 9 | 
            +
                "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 10 | 
            +
                "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 11 | 
            +
                "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 12 | 
            +
                "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 13 | 
            +
                "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 14 | 
            +
                "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 15 | 
            +
                "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 16 | 
            +
                "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 17 | 
            +
                "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 18 | 
            +
                "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 19 | 
            +
                "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 20 | 
            +
                "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 21 | 
            +
                "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 22 | 
            +
                "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 23 | 
            +
                "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 24 | 
            +
                "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 25 | 
            +
                "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 26 | 
            +
                "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 27 | 
            +
                "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 28 | 
            +
                "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 29 | 
            +
                "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 30 | 
            +
                "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 31 | 
            +
                "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 32 | 
            +
                "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 33 | 
            +
                "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 34 | 
            +
                "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 35 | 
            +
                "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 36 | 
            +
                "model.layers.10.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 37 | 
            +
                "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 38 | 
            +
                "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 39 | 
            +
                "model.layers.10.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 40 | 
            +
                "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 41 | 
            +
                "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 42 | 
            +
                "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 43 | 
            +
                "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 44 | 
            +
                "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 45 | 
            +
                "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 46 | 
            +
                "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 47 | 
            +
                "model.layers.11.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 48 | 
            +
                "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 49 | 
            +
                "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 50 | 
            +
                "model.layers.11.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 51 | 
            +
                "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 52 | 
            +
                "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 53 | 
            +
                "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 54 | 
            +
                "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 55 | 
            +
                "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 56 | 
            +
                "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 57 | 
            +
                "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 58 | 
            +
                "model.layers.12.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 59 | 
            +
                "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 60 | 
            +
                "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 61 | 
            +
                "model.layers.12.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 62 | 
            +
                "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 63 | 
            +
                "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 64 | 
            +
                "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 65 | 
            +
                "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 66 | 
            +
                "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 67 | 
            +
                "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 68 | 
            +
                "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 69 | 
            +
                "model.layers.13.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 70 | 
            +
                "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 71 | 
            +
                "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 72 | 
            +
                "model.layers.13.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 73 | 
            +
                "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 74 | 
            +
                "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 75 | 
            +
                "model.layers.14.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 76 | 
            +
                "model.layers.14.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 77 | 
            +
                "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 78 | 
            +
                "model.layers.14.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 79 | 
            +
                "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 80 | 
            +
                "model.layers.14.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 81 | 
            +
                "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 82 | 
            +
                "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 83 | 
            +
                "model.layers.14.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 84 | 
            +
                "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 85 | 
            +
                "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 86 | 
            +
                "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 87 | 
            +
                "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 88 | 
            +
                "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 89 | 
            +
                "model.layers.15.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 90 | 
            +
                "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 91 | 
            +
                "model.layers.15.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 92 | 
            +
                "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 93 | 
            +
                "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 94 | 
            +
                "model.layers.15.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 95 | 
            +
                "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 96 | 
            +
                "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 97 | 
            +
                "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 98 | 
            +
                "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 99 | 
            +
                "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 100 | 
            +
                "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 101 | 
            +
                "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 102 | 
            +
                "model.layers.16.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 103 | 
            +
                "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 104 | 
            +
                "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 105 | 
            +
                "model.layers.16.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 106 | 
            +
                "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 107 | 
            +
                "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 108 | 
            +
                "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 109 | 
            +
                "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 110 | 
            +
                "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 111 | 
            +
                "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 112 | 
            +
                "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 113 | 
            +
                "model.layers.17.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 114 | 
            +
                "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 115 | 
            +
                "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 116 | 
            +
                "model.layers.17.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 117 | 
            +
                "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 118 | 
            +
                "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 119 | 
            +
                "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 120 | 
            +
                "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 121 | 
            +
                "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 122 | 
            +
                "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 123 | 
            +
                "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 124 | 
            +
                "model.layers.18.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 125 | 
            +
                "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 126 | 
            +
                "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 127 | 
            +
                "model.layers.18.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 128 | 
            +
                "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 129 | 
            +
                "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 130 | 
            +
                "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 131 | 
            +
                "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 132 | 
            +
                "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 133 | 
            +
                "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 134 | 
            +
                "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 135 | 
            +
                "model.layers.19.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 136 | 
            +
                "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 137 | 
            +
                "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 138 | 
            +
                "model.layers.19.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 139 | 
            +
                "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 140 | 
            +
                "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 141 | 
            +
                "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 142 | 
            +
                "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 143 | 
            +
                "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 144 | 
            +
                "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 145 | 
            +
                "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 146 | 
            +
                "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 147 | 
            +
                "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 148 | 
            +
                "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 149 | 
            +
                "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 150 | 
            +
                "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 151 | 
            +
                "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 152 | 
            +
                "model.layers.20.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 153 | 
            +
                "model.layers.20.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 154 | 
            +
                "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 155 | 
            +
                "model.layers.20.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 156 | 
            +
                "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 157 | 
            +
                "model.layers.20.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 158 | 
            +
                "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 159 | 
            +
                "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 160 | 
            +
                "model.layers.20.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 161 | 
            +
                "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 162 | 
            +
                "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 163 | 
            +
                "model.layers.21.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 164 | 
            +
                "model.layers.21.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 165 | 
            +
                "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 166 | 
            +
                "model.layers.21.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 167 | 
            +
                "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 168 | 
            +
                "model.layers.21.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 169 | 
            +
                "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 170 | 
            +
                "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 171 | 
            +
                "model.layers.21.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 172 | 
            +
                "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 173 | 
            +
                "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 174 | 
            +
                "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 175 | 
            +
                "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 176 | 
            +
                "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 177 | 
            +
                "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 178 | 
            +
                "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 179 | 
            +
                "model.layers.22.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 180 | 
            +
                "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 181 | 
            +
                "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 182 | 
            +
                "model.layers.22.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 183 | 
            +
                "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 184 | 
            +
                "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 185 | 
            +
                "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 186 | 
            +
                "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 187 | 
            +
                "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 188 | 
            +
                "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 189 | 
            +
                "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 190 | 
            +
                "model.layers.23.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 191 | 
            +
                "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 192 | 
            +
                "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 193 | 
            +
                "model.layers.23.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 194 | 
            +
                "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 195 | 
            +
                "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 196 | 
            +
                "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 197 | 
            +
                "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 198 | 
            +
                "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 199 | 
            +
                "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 200 | 
            +
                "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 201 | 
            +
                "model.layers.24.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 202 | 
            +
                "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 203 | 
            +
                "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 204 | 
            +
                "model.layers.24.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 205 | 
            +
                "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 206 | 
            +
                "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 207 | 
            +
                "model.layers.25.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 208 | 
            +
                "model.layers.25.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 209 | 
            +
                "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 210 | 
            +
                "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 211 | 
            +
                "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 212 | 
            +
                "model.layers.25.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 213 | 
            +
                "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 214 | 
            +
                "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 215 | 
            +
                "model.layers.25.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 216 | 
            +
                "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 217 | 
            +
                "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 218 | 
            +
                "model.layers.26.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 219 | 
            +
                "model.layers.26.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 220 | 
            +
                "model.layers.26.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 221 | 
            +
                "model.layers.26.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 222 | 
            +
                "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 223 | 
            +
                "model.layers.26.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 224 | 
            +
                "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 225 | 
            +
                "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 226 | 
            +
                "model.layers.26.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 227 | 
            +
                "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 228 | 
            +
                "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 229 | 
            +
                "model.layers.27.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 230 | 
            +
                "model.layers.27.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 231 | 
            +
                "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 232 | 
            +
                "model.layers.27.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 233 | 
            +
                "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 234 | 
            +
                "model.layers.27.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 235 | 
            +
                "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 236 | 
            +
                "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 237 | 
            +
                "model.layers.27.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 238 | 
            +
                "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 239 | 
            +
                "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 240 | 
            +
                "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 241 | 
            +
                "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 242 | 
            +
                "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 243 | 
            +
                "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 244 | 
            +
                "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 245 | 
            +
                "model.layers.28.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 246 | 
            +
                "model.layers.28.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 247 | 
            +
                "model.layers.28.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 248 | 
            +
                "model.layers.28.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 249 | 
            +
                "model.layers.28.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 250 | 
            +
                "model.layers.28.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 251 | 
            +
                "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 252 | 
            +
                "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 253 | 
            +
                "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 254 | 
            +
                "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 255 | 
            +
                "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 256 | 
            +
                "model.layers.29.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 257 | 
            +
                "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 258 | 
            +
                "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 259 | 
            +
                "model.layers.29.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 260 | 
            +
                "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 261 | 
            +
                "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 262 | 
            +
                "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 263 | 
            +
                "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 264 | 
            +
                "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 265 | 
            +
                "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 266 | 
            +
                "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 267 | 
            +
                "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 268 | 
            +
                "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 269 | 
            +
                "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 270 | 
            +
                "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 271 | 
            +
                "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 272 | 
            +
                "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 273 | 
            +
                "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 274 | 
            +
                "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 275 | 
            +
                "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 276 | 
            +
                "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 277 | 
            +
                "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 278 | 
            +
                "model.layers.30.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 279 | 
            +
                "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 280 | 
            +
                "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 281 | 
            +
                "model.layers.30.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 282 | 
            +
                "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 283 | 
            +
                "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 284 | 
            +
                "model.layers.31.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 285 | 
            +
                "model.layers.31.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 286 | 
            +
                "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 287 | 
            +
                "model.layers.31.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 288 | 
            +
                "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 289 | 
            +
                "model.layers.31.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 290 | 
            +
                "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 291 | 
            +
                "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 292 | 
            +
                "model.layers.31.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 293 | 
            +
                "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 294 | 
            +
                "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 295 | 
            +
                "model.layers.32.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 296 | 
            +
                "model.layers.32.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 297 | 
            +
                "model.layers.32.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 298 | 
            +
                "model.layers.32.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 299 | 
            +
                "model.layers.32.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 300 | 
            +
                "model.layers.32.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 301 | 
            +
                "model.layers.32.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 302 | 
            +
                "model.layers.32.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 303 | 
            +
                "model.layers.32.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 304 | 
            +
                "model.layers.32.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 305 | 
            +
                "model.layers.32.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 306 | 
            +
                "model.layers.33.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 307 | 
            +
                "model.layers.33.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 308 | 
            +
                "model.layers.33.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 309 | 
            +
                "model.layers.33.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 310 | 
            +
                "model.layers.33.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 311 | 
            +
                "model.layers.33.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 312 | 
            +
                "model.layers.33.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 313 | 
            +
                "model.layers.33.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 314 | 
            +
                "model.layers.33.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 315 | 
            +
                "model.layers.33.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 316 | 
            +
                "model.layers.33.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 317 | 
            +
                "model.layers.34.input_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 318 | 
            +
                "model.layers.34.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 319 | 
            +
                "model.layers.34.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 320 | 
            +
                "model.layers.34.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 321 | 
            +
                "model.layers.34.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 322 | 
            +
                "model.layers.34.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 323 | 
            +
                "model.layers.34.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 324 | 
            +
                "model.layers.34.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 325 | 
            +
                "model.layers.34.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 326 | 
            +
                "model.layers.34.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 327 | 
            +
                "model.layers.34.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 328 | 
            +
                "model.layers.35.input_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 329 | 
            +
                "model.layers.35.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 330 | 
            +
                "model.layers.35.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 331 | 
            +
                "model.layers.35.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 332 | 
            +
                "model.layers.35.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 333 | 
            +
                "model.layers.35.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
         | 
| 334 | 
            +
                "model.layers.35.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 335 | 
            +
                "model.layers.35.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 336 | 
            +
                "model.layers.35.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
         | 
| 337 | 
            +
                "model.layers.35.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 338 | 
            +
                "model.layers.35.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 339 | 
            +
                "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 340 | 
            +
                "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 341 | 
            +
                "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 342 | 
            +
                "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 343 | 
            +
                "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 344 | 
            +
                "model.layers.4.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 345 | 
            +
                "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 346 | 
            +
                "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 347 | 
            +
                "model.layers.4.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 348 | 
            +
                "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 349 | 
            +
                "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 350 | 
            +
                "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 351 | 
            +
                "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 352 | 
            +
                "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 353 | 
            +
                "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 354 | 
            +
                "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 355 | 
            +
                "model.layers.5.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 356 | 
            +
                "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 357 | 
            +
                "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 358 | 
            +
                "model.layers.5.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 359 | 
            +
                "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 360 | 
            +
                "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 361 | 
            +
                "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 362 | 
            +
                "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 363 | 
            +
                "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 364 | 
            +
                "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 365 | 
            +
                "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 366 | 
            +
                "model.layers.6.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 367 | 
            +
                "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 368 | 
            +
                "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 369 | 
            +
                "model.layers.6.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 370 | 
            +
                "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 371 | 
            +
                "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 372 | 
            +
                "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 373 | 
            +
                "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 374 | 
            +
                "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 375 | 
            +
                "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 376 | 
            +
                "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 377 | 
            +
                "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 378 | 
            +
                "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 379 | 
            +
                "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 380 | 
            +
                "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 381 | 
            +
                "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 382 | 
            +
                "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 383 | 
            +
                "model.layers.8.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 384 | 
            +
                "model.layers.8.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 385 | 
            +
                "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 386 | 
            +
                "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 387 | 
            +
                "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 388 | 
            +
                "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 389 | 
            +
                "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 390 | 
            +
                "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 391 | 
            +
                "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 392 | 
            +
                "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 393 | 
            +
                "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 394 | 
            +
                "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 395 | 
            +
                "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 396 | 
            +
                "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 397 | 
            +
                "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 398 | 
            +
                "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 399 | 
            +
                "model.layers.9.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 400 | 
            +
                "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 401 | 
            +
                "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 402 | 
            +
                "model.layers.9.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 403 | 
            +
                "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 404 | 
            +
                "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 405 | 
            +
                "model.norm.weight": "model-00007-of-00007.safetensors"
         | 
| 406 | 
            +
              }
         | 
| 407 | 
            +
            }
         | 
    	
        checkpoint-570/rng_state.pth
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
         | 
| 3 | 
            +
            size 14645
         | 
    	
        checkpoint-570/scheduler.pt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:ba91c5ffb5111b00b446be9dd68abfffd444b80c59c8a6f2ffb343727fdee5bc
         | 
| 3 | 
            +
            size 1465
         | 
    	
        checkpoint-570/special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,31 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "additional_special_tokens": [
         | 
| 3 | 
            +
                "<|im_start|>",
         | 
| 4 | 
            +
                "<|im_end|>",
         | 
| 5 | 
            +
                "<|object_ref_start|>",
         | 
| 6 | 
            +
                "<|object_ref_end|>",
         | 
| 7 | 
            +
                "<|box_start|>",
         | 
| 8 | 
            +
                "<|box_end|>",
         | 
| 9 | 
            +
                "<|quad_start|>",
         | 
| 10 | 
            +
                "<|quad_end|>",
         | 
| 11 | 
            +
                "<|vision_start|>",
         | 
| 12 | 
            +
                "<|vision_end|>",
         | 
| 13 | 
            +
                "<|vision_pad|>",
         | 
| 14 | 
            +
                "<|image_pad|>",
         | 
| 15 | 
            +
                "<|video_pad|>"
         | 
| 16 | 
            +
              ],
         | 
| 17 | 
            +
              "eos_token": {
         | 
| 18 | 
            +
                "content": "<|im_end|>",
         | 
| 19 | 
            +
                "lstrip": false,
         | 
| 20 | 
            +
                "normalized": false,
         | 
| 21 | 
            +
                "rstrip": false,
         | 
| 22 | 
            +
                "single_word": false
         | 
| 23 | 
            +
              },
         | 
| 24 | 
            +
              "pad_token": {
         | 
| 25 | 
            +
                "content": "<|endoftext|>",
         | 
| 26 | 
            +
                "lstrip": false,
         | 
| 27 | 
            +
                "normalized": false,
         | 
| 28 | 
            +
                "rstrip": false,
         | 
| 29 | 
            +
                "single_word": false
         | 
| 30 | 
            +
              }
         | 
| 31 | 
            +
            }
         | 
    	
        checkpoint-570/tokenizer.json
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
         | 
| 3 | 
            +
            size 11422654
         | 
    	
        checkpoint-570/tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,239 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "add_bos_token": false,
         | 
| 3 | 
            +
              "add_prefix_space": false,
         | 
| 4 | 
            +
              "added_tokens_decoder": {
         | 
| 5 | 
            +
                "151643": {
         | 
| 6 | 
            +
                  "content": "<|endoftext|>",
         | 
| 7 | 
            +
                  "lstrip": false,
         | 
| 8 | 
            +
                  "normalized": false,
         | 
| 9 | 
            +
                  "rstrip": false,
         | 
| 10 | 
            +
                  "single_word": false,
         | 
| 11 | 
            +
                  "special": true
         | 
| 12 | 
            +
                },
         | 
| 13 | 
            +
                "151644": {
         | 
| 14 | 
            +
                  "content": "<|im_start|>",
         | 
| 15 | 
            +
                  "lstrip": false,
         | 
| 16 | 
            +
                  "normalized": false,
         | 
| 17 | 
            +
                  "rstrip": false,
         | 
| 18 | 
            +
                  "single_word": false,
         | 
| 19 | 
            +
                  "special": true
         | 
| 20 | 
            +
                },
         | 
| 21 | 
            +
                "151645": {
         | 
| 22 | 
            +
                  "content": "<|im_end|>",
         | 
| 23 | 
            +
                  "lstrip": false,
         | 
| 24 | 
            +
                  "normalized": false,
         | 
| 25 | 
            +
                  "rstrip": false,
         | 
| 26 | 
            +
                  "single_word": false,
         | 
| 27 | 
            +
                  "special": true
         | 
| 28 | 
            +
                },
         | 
| 29 | 
            +
                "151646": {
         | 
| 30 | 
            +
                  "content": "<|object_ref_start|>",
         | 
| 31 | 
            +
                  "lstrip": false,
         | 
| 32 | 
            +
                  "normalized": false,
         | 
| 33 | 
            +
                  "rstrip": false,
         | 
| 34 | 
            +
                  "single_word": false,
         | 
| 35 | 
            +
                  "special": true
         | 
| 36 | 
            +
                },
         | 
| 37 | 
            +
                "151647": {
         | 
| 38 | 
            +
                  "content": "<|object_ref_end|>",
         | 
| 39 | 
            +
                  "lstrip": false,
         | 
| 40 | 
            +
                  "normalized": false,
         | 
| 41 | 
            +
                  "rstrip": false,
         | 
| 42 | 
            +
                  "single_word": false,
         | 
| 43 | 
            +
                  "special": true
         | 
| 44 | 
            +
                },
         | 
| 45 | 
            +
                "151648": {
         | 
| 46 | 
            +
                  "content": "<|box_start|>",
         | 
| 47 | 
            +
                  "lstrip": false,
         | 
| 48 | 
            +
                  "normalized": false,
         | 
| 49 | 
            +
                  "rstrip": false,
         | 
| 50 | 
            +
                  "single_word": false,
         | 
| 51 | 
            +
                  "special": true
         | 
| 52 | 
            +
                },
         | 
| 53 | 
            +
                "151649": {
         | 
| 54 | 
            +
                  "content": "<|box_end|>",
         | 
| 55 | 
            +
                  "lstrip": false,
         | 
| 56 | 
            +
                  "normalized": false,
         | 
| 57 | 
            +
                  "rstrip": false,
         | 
| 58 | 
            +
                  "single_word": false,
         | 
| 59 | 
            +
                  "special": true
         | 
| 60 | 
            +
                },
         | 
| 61 | 
            +
                "151650": {
         | 
| 62 | 
            +
                  "content": "<|quad_start|>",
         | 
| 63 | 
            +
                  "lstrip": false,
         | 
| 64 | 
            +
                  "normalized": false,
         | 
| 65 | 
            +
                  "rstrip": false,
         | 
| 66 | 
            +
                  "single_word": false,
         | 
| 67 | 
            +
                  "special": true
         | 
| 68 | 
            +
                },
         | 
| 69 | 
            +
                "151651": {
         | 
| 70 | 
            +
                  "content": "<|quad_end|>",
         | 
| 71 | 
            +
                  "lstrip": false,
         | 
| 72 | 
            +
                  "normalized": false,
         | 
| 73 | 
            +
                  "rstrip": false,
         | 
| 74 | 
            +
                  "single_word": false,
         | 
| 75 | 
            +
                  "special": true
         | 
| 76 | 
            +
                },
         | 
| 77 | 
            +
                "151652": {
         | 
| 78 | 
            +
                  "content": "<|vision_start|>",
         | 
| 79 | 
            +
                  "lstrip": false,
         | 
| 80 | 
            +
                  "normalized": false,
         | 
| 81 | 
            +
                  "rstrip": false,
         | 
| 82 | 
            +
                  "single_word": false,
         | 
| 83 | 
            +
                  "special": true
         | 
| 84 | 
            +
                },
         | 
| 85 | 
            +
                "151653": {
         | 
| 86 | 
            +
                  "content": "<|vision_end|>",
         | 
| 87 | 
            +
                  "lstrip": false,
         | 
| 88 | 
            +
                  "normalized": false,
         | 
| 89 | 
            +
                  "rstrip": false,
         | 
| 90 | 
            +
                  "single_word": false,
         | 
| 91 | 
            +
                  "special": true
         | 
| 92 | 
            +
                },
         | 
| 93 | 
            +
                "151654": {
         | 
| 94 | 
            +
                  "content": "<|vision_pad|>",
         | 
| 95 | 
            +
                  "lstrip": false,
         | 
| 96 | 
            +
                  "normalized": false,
         | 
| 97 | 
            +
                  "rstrip": false,
         | 
| 98 | 
            +
                  "single_word": false,
         | 
| 99 | 
            +
                  "special": true
         | 
| 100 | 
            +
                },
         | 
| 101 | 
            +
                "151655": {
         | 
| 102 | 
            +
                  "content": "<|image_pad|>",
         | 
| 103 | 
            +
                  "lstrip": false,
         | 
| 104 | 
            +
                  "normalized": false,
         | 
| 105 | 
            +
                  "rstrip": false,
         | 
| 106 | 
            +
                  "single_word": false,
         | 
| 107 | 
            +
                  "special": true
         | 
| 108 | 
            +
                },
         | 
| 109 | 
            +
                "151656": {
         | 
| 110 | 
            +
                  "content": "<|video_pad|>",
         | 
| 111 | 
            +
                  "lstrip": false,
         | 
| 112 | 
            +
                  "normalized": false,
         | 
| 113 | 
            +
                  "rstrip": false,
         | 
| 114 | 
            +
                  "single_word": false,
         | 
| 115 | 
            +
                  "special": true
         | 
| 116 | 
            +
                },
         | 
| 117 | 
            +
                "151657": {
         | 
| 118 | 
            +
                  "content": "<tool_call>",
         | 
| 119 | 
            +
                  "lstrip": false,
         | 
| 120 | 
            +
                  "normalized": false,
         | 
| 121 | 
            +
                  "rstrip": false,
         | 
| 122 | 
            +
                  "single_word": false,
         | 
| 123 | 
            +
                  "special": false
         | 
| 124 | 
            +
                },
         | 
| 125 | 
            +
                "151658": {
         | 
| 126 | 
            +
                  "content": "</tool_call>",
         | 
| 127 | 
            +
                  "lstrip": false,
         | 
| 128 | 
            +
                  "normalized": false,
         | 
| 129 | 
            +
                  "rstrip": false,
         | 
| 130 | 
            +
                  "single_word": false,
         | 
| 131 | 
            +
                  "special": false
         | 
| 132 | 
            +
                },
         | 
| 133 | 
            +
                "151659": {
         | 
| 134 | 
            +
                  "content": "<|fim_prefix|>",
         | 
| 135 | 
            +
                  "lstrip": false,
         | 
| 136 | 
            +
                  "normalized": false,
         | 
| 137 | 
            +
                  "rstrip": false,
         | 
| 138 | 
            +
                  "single_word": false,
         | 
| 139 | 
            +
                  "special": false
         | 
| 140 | 
            +
                },
         | 
| 141 | 
            +
                "151660": {
         | 
| 142 | 
            +
                  "content": "<|fim_middle|>",
         | 
| 143 | 
            +
                  "lstrip": false,
         | 
| 144 | 
            +
                  "normalized": false,
         | 
| 145 | 
            +
                  "rstrip": false,
         | 
| 146 | 
            +
                  "single_word": false,
         | 
| 147 | 
            +
                  "special": false
         | 
| 148 | 
            +
                },
         | 
| 149 | 
            +
                "151661": {
         | 
| 150 | 
            +
                  "content": "<|fim_suffix|>",
         | 
| 151 | 
            +
                  "lstrip": false,
         | 
| 152 | 
            +
                  "normalized": false,
         | 
| 153 | 
            +
                  "rstrip": false,
         | 
| 154 | 
            +
                  "single_word": false,
         | 
| 155 | 
            +
                  "special": false
         | 
| 156 | 
            +
                },
         | 
| 157 | 
            +
                "151662": {
         | 
| 158 | 
            +
                  "content": "<|fim_pad|>",
         | 
| 159 | 
            +
                  "lstrip": false,
         | 
| 160 | 
            +
                  "normalized": false,
         | 
| 161 | 
            +
                  "rstrip": false,
         | 
| 162 | 
            +
                  "single_word": false,
         | 
| 163 | 
            +
                  "special": false
         | 
| 164 | 
            +
                },
         | 
| 165 | 
            +
                "151663": {
         | 
| 166 | 
            +
                  "content": "<|repo_name|>",
         | 
| 167 | 
            +
                  "lstrip": false,
         | 
| 168 | 
            +
                  "normalized": false,
         | 
| 169 | 
            +
                  "rstrip": false,
         | 
| 170 | 
            +
                  "single_word": false,
         | 
| 171 | 
            +
                  "special": false
         | 
| 172 | 
            +
                },
         | 
| 173 | 
            +
                "151664": {
         | 
| 174 | 
            +
                  "content": "<|file_sep|>",
         | 
| 175 | 
            +
                  "lstrip": false,
         | 
| 176 | 
            +
                  "normalized": false,
         | 
| 177 | 
            +
                  "rstrip": false,
         | 
| 178 | 
            +
                  "single_word": false,
         | 
| 179 | 
            +
                  "special": false
         | 
| 180 | 
            +
                },
         | 
| 181 | 
            +
                "151665": {
         | 
| 182 | 
            +
                  "content": "<tool_response>",
         | 
| 183 | 
            +
                  "lstrip": false,
         | 
| 184 | 
            +
                  "normalized": false,
         | 
| 185 | 
            +
                  "rstrip": false,
         | 
| 186 | 
            +
                  "single_word": false,
         | 
| 187 | 
            +
                  "special": false
         | 
| 188 | 
            +
                },
         | 
| 189 | 
            +
                "151666": {
         | 
| 190 | 
            +
                  "content": "</tool_response>",
         | 
| 191 | 
            +
                  "lstrip": false,
         | 
| 192 | 
            +
                  "normalized": false,
         | 
| 193 | 
            +
                  "rstrip": false,
         | 
| 194 | 
            +
                  "single_word": false,
         | 
| 195 | 
            +
                  "special": false
         | 
| 196 | 
            +
                },
         | 
| 197 | 
            +
                "151667": {
         | 
| 198 | 
            +
                  "content": "<think>",
         | 
| 199 | 
            +
                  "lstrip": false,
         | 
| 200 | 
            +
                  "normalized": false,
         | 
| 201 | 
            +
                  "rstrip": false,
         | 
| 202 | 
            +
                  "single_word": false,
         | 
| 203 | 
            +
                  "special": false
         | 
| 204 | 
            +
                },
         | 
| 205 | 
            +
                "151668": {
         | 
| 206 | 
            +
                  "content": "</think>",
         | 
| 207 | 
            +
                  "lstrip": false,
         | 
| 208 | 
            +
                  "normalized": false,
         | 
| 209 | 
            +
                  "rstrip": false,
         | 
| 210 | 
            +
                  "single_word": false,
         | 
| 211 | 
            +
                  "special": false
         | 
| 212 | 
            +
                }
         | 
| 213 | 
            +
              },
         | 
| 214 | 
            +
              "additional_special_tokens": [
         | 
| 215 | 
            +
                "<|im_start|>",
         | 
| 216 | 
            +
                "<|im_end|>",
         | 
| 217 | 
            +
                "<|object_ref_start|>",
         | 
| 218 | 
            +
                "<|object_ref_end|>",
         | 
| 219 | 
            +
                "<|box_start|>",
         | 
| 220 | 
            +
                "<|box_end|>",
         | 
| 221 | 
            +
                "<|quad_start|>",
         | 
| 222 | 
            +
                "<|quad_end|>",
         | 
| 223 | 
            +
                "<|vision_start|>",
         | 
| 224 | 
            +
                "<|vision_end|>",
         | 
| 225 | 
            +
                "<|vision_pad|>",
         | 
| 226 | 
            +
                "<|image_pad|>",
         | 
| 227 | 
            +
                "<|video_pad|>"
         | 
| 228 | 
            +
              ],
         | 
| 229 | 
            +
              "bos_token": null,
         | 
| 230 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 231 | 
            +
              "eos_token": "<|im_end|>",
         | 
| 232 | 
            +
              "errors": "replace",
         | 
| 233 | 
            +
              "extra_special_tokens": {},
         | 
| 234 | 
            +
              "model_max_length": 131072,
         | 
| 235 | 
            +
              "pad_token": "<|endoftext|>",
         | 
| 236 | 
            +
              "split_special_tokens": false,
         | 
| 237 | 
            +
              "tokenizer_class": "Qwen2Tokenizer",
         | 
| 238 | 
            +
              "unk_token": null
         | 
| 239 | 
            +
            }
         | 
    	
        checkpoint-570/trainer_state.json
    ADDED
    
    | @@ -0,0 +1,547 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "best_global_step": null,
         | 
| 3 | 
            +
              "best_metric": null,
         | 
| 4 | 
            +
              "best_model_checkpoint": null,
         | 
| 5 | 
            +
              "epoch": 3.0,
         | 
| 6 | 
            +
              "eval_steps": 500,
         | 
| 7 | 
            +
              "global_step": 570,
         | 
| 8 | 
            +
              "is_hyper_param_search": false,
         | 
| 9 | 
            +
              "is_local_process_zero": true,
         | 
| 10 | 
            +
              "is_world_process_zero": true,
         | 
| 11 | 
            +
              "log_history": [
         | 
| 12 | 
            +
                {
         | 
| 13 | 
            +
                  "epoch": 0.05263157894736842,
         | 
| 14 | 
            +
                  "grad_norm": 5.827399730682373,
         | 
| 15 | 
            +
                  "learning_rate": 1.968421052631579e-05,
         | 
| 16 | 
            +
                  "loss": 1.8455,
         | 
| 17 | 
            +
                  "mean_token_accuracy": 0.5674484133720398,
         | 
| 18 | 
            +
                  "num_tokens": 11541.0,
         | 
| 19 | 
            +
                  "step": 10
         | 
| 20 | 
            +
                },
         | 
| 21 | 
            +
                {
         | 
| 22 | 
            +
                  "epoch": 0.10526315789473684,
         | 
| 23 | 
            +
                  "grad_norm": 4.475362300872803,
         | 
| 24 | 
            +
                  "learning_rate": 1.9333333333333333e-05,
         | 
| 25 | 
            +
                  "loss": 1.5323,
         | 
| 26 | 
            +
                  "mean_token_accuracy": 0.6094174444675445,
         | 
| 27 | 
            +
                  "num_tokens": 22649.0,
         | 
| 28 | 
            +
                  "step": 20
         | 
| 29 | 
            +
                },
         | 
| 30 | 
            +
                {
         | 
| 31 | 
            +
                  "epoch": 0.15789473684210525,
         | 
| 32 | 
            +
                  "grad_norm": 4.522463321685791,
         | 
| 33 | 
            +
                  "learning_rate": 1.898245614035088e-05,
         | 
| 34 | 
            +
                  "loss": 1.5459,
         | 
| 35 | 
            +
                  "mean_token_accuracy": 0.6038810610771179,
         | 
| 36 | 
            +
                  "num_tokens": 33775.0,
         | 
| 37 | 
            +
                  "step": 30
         | 
| 38 | 
            +
                },
         | 
| 39 | 
            +
                {
         | 
| 40 | 
            +
                  "epoch": 0.21052631578947367,
         | 
| 41 | 
            +
                  "grad_norm": 4.1712775230407715,
         | 
| 42 | 
            +
                  "learning_rate": 1.8631578947368424e-05,
         | 
| 43 | 
            +
                  "loss": 1.5167,
         | 
| 44 | 
            +
                  "mean_token_accuracy": 0.611203521490097,
         | 
| 45 | 
            +
                  "num_tokens": 45182.0,
         | 
| 46 | 
            +
                  "step": 40
         | 
| 47 | 
            +
                },
         | 
| 48 | 
            +
                {
         | 
| 49 | 
            +
                  "epoch": 0.2631578947368421,
         | 
| 50 | 
            +
                  "grad_norm": 4.325103759765625,
         | 
| 51 | 
            +
                  "learning_rate": 1.8280701754385966e-05,
         | 
| 52 | 
            +
                  "loss": 1.5465,
         | 
| 53 | 
            +
                  "mean_token_accuracy": 0.6018322885036469,
         | 
| 54 | 
            +
                  "num_tokens": 56080.0,
         | 
| 55 | 
            +
                  "step": 50
         | 
| 56 | 
            +
                },
         | 
| 57 | 
            +
                {
         | 
| 58 | 
            +
                  "epoch": 0.3157894736842105,
         | 
| 59 | 
            +
                  "grad_norm": 3.4913244247436523,
         | 
| 60 | 
            +
                  "learning_rate": 1.792982456140351e-05,
         | 
| 61 | 
            +
                  "loss": 1.4975,
         | 
| 62 | 
            +
                  "mean_token_accuracy": 0.6165669798851013,
         | 
| 63 | 
            +
                  "num_tokens": 67393.0,
         | 
| 64 | 
            +
                  "step": 60
         | 
| 65 | 
            +
                },
         | 
| 66 | 
            +
                {
         | 
| 67 | 
            +
                  "epoch": 0.3684210526315789,
         | 
| 68 | 
            +
                  "grad_norm": 3.657413959503174,
         | 
| 69 | 
            +
                  "learning_rate": 1.7578947368421054e-05,
         | 
| 70 | 
            +
                  "loss": 1.4742,
         | 
| 71 | 
            +
                  "mean_token_accuracy": 0.6143243193626404,
         | 
| 72 | 
            +
                  "num_tokens": 78542.0,
         | 
| 73 | 
            +
                  "step": 70
         | 
| 74 | 
            +
                },
         | 
| 75 | 
            +
                {
         | 
| 76 | 
            +
                  "epoch": 0.42105263157894735,
         | 
| 77 | 
            +
                  "grad_norm": 3.5924668312072754,
         | 
| 78 | 
            +
                  "learning_rate": 1.72280701754386e-05,
         | 
| 79 | 
            +
                  "loss": 1.5084,
         | 
| 80 | 
            +
                  "mean_token_accuracy": 0.608461570739746,
         | 
| 81 | 
            +
                  "num_tokens": 89963.0,
         | 
| 82 | 
            +
                  "step": 80
         | 
| 83 | 
            +
                },
         | 
| 84 | 
            +
                {
         | 
| 85 | 
            +
                  "epoch": 0.47368421052631576,
         | 
| 86 | 
            +
                  "grad_norm": 3.244269609451294,
         | 
| 87 | 
            +
                  "learning_rate": 1.687719298245614e-05,
         | 
| 88 | 
            +
                  "loss": 1.4668,
         | 
| 89 | 
            +
                  "mean_token_accuracy": 0.6215168476104737,
         | 
| 90 | 
            +
                  "num_tokens": 101761.0,
         | 
| 91 | 
            +
                  "step": 90
         | 
| 92 | 
            +
                },
         | 
| 93 | 
            +
                {
         | 
| 94 | 
            +
                  "epoch": 0.5263157894736842,
         | 
| 95 | 
            +
                  "grad_norm": 3.2805733680725098,
         | 
| 96 | 
            +
                  "learning_rate": 1.6526315789473686e-05,
         | 
| 97 | 
            +
                  "loss": 1.5016,
         | 
| 98 | 
            +
                  "mean_token_accuracy": 0.6094469010829926,
         | 
| 99 | 
            +
                  "num_tokens": 112833.0,
         | 
| 100 | 
            +
                  "step": 100
         | 
| 101 | 
            +
                },
         | 
| 102 | 
            +
                {
         | 
| 103 | 
            +
                  "epoch": 0.5789473684210527,
         | 
| 104 | 
            +
                  "grad_norm": 3.5279650688171387,
         | 
| 105 | 
            +
                  "learning_rate": 1.6175438596491228e-05,
         | 
| 106 | 
            +
                  "loss": 1.4591,
         | 
| 107 | 
            +
                  "mean_token_accuracy": 0.6197056353092194,
         | 
| 108 | 
            +
                  "num_tokens": 123982.0,
         | 
| 109 | 
            +
                  "step": 110
         | 
| 110 | 
            +
                },
         | 
| 111 | 
            +
                {
         | 
| 112 | 
            +
                  "epoch": 0.631578947368421,
         | 
| 113 | 
            +
                  "grad_norm": 3.697514057159424,
         | 
| 114 | 
            +
                  "learning_rate": 1.5824561403508774e-05,
         | 
| 115 | 
            +
                  "loss": 1.4903,
         | 
| 116 | 
            +
                  "mean_token_accuracy": 0.6157862544059753,
         | 
| 117 | 
            +
                  "num_tokens": 135281.0,
         | 
| 118 | 
            +
                  "step": 120
         | 
| 119 | 
            +
                },
         | 
| 120 | 
            +
                {
         | 
| 121 | 
            +
                  "epoch": 0.6842105263157895,
         | 
| 122 | 
            +
                  "grad_norm": 3.5749077796936035,
         | 
| 123 | 
            +
                  "learning_rate": 1.5473684210526316e-05,
         | 
| 124 | 
            +
                  "loss": 1.4905,
         | 
| 125 | 
            +
                  "mean_token_accuracy": 0.6141646564006805,
         | 
| 126 | 
            +
                  "num_tokens": 146813.0,
         | 
| 127 | 
            +
                  "step": 130
         | 
| 128 | 
            +
                },
         | 
| 129 | 
            +
                {
         | 
| 130 | 
            +
                  "epoch": 0.7368421052631579,
         | 
| 131 | 
            +
                  "grad_norm": 3.9152824878692627,
         | 
| 132 | 
            +
                  "learning_rate": 1.5122807017543861e-05,
         | 
| 133 | 
            +
                  "loss": 1.4288,
         | 
| 134 | 
            +
                  "mean_token_accuracy": 0.6200086832046509,
         | 
| 135 | 
            +
                  "num_tokens": 157970.0,
         | 
| 136 | 
            +
                  "step": 140
         | 
| 137 | 
            +
                },
         | 
| 138 | 
            +
                {
         | 
| 139 | 
            +
                  "epoch": 0.7894736842105263,
         | 
| 140 | 
            +
                  "grad_norm": 3.2869362831115723,
         | 
| 141 | 
            +
                  "learning_rate": 1.4771929824561405e-05,
         | 
| 142 | 
            +
                  "loss": 1.4484,
         | 
| 143 | 
            +
                  "mean_token_accuracy": 0.6214825510978699,
         | 
| 144 | 
            +
                  "num_tokens": 169217.0,
         | 
| 145 | 
            +
                  "step": 150
         | 
| 146 | 
            +
                },
         | 
| 147 | 
            +
                {
         | 
| 148 | 
            +
                  "epoch": 0.8421052631578947,
         | 
| 149 | 
            +
                  "grad_norm": 3.530872106552124,
         | 
| 150 | 
            +
                  "learning_rate": 1.4421052631578948e-05,
         | 
| 151 | 
            +
                  "loss": 1.4638,
         | 
| 152 | 
            +
                  "mean_token_accuracy": 0.6164202988147736,
         | 
| 153 | 
            +
                  "num_tokens": 180220.0,
         | 
| 154 | 
            +
                  "step": 160
         | 
| 155 | 
            +
                },
         | 
| 156 | 
            +
                {
         | 
| 157 | 
            +
                  "epoch": 0.8947368421052632,
         | 
| 158 | 
            +
                  "grad_norm": 3.1449363231658936,
         | 
| 159 | 
            +
                  "learning_rate": 1.4070175438596492e-05,
         | 
| 160 | 
            +
                  "loss": 1.4679,
         | 
| 161 | 
            +
                  "mean_token_accuracy": 0.6190443515777588,
         | 
| 162 | 
            +
                  "num_tokens": 192002.0,
         | 
| 163 | 
            +
                  "step": 170
         | 
| 164 | 
            +
                },
         | 
| 165 | 
            +
                {
         | 
| 166 | 
            +
                  "epoch": 0.9473684210526315,
         | 
| 167 | 
            +
                  "grad_norm": 3.275167942047119,
         | 
| 168 | 
            +
                  "learning_rate": 1.3719298245614037e-05,
         | 
| 169 | 
            +
                  "loss": 1.421,
         | 
| 170 | 
            +
                  "mean_token_accuracy": 0.6280941605567932,
         | 
| 171 | 
            +
                  "num_tokens": 203462.0,
         | 
| 172 | 
            +
                  "step": 180
         | 
| 173 | 
            +
                },
         | 
| 174 | 
            +
                {
         | 
| 175 | 
            +
                  "epoch": 1.0,
         | 
| 176 | 
            +
                  "grad_norm": 4.7444939613342285,
         | 
| 177 | 
            +
                  "learning_rate": 1.336842105263158e-05,
         | 
| 178 | 
            +
                  "loss": 1.4532,
         | 
| 179 | 
            +
                  "mean_token_accuracy": 0.6205313920974731,
         | 
| 180 | 
            +
                  "num_tokens": 214070.0,
         | 
| 181 | 
            +
                  "step": 190
         | 
| 182 | 
            +
                },
         | 
| 183 | 
            +
                {
         | 
| 184 | 
            +
                  "epoch": 1.0526315789473684,
         | 
| 185 | 
            +
                  "grad_norm": 3.492197275161743,
         | 
| 186 | 
            +
                  "learning_rate": 1.3017543859649125e-05,
         | 
| 187 | 
            +
                  "loss": 0.8224,
         | 
| 188 | 
            +
                  "mean_token_accuracy": 0.7620033979415893,
         | 
| 189 | 
            +
                  "num_tokens": 225485.0,
         | 
| 190 | 
            +
                  "step": 200
         | 
| 191 | 
            +
                },
         | 
| 192 | 
            +
                {
         | 
| 193 | 
            +
                  "epoch": 1.1052631578947367,
         | 
| 194 | 
            +
                  "grad_norm": 2.8766427040100098,
         | 
| 195 | 
            +
                  "learning_rate": 1.2666666666666667e-05,
         | 
| 196 | 
            +
                  "loss": 0.7886,
         | 
| 197 | 
            +
                  "mean_token_accuracy": 0.7641067266464233,
         | 
| 198 | 
            +
                  "num_tokens": 236739.0,
         | 
| 199 | 
            +
                  "step": 210
         | 
| 200 | 
            +
                },
         | 
| 201 | 
            +
                {
         | 
| 202 | 
            +
                  "epoch": 1.1578947368421053,
         | 
| 203 | 
            +
                  "grad_norm": 3.0320887565612793,
         | 
| 204 | 
            +
                  "learning_rate": 1.2315789473684212e-05,
         | 
| 205 | 
            +
                  "loss": 0.7586,
         | 
| 206 | 
            +
                  "mean_token_accuracy": 0.7756024956703186,
         | 
| 207 | 
            +
                  "num_tokens": 248217.0,
         | 
| 208 | 
            +
                  "step": 220
         | 
| 209 | 
            +
                },
         | 
| 210 | 
            +
                {
         | 
| 211 | 
            +
                  "epoch": 1.2105263157894737,
         | 
| 212 | 
            +
                  "grad_norm": 3.0771596431732178,
         | 
| 213 | 
            +
                  "learning_rate": 1.1964912280701754e-05,
         | 
| 214 | 
            +
                  "loss": 0.8016,
         | 
| 215 | 
            +
                  "mean_token_accuracy": 0.7638419806957245,
         | 
| 216 | 
            +
                  "num_tokens": 259404.0,
         | 
| 217 | 
            +
                  "step": 230
         | 
| 218 | 
            +
                },
         | 
| 219 | 
            +
                {
         | 
| 220 | 
            +
                  "epoch": 1.263157894736842,
         | 
| 221 | 
            +
                  "grad_norm": 3.5975608825683594,
         | 
| 222 | 
            +
                  "learning_rate": 1.16140350877193e-05,
         | 
| 223 | 
            +
                  "loss": 0.7888,
         | 
| 224 | 
            +
                  "mean_token_accuracy": 0.7639639139175415,
         | 
| 225 | 
            +
                  "num_tokens": 270555.0,
         | 
| 226 | 
            +
                  "step": 240
         | 
| 227 | 
            +
                },
         | 
| 228 | 
            +
                {
         | 
| 229 | 
            +
                  "epoch": 1.3157894736842106,
         | 
| 230 | 
            +
                  "grad_norm": 3.4842450618743896,
         | 
| 231 | 
            +
                  "learning_rate": 1.1263157894736843e-05,
         | 
| 232 | 
            +
                  "loss": 0.7854,
         | 
| 233 | 
            +
                  "mean_token_accuracy": 0.7633719563484191,
         | 
| 234 | 
            +
                  "num_tokens": 281976.0,
         | 
| 235 | 
            +
                  "step": 250
         | 
| 236 | 
            +
                },
         | 
| 237 | 
            +
                {
         | 
| 238 | 
            +
                  "epoch": 1.368421052631579,
         | 
| 239 | 
            +
                  "grad_norm": 3.1366794109344482,
         | 
| 240 | 
            +
                  "learning_rate": 1.0912280701754387e-05,
         | 
| 241 | 
            +
                  "loss": 0.7904,
         | 
| 242 | 
            +
                  "mean_token_accuracy": 0.7618038773536682,
         | 
| 243 | 
            +
                  "num_tokens": 293356.0,
         | 
| 244 | 
            +
                  "step": 260
         | 
| 245 | 
            +
                },
         | 
| 246 | 
            +
                {
         | 
| 247 | 
            +
                  "epoch": 1.4210526315789473,
         | 
| 248 | 
            +
                  "grad_norm": 3.246885299682617,
         | 
| 249 | 
            +
                  "learning_rate": 1.056140350877193e-05,
         | 
| 250 | 
            +
                  "loss": 0.7745,
         | 
| 251 | 
            +
                  "mean_token_accuracy": 0.7625904023647309,
         | 
| 252 | 
            +
                  "num_tokens": 304686.0,
         | 
| 253 | 
            +
                  "step": 270
         | 
| 254 | 
            +
                },
         | 
| 255 | 
            +
                {
         | 
| 256 | 
            +
                  "epoch": 1.4736842105263157,
         | 
| 257 | 
            +
                  "grad_norm": 3.398383140563965,
         | 
| 258 | 
            +
                  "learning_rate": 1.0210526315789476e-05,
         | 
| 259 | 
            +
                  "loss": 0.8135,
         | 
| 260 | 
            +
                  "mean_token_accuracy": 0.7576355338096619,
         | 
| 261 | 
            +
                  "num_tokens": 315735.0,
         | 
| 262 | 
            +
                  "step": 280
         | 
| 263 | 
            +
                },
         | 
| 264 | 
            +
                {
         | 
| 265 | 
            +
                  "epoch": 1.526315789473684,
         | 
| 266 | 
            +
                  "grad_norm": 3.1334409713745117,
         | 
| 267 | 
            +
                  "learning_rate": 9.859649122807018e-06,
         | 
| 268 | 
            +
                  "loss": 0.8007,
         | 
| 269 | 
            +
                  "mean_token_accuracy": 0.7634194076061249,
         | 
| 270 | 
            +
                  "num_tokens": 326982.0,
         | 
| 271 | 
            +
                  "step": 290
         | 
| 272 | 
            +
                },
         | 
| 273 | 
            +
                {
         | 
| 274 | 
            +
                  "epoch": 1.5789473684210527,
         | 
| 275 | 
            +
                  "grad_norm": 3.123152017593384,
         | 
| 276 | 
            +
                  "learning_rate": 9.508771929824562e-06,
         | 
| 277 | 
            +
                  "loss": 0.7878,
         | 
| 278 | 
            +
                  "mean_token_accuracy": 0.7639511227607727,
         | 
| 279 | 
            +
                  "num_tokens": 338160.0,
         | 
| 280 | 
            +
                  "step": 300
         | 
| 281 | 
            +
                },
         | 
| 282 | 
            +
                {
         | 
| 283 | 
            +
                  "epoch": 1.631578947368421,
         | 
| 284 | 
            +
                  "grad_norm": 3.388254404067993,
         | 
| 285 | 
            +
                  "learning_rate": 9.157894736842105e-06,
         | 
| 286 | 
            +
                  "loss": 0.7935,
         | 
| 287 | 
            +
                  "mean_token_accuracy": 0.7603401899337768,
         | 
| 288 | 
            +
                  "num_tokens": 349306.0,
         | 
| 289 | 
            +
                  "step": 310
         | 
| 290 | 
            +
                },
         | 
| 291 | 
            +
                {
         | 
| 292 | 
            +
                  "epoch": 1.6842105263157894,
         | 
| 293 | 
            +
                  "grad_norm": 3.4574532508850098,
         | 
| 294 | 
            +
                  "learning_rate": 8.807017543859649e-06,
         | 
| 295 | 
            +
                  "loss": 0.7912,
         | 
| 296 | 
            +
                  "mean_token_accuracy": 0.759734469652176,
         | 
| 297 | 
            +
                  "num_tokens": 360346.0,
         | 
| 298 | 
            +
                  "step": 320
         | 
| 299 | 
            +
                },
         | 
| 300 | 
            +
                {
         | 
| 301 | 
            +
                  "epoch": 1.736842105263158,
         | 
| 302 | 
            +
                  "grad_norm": 2.8878493309020996,
         | 
| 303 | 
            +
                  "learning_rate": 8.456140350877193e-06,
         | 
| 304 | 
            +
                  "loss": 0.7664,
         | 
| 305 | 
            +
                  "mean_token_accuracy": 0.7666481971740723,
         | 
| 306 | 
            +
                  "num_tokens": 371823.0,
         | 
| 307 | 
            +
                  "step": 330
         | 
| 308 | 
            +
                },
         | 
| 309 | 
            +
                {
         | 
| 310 | 
            +
                  "epoch": 1.7894736842105263,
         | 
| 311 | 
            +
                  "grad_norm": 3.632908582687378,
         | 
| 312 | 
            +
                  "learning_rate": 8.105263157894736e-06,
         | 
| 313 | 
            +
                  "loss": 0.7792,
         | 
| 314 | 
            +
                  "mean_token_accuracy": 0.7634695649147034,
         | 
| 315 | 
            +
                  "num_tokens": 383113.0,
         | 
| 316 | 
            +
                  "step": 340
         | 
| 317 | 
            +
                },
         | 
| 318 | 
            +
                {
         | 
| 319 | 
            +
                  "epoch": 1.8421052631578947,
         | 
| 320 | 
            +
                  "grad_norm": 3.0787177085876465,
         | 
| 321 | 
            +
                  "learning_rate": 7.754385964912282e-06,
         | 
| 322 | 
            +
                  "loss": 0.7973,
         | 
| 323 | 
            +
                  "mean_token_accuracy": 0.759224247932434,
         | 
| 324 | 
            +
                  "num_tokens": 394349.0,
         | 
| 325 | 
            +
                  "step": 350
         | 
| 326 | 
            +
                },
         | 
| 327 | 
            +
                {
         | 
| 328 | 
            +
                  "epoch": 1.8947368421052633,
         | 
| 329 | 
            +
                  "grad_norm": 3.434131622314453,
         | 
| 330 | 
            +
                  "learning_rate": 7.4035087719298246e-06,
         | 
| 331 | 
            +
                  "loss": 0.7968,
         | 
| 332 | 
            +
                  "mean_token_accuracy": 0.7612445116043091,
         | 
| 333 | 
            +
                  "num_tokens": 405857.0,
         | 
| 334 | 
            +
                  "step": 360
         | 
| 335 | 
            +
                },
         | 
| 336 | 
            +
                {
         | 
| 337 | 
            +
                  "epoch": 1.9473684210526314,
         | 
| 338 | 
            +
                  "grad_norm": 4.3988871574401855,
         | 
| 339 | 
            +
                  "learning_rate": 7.052631578947369e-06,
         | 
| 340 | 
            +
                  "loss": 0.7902,
         | 
| 341 | 
            +
                  "mean_token_accuracy": 0.7641420245170594,
         | 
| 342 | 
            +
                  "num_tokens": 416942.0,
         | 
| 343 | 
            +
                  "step": 370
         | 
| 344 | 
            +
                },
         | 
| 345 | 
            +
                {
         | 
| 346 | 
            +
                  "epoch": 2.0,
         | 
| 347 | 
            +
                  "grad_norm": 4.871621131896973,
         | 
| 348 | 
            +
                  "learning_rate": 6.701754385964913e-06,
         | 
| 349 | 
            +
                  "loss": 0.7826,
         | 
| 350 | 
            +
                  "mean_token_accuracy": 0.763578999042511,
         | 
| 351 | 
            +
                  "num_tokens": 428140.0,
         | 
| 352 | 
            +
                  "step": 380
         | 
| 353 | 
            +
                },
         | 
| 354 | 
            +
                {
         | 
| 355 | 
            +
                  "epoch": 2.0526315789473686,
         | 
| 356 | 
            +
                  "grad_norm": 3.33593487739563,
         | 
| 357 | 
            +
                  "learning_rate": 6.3508771929824565e-06,
         | 
| 358 | 
            +
                  "loss": 0.3547,
         | 
| 359 | 
            +
                  "mean_token_accuracy": 0.8930943846702576,
         | 
| 360 | 
            +
                  "num_tokens": 439280.0,
         | 
| 361 | 
            +
                  "step": 390
         | 
| 362 | 
            +
                },
         | 
| 363 | 
            +
                {
         | 
| 364 | 
            +
                  "epoch": 2.1052631578947367,
         | 
| 365 | 
            +
                  "grad_norm": 3.4659924507141113,
         | 
| 366 | 
            +
                  "learning_rate": 6e-06,
         | 
| 367 | 
            +
                  "loss": 0.3274,
         | 
| 368 | 
            +
                  "mean_token_accuracy": 0.8942540287971497,
         | 
| 369 | 
            +
                  "num_tokens": 450443.0,
         | 
| 370 | 
            +
                  "step": 400
         | 
| 371 | 
            +
                },
         | 
| 372 | 
            +
                {
         | 
| 373 | 
            +
                  "epoch": 2.1578947368421053,
         | 
| 374 | 
            +
                  "grad_norm": 3.334516763687134,
         | 
| 375 | 
            +
                  "learning_rate": 5.649122807017544e-06,
         | 
| 376 | 
            +
                  "loss": 0.2965,
         | 
| 377 | 
            +
                  "mean_token_accuracy": 0.9032998859882355,
         | 
| 378 | 
            +
                  "num_tokens": 461774.0,
         | 
| 379 | 
            +
                  "step": 410
         | 
| 380 | 
            +
                },
         | 
| 381 | 
            +
                {
         | 
| 382 | 
            +
                  "epoch": 2.2105263157894735,
         | 
| 383 | 
            +
                  "grad_norm": 3.0437450408935547,
         | 
| 384 | 
            +
                  "learning_rate": 5.298245614035088e-06,
         | 
| 385 | 
            +
                  "loss": 0.2991,
         | 
| 386 | 
            +
                  "mean_token_accuracy": 0.9052271246910095,
         | 
| 387 | 
            +
                  "num_tokens": 473068.0,
         | 
| 388 | 
            +
                  "step": 420
         | 
| 389 | 
            +
                },
         | 
| 390 | 
            +
                {
         | 
| 391 | 
            +
                  "epoch": 2.263157894736842,
         | 
| 392 | 
            +
                  "grad_norm": 3.4685380458831787,
         | 
| 393 | 
            +
                  "learning_rate": 4.947368421052632e-06,
         | 
| 394 | 
            +
                  "loss": 0.3096,
         | 
| 395 | 
            +
                  "mean_token_accuracy": 0.900404566526413,
         | 
| 396 | 
            +
                  "num_tokens": 484122.0,
         | 
| 397 | 
            +
                  "step": 430
         | 
| 398 | 
            +
                },
         | 
| 399 | 
            +
                {
         | 
| 400 | 
            +
                  "epoch": 2.3157894736842106,
         | 
| 401 | 
            +
                  "grad_norm": 3.8948161602020264,
         | 
| 402 | 
            +
                  "learning_rate": 4.596491228070176e-06,
         | 
| 403 | 
            +
                  "loss": 0.2997,
         | 
| 404 | 
            +
                  "mean_token_accuracy": 0.901084166765213,
         | 
| 405 | 
            +
                  "num_tokens": 495492.0,
         | 
| 406 | 
            +
                  "step": 440
         | 
| 407 | 
            +
                },
         | 
| 408 | 
            +
                {
         | 
| 409 | 
            +
                  "epoch": 2.3684210526315788,
         | 
| 410 | 
            +
                  "grad_norm": 3.75742506980896,
         | 
| 411 | 
            +
                  "learning_rate": 4.245614035087719e-06,
         | 
| 412 | 
            +
                  "loss": 0.3066,
         | 
| 413 | 
            +
                  "mean_token_accuracy": 0.901238763332367,
         | 
| 414 | 
            +
                  "num_tokens": 507138.0,
         | 
| 415 | 
            +
                  "step": 450
         | 
| 416 | 
            +
                },
         | 
| 417 | 
            +
                {
         | 
| 418 | 
            +
                  "epoch": 2.4210526315789473,
         | 
| 419 | 
            +
                  "grad_norm": 3.1433157920837402,
         | 
| 420 | 
            +
                  "learning_rate": 3.894736842105263e-06,
         | 
| 421 | 
            +
                  "loss": 0.3153,
         | 
| 422 | 
            +
                  "mean_token_accuracy": 0.899943882226944,
         | 
| 423 | 
            +
                  "num_tokens": 518246.0,
         | 
| 424 | 
            +
                  "step": 460
         | 
| 425 | 
            +
                },
         | 
| 426 | 
            +
                {
         | 
| 427 | 
            +
                  "epoch": 2.473684210526316,
         | 
| 428 | 
            +
                  "grad_norm": 3.025437831878662,
         | 
| 429 | 
            +
                  "learning_rate": 3.543859649122807e-06,
         | 
| 430 | 
            +
                  "loss": 0.2975,
         | 
| 431 | 
            +
                  "mean_token_accuracy": 0.9012304186820984,
         | 
| 432 | 
            +
                  "num_tokens": 529617.0,
         | 
| 433 | 
            +
                  "step": 470
         | 
| 434 | 
            +
                },
         | 
| 435 | 
            +
                {
         | 
| 436 | 
            +
                  "epoch": 2.526315789473684,
         | 
| 437 | 
            +
                  "grad_norm": 3.873894453048706,
         | 
| 438 | 
            +
                  "learning_rate": 3.192982456140351e-06,
         | 
| 439 | 
            +
                  "loss": 0.2858,
         | 
| 440 | 
            +
                  "mean_token_accuracy": 0.9065197288990021,
         | 
| 441 | 
            +
                  "num_tokens": 540576.0,
         | 
| 442 | 
            +
                  "step": 480
         | 
| 443 | 
            +
                },
         | 
| 444 | 
            +
                {
         | 
| 445 | 
            +
                  "epoch": 2.5789473684210527,
         | 
| 446 | 
            +
                  "grad_norm": 3.2669241428375244,
         | 
| 447 | 
            +
                  "learning_rate": 2.842105263157895e-06,
         | 
| 448 | 
            +
                  "loss": 0.2911,
         | 
| 449 | 
            +
                  "mean_token_accuracy": 0.9069616377353669,
         | 
| 450 | 
            +
                  "num_tokens": 552038.0,
         | 
| 451 | 
            +
                  "step": 490
         | 
| 452 | 
            +
                },
         | 
| 453 | 
            +
                {
         | 
| 454 | 
            +
                  "epoch": 2.6315789473684212,
         | 
| 455 | 
            +
                  "grad_norm": 3.070552110671997,
         | 
| 456 | 
            +
                  "learning_rate": 2.4912280701754387e-06,
         | 
| 457 | 
            +
                  "loss": 0.2875,
         | 
| 458 | 
            +
                  "mean_token_accuracy": 0.9050689458847045,
         | 
| 459 | 
            +
                  "num_tokens": 563396.0,
         | 
| 460 | 
            +
                  "step": 500
         | 
| 461 | 
            +
                },
         | 
| 462 | 
            +
                {
         | 
| 463 | 
            +
                  "epoch": 2.6842105263157894,
         | 
| 464 | 
            +
                  "grad_norm": 3.8535304069519043,
         | 
| 465 | 
            +
                  "learning_rate": 2.1403508771929828e-06,
         | 
| 466 | 
            +
                  "loss": 0.3166,
         | 
| 467 | 
            +
                  "mean_token_accuracy": 0.8962113857269287,
         | 
| 468 | 
            +
                  "num_tokens": 574901.0,
         | 
| 469 | 
            +
                  "step": 510
         | 
| 470 | 
            +
                },
         | 
| 471 | 
            +
                {
         | 
| 472 | 
            +
                  "epoch": 2.736842105263158,
         | 
| 473 | 
            +
                  "grad_norm": 3.335392951965332,
         | 
| 474 | 
            +
                  "learning_rate": 1.7894736842105265e-06,
         | 
| 475 | 
            +
                  "loss": 0.2991,
         | 
| 476 | 
            +
                  "mean_token_accuracy": 0.9042251408100128,
         | 
| 477 | 
            +
                  "num_tokens": 586440.0,
         | 
| 478 | 
            +
                  "step": 520
         | 
| 479 | 
            +
                },
         | 
| 480 | 
            +
                {
         | 
| 481 | 
            +
                  "epoch": 2.7894736842105265,
         | 
| 482 | 
            +
                  "grad_norm": 3.1564080715179443,
         | 
| 483 | 
            +
                  "learning_rate": 1.4385964912280704e-06,
         | 
| 484 | 
            +
                  "loss": 0.2834,
         | 
| 485 | 
            +
                  "mean_token_accuracy": 0.9084087669849396,
         | 
| 486 | 
            +
                  "num_tokens": 597950.0,
         | 
| 487 | 
            +
                  "step": 530
         | 
| 488 | 
            +
                },
         | 
| 489 | 
            +
                {
         | 
| 490 | 
            +
                  "epoch": 2.8421052631578947,
         | 
| 491 | 
            +
                  "grad_norm": 3.4902498722076416,
         | 
| 492 | 
            +
                  "learning_rate": 1.0877192982456142e-06,
         | 
| 493 | 
            +
                  "loss": 0.296,
         | 
| 494 | 
            +
                  "mean_token_accuracy": 0.904336941242218,
         | 
| 495 | 
            +
                  "num_tokens": 608908.0,
         | 
| 496 | 
            +
                  "step": 540
         | 
| 497 | 
            +
                },
         | 
| 498 | 
            +
                {
         | 
| 499 | 
            +
                  "epoch": 2.8947368421052633,
         | 
| 500 | 
            +
                  "grad_norm": 3.7762794494628906,
         | 
| 501 | 
            +
                  "learning_rate": 7.368421052631579e-07,
         | 
| 502 | 
            +
                  "loss": 0.2876,
         | 
| 503 | 
            +
                  "mean_token_accuracy": 0.9060073494911194,
         | 
| 504 | 
            +
                  "num_tokens": 619855.0,
         | 
| 505 | 
            +
                  "step": 550
         | 
| 506 | 
            +
                },
         | 
| 507 | 
            +
                {
         | 
| 508 | 
            +
                  "epoch": 2.9473684210526314,
         | 
| 509 | 
            +
                  "grad_norm": 3.163846969604492,
         | 
| 510 | 
            +
                  "learning_rate": 3.859649122807018e-07,
         | 
| 511 | 
            +
                  "loss": 0.2582,
         | 
| 512 | 
            +
                  "mean_token_accuracy": 0.9169736564159393,
         | 
| 513 | 
            +
                  "num_tokens": 631493.0,
         | 
| 514 | 
            +
                  "step": 560
         | 
| 515 | 
            +
                },
         | 
| 516 | 
            +
                {
         | 
| 517 | 
            +
                  "epoch": 3.0,
         | 
| 518 | 
            +
                  "grad_norm": 4.375433921813965,
         | 
| 519 | 
            +
                  "learning_rate": 3.508771929824562e-08,
         | 
| 520 | 
            +
                  "loss": 0.2803,
         | 
| 521 | 
            +
                  "mean_token_accuracy": 0.911361163854599,
         | 
| 522 | 
            +
                  "num_tokens": 642210.0,
         | 
| 523 | 
            +
                  "step": 570
         | 
| 524 | 
            +
                }
         | 
| 525 | 
            +
              ],
         | 
| 526 | 
            +
              "logging_steps": 10,
         | 
| 527 | 
            +
              "max_steps": 570,
         | 
| 528 | 
            +
              "num_input_tokens_seen": 0,
         | 
| 529 | 
            +
              "num_train_epochs": 3,
         | 
| 530 | 
            +
              "save_steps": 500,
         | 
| 531 | 
            +
              "stateful_callbacks": {
         | 
| 532 | 
            +
                "TrainerControl": {
         | 
| 533 | 
            +
                  "args": {
         | 
| 534 | 
            +
                    "should_epoch_stop": false,
         | 
| 535 | 
            +
                    "should_evaluate": false,
         | 
| 536 | 
            +
                    "should_log": false,
         | 
| 537 | 
            +
                    "should_save": true,
         | 
| 538 | 
            +
                    "should_training_stop": true
         | 
| 539 | 
            +
                  },
         | 
| 540 | 
            +
                  "attributes": {}
         | 
| 541 | 
            +
                }
         | 
| 542 | 
            +
              },
         | 
| 543 | 
            +
              "total_flos": 3.862484283808973e+16,
         | 
| 544 | 
            +
              "train_batch_size": 8,
         | 
| 545 | 
            +
              "trial_name": null,
         | 
| 546 | 
            +
              "trial_params": null
         | 
| 547 | 
            +
            }
         | 
    	
        checkpoint-570/training_args.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:fffaee687bcf434da66f399553b6e3a647d1ffa26a027aa9caf4841c6cfe086c
         | 
| 3 | 
            +
            size 6161
         | 
    	
        checkpoint-570/vocab.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        config.json
    ADDED
    
    | @@ -0,0 +1,68 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "architectures": [
         | 
| 3 | 
            +
                "Qwen3ForCausalLM"
         | 
| 4 | 
            +
              ],
         | 
| 5 | 
            +
              "attention_bias": false,
         | 
| 6 | 
            +
              "attention_dropout": 0.0,
         | 
| 7 | 
            +
              "bos_token_id": 151643,
         | 
| 8 | 
            +
              "eos_token_id": 151645,
         | 
| 9 | 
            +
              "head_dim": 128,
         | 
| 10 | 
            +
              "hidden_act": "silu",
         | 
| 11 | 
            +
              "hidden_size": 4096,
         | 
| 12 | 
            +
              "initializer_range": 0.02,
         | 
| 13 | 
            +
              "intermediate_size": 12288,
         | 
| 14 | 
            +
              "layer_types": [
         | 
| 15 | 
            +
                "full_attention",
         | 
| 16 | 
            +
                "full_attention",
         | 
| 17 | 
            +
                "full_attention",
         | 
| 18 | 
            +
                "full_attention",
         | 
| 19 | 
            +
                "full_attention",
         | 
| 20 | 
            +
                "full_attention",
         | 
| 21 | 
            +
                "full_attention",
         | 
| 22 | 
            +
                "full_attention",
         | 
| 23 | 
            +
                "full_attention",
         | 
| 24 | 
            +
                "full_attention",
         | 
| 25 | 
            +
                "full_attention",
         | 
| 26 | 
            +
                "full_attention",
         | 
| 27 | 
            +
                "full_attention",
         | 
| 28 | 
            +
                "full_attention",
         | 
| 29 | 
            +
                "full_attention",
         | 
| 30 | 
            +
                "full_attention",
         | 
| 31 | 
            +
                "full_attention",
         | 
| 32 | 
            +
                "full_attention",
         | 
| 33 | 
            +
                "full_attention",
         | 
| 34 | 
            +
                "full_attention",
         | 
| 35 | 
            +
                "full_attention",
         | 
| 36 | 
            +
                "full_attention",
         | 
| 37 | 
            +
                "full_attention",
         | 
| 38 | 
            +
                "full_attention",
         | 
| 39 | 
            +
                "full_attention",
         | 
| 40 | 
            +
                "full_attention",
         | 
| 41 | 
            +
                "full_attention",
         | 
| 42 | 
            +
                "full_attention",
         | 
| 43 | 
            +
                "full_attention",
         | 
| 44 | 
            +
                "full_attention",
         | 
| 45 | 
            +
                "full_attention",
         | 
| 46 | 
            +
                "full_attention",
         | 
| 47 | 
            +
                "full_attention",
         | 
| 48 | 
            +
                "full_attention",
         | 
| 49 | 
            +
                "full_attention",
         | 
| 50 | 
            +
                "full_attention"
         | 
| 51 | 
            +
              ],
         | 
| 52 | 
            +
              "max_position_embeddings": 40960,
         | 
| 53 | 
            +
              "max_window_layers": 36,
         | 
| 54 | 
            +
              "model_type": "qwen3",
         | 
| 55 | 
            +
              "num_attention_heads": 32,
         | 
| 56 | 
            +
              "num_hidden_layers": 36,
         | 
| 57 | 
            +
              "num_key_value_heads": 8,
         | 
| 58 | 
            +
              "rms_norm_eps": 1e-06,
         | 
| 59 | 
            +
              "rope_scaling": null,
         | 
| 60 | 
            +
              "rope_theta": 1000000,
         | 
| 61 | 
            +
              "sliding_window": null,
         | 
| 62 | 
            +
              "tie_word_embeddings": false,
         | 
| 63 | 
            +
              "torch_dtype": "float32",
         | 
| 64 | 
            +
              "transformers_version": "4.53.0",
         | 
| 65 | 
            +
              "use_cache": true,
         | 
| 66 | 
            +
              "use_sliding_window": false,
         | 
| 67 | 
            +
              "vocab_size": 151936
         | 
| 68 | 
            +
            }
         | 
    	
        generation_config.json
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "bos_token_id": 151643,
         | 
| 3 | 
            +
              "do_sample": true,
         | 
| 4 | 
            +
              "eos_token_id": [
         | 
| 5 | 
            +
                151645,
         | 
| 6 | 
            +
                151643
         | 
| 7 | 
            +
              ],
         | 
| 8 | 
            +
              "pad_token_id": 151643,
         | 
| 9 | 
            +
              "temperature": 0.6,
         | 
| 10 | 
            +
              "top_k": 20,
         | 
| 11 | 
            +
              "top_p": 0.95,
         | 
| 12 | 
            +
              "transformers_version": "4.53.0"
         | 
| 13 | 
            +
            }
         | 
    	
        merges.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        model-00001-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2377228dac033fa3ab97988d1c941a87c54a9374043c8325613c46dee547ab60
         | 
| 3 | 
            +
            size 4972454376
         | 
    	
        model-00002-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:df68711b51929151c16b6c14f2f1d02763922cdf8e51bfee69c54093ff45f450
         | 
| 3 | 
            +
            size 4832048608
         | 
    	
        model-00003-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:db975b3aa0e55a6469664a6ce4b4663c61ed6f8eca00eeece7ef835d72bc428d
         | 
| 3 | 
            +
            size 4832048656
         | 
    	
        model-00006-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4b824a72773ac74525a1d71004db221e1779ef5ee6e2f0f103385d986695043f
         | 
| 3 | 
            +
            size 4832048672
         | 
    	
        model-00007-of-00007.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:81c183a54b2f94bab76c992036ec9d20e8edfcd7752711f5ae889cc69f5f36ba
         | 
| 3 | 
            +
            size 3462482728
         | 
    	
        model.safetensors.index.json
    ADDED
    
    | @@ -0,0 +1,407 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "metadata": {
         | 
| 3 | 
            +
                "total_parameters": 8190735360,
         | 
| 4 | 
            +
                "total_size": 32762941440
         | 
| 5 | 
            +
              },
         | 
| 6 | 
            +
              "weight_map": {
         | 
| 7 | 
            +
                "lm_head.weight": "model-00007-of-00007.safetensors",
         | 
| 8 | 
            +
                "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
         | 
| 9 | 
            +
                "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 10 | 
            +
                "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 11 | 
            +
                "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 12 | 
            +
                "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 13 | 
            +
                "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 14 | 
            +
                "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 15 | 
            +
                "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 16 | 
            +
                "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 17 | 
            +
                "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 18 | 
            +
                "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 19 | 
            +
                "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 20 | 
            +
                "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 21 | 
            +
                "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 22 | 
            +
                "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 23 | 
            +
                "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 24 | 
            +
                "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 25 | 
            +
                "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 26 | 
            +
                "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 27 | 
            +
                "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 28 | 
            +
                "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 29 | 
            +
                "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 30 | 
            +
                "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 31 | 
            +
                "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 32 | 
            +
                "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 33 | 
            +
                "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 34 | 
            +
                "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 35 | 
            +
                "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 36 | 
            +
                "model.layers.10.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 37 | 
            +
                "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 38 | 
            +
                "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 39 | 
            +
                "model.layers.10.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 40 | 
            +
                "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 41 | 
            +
                "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 42 | 
            +
                "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 43 | 
            +
                "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 44 | 
            +
                "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 45 | 
            +
                "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 46 | 
            +
                "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 47 | 
            +
                "model.layers.11.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 48 | 
            +
                "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 49 | 
            +
                "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 50 | 
            +
                "model.layers.11.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 51 | 
            +
                "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 52 | 
            +
                "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 53 | 
            +
                "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 54 | 
            +
                "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 55 | 
            +
                "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 56 | 
            +
                "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 57 | 
            +
                "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 58 | 
            +
                "model.layers.12.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 59 | 
            +
                "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 60 | 
            +
                "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 61 | 
            +
                "model.layers.12.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 62 | 
            +
                "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 63 | 
            +
                "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 64 | 
            +
                "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 65 | 
            +
                "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 66 | 
            +
                "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 67 | 
            +
                "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 68 | 
            +
                "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 69 | 
            +
                "model.layers.13.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 70 | 
            +
                "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 71 | 
            +
                "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 72 | 
            +
                "model.layers.13.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 73 | 
            +
                "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 74 | 
            +
                "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 75 | 
            +
                "model.layers.14.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 76 | 
            +
                "model.layers.14.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 77 | 
            +
                "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 78 | 
            +
                "model.layers.14.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 79 | 
            +
                "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 80 | 
            +
                "model.layers.14.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 81 | 
            +
                "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 82 | 
            +
                "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 83 | 
            +
                "model.layers.14.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 84 | 
            +
                "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 85 | 
            +
                "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 86 | 
            +
                "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 87 | 
            +
                "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 88 | 
            +
                "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 89 | 
            +
                "model.layers.15.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 90 | 
            +
                "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 91 | 
            +
                "model.layers.15.self_attn.k_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 92 | 
            +
                "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 93 | 
            +
                "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 94 | 
            +
                "model.layers.15.self_attn.q_norm.weight": "model-00003-of-00007.safetensors",
         | 
| 95 | 
            +
                "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 96 | 
            +
                "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 97 | 
            +
                "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 98 | 
            +
                "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 99 | 
            +
                "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 100 | 
            +
                "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 101 | 
            +
                "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 102 | 
            +
                "model.layers.16.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 103 | 
            +
                "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 104 | 
            +
                "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 105 | 
            +
                "model.layers.16.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 106 | 
            +
                "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 107 | 
            +
                "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 108 | 
            +
                "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 109 | 
            +
                "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 110 | 
            +
                "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 111 | 
            +
                "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 112 | 
            +
                "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 113 | 
            +
                "model.layers.17.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 114 | 
            +
                "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 115 | 
            +
                "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 116 | 
            +
                "model.layers.17.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 117 | 
            +
                "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 118 | 
            +
                "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 119 | 
            +
                "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 120 | 
            +
                "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 121 | 
            +
                "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 122 | 
            +
                "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 123 | 
            +
                "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 124 | 
            +
                "model.layers.18.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 125 | 
            +
                "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 126 | 
            +
                "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 127 | 
            +
                "model.layers.18.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 128 | 
            +
                "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 129 | 
            +
                "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 130 | 
            +
                "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 131 | 
            +
                "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 132 | 
            +
                "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 133 | 
            +
                "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 134 | 
            +
                "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 135 | 
            +
                "model.layers.19.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 136 | 
            +
                "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 137 | 
            +
                "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 138 | 
            +
                "model.layers.19.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 139 | 
            +
                "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 140 | 
            +
                "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 141 | 
            +
                "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 142 | 
            +
                "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 143 | 
            +
                "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 144 | 
            +
                "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 145 | 
            +
                "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
         | 
| 146 | 
            +
                "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 147 | 
            +
                "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 148 | 
            +
                "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 149 | 
            +
                "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 150 | 
            +
                "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 151 | 
            +
                "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 152 | 
            +
                "model.layers.20.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 153 | 
            +
                "model.layers.20.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 154 | 
            +
                "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 155 | 
            +
                "model.layers.20.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 156 | 
            +
                "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 157 | 
            +
                "model.layers.20.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 158 | 
            +
                "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 159 | 
            +
                "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 160 | 
            +
                "model.layers.20.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 161 | 
            +
                "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 162 | 
            +
                "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 163 | 
            +
                "model.layers.21.input_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 164 | 
            +
                "model.layers.21.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 165 | 
            +
                "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 166 | 
            +
                "model.layers.21.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 167 | 
            +
                "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
         | 
| 168 | 
            +
                "model.layers.21.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 169 | 
            +
                "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 170 | 
            +
                "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 171 | 
            +
                "model.layers.21.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 172 | 
            +
                "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 173 | 
            +
                "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 174 | 
            +
                "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 175 | 
            +
                "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 176 | 
            +
                "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 177 | 
            +
                "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 178 | 
            +
                "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 179 | 
            +
                "model.layers.22.self_attn.k_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 180 | 
            +
                "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 181 | 
            +
                "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 182 | 
            +
                "model.layers.22.self_attn.q_norm.weight": "model-00004-of-00007.safetensors",
         | 
| 183 | 
            +
                "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 184 | 
            +
                "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
         | 
| 185 | 
            +
                "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 186 | 
            +
                "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 187 | 
            +
                "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 188 | 
            +
                "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 189 | 
            +
                "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 190 | 
            +
                "model.layers.23.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 191 | 
            +
                "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 192 | 
            +
                "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 193 | 
            +
                "model.layers.23.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 194 | 
            +
                "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 195 | 
            +
                "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 196 | 
            +
                "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 197 | 
            +
                "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 198 | 
            +
                "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 199 | 
            +
                "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 200 | 
            +
                "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 201 | 
            +
                "model.layers.24.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 202 | 
            +
                "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 203 | 
            +
                "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 204 | 
            +
                "model.layers.24.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 205 | 
            +
                "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 206 | 
            +
                "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 207 | 
            +
                "model.layers.25.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 208 | 
            +
                "model.layers.25.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 209 | 
            +
                "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 210 | 
            +
                "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 211 | 
            +
                "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 212 | 
            +
                "model.layers.25.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 213 | 
            +
                "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 214 | 
            +
                "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 215 | 
            +
                "model.layers.25.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 216 | 
            +
                "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 217 | 
            +
                "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 218 | 
            +
                "model.layers.26.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 219 | 
            +
                "model.layers.26.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 220 | 
            +
                "model.layers.26.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 221 | 
            +
                "model.layers.26.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 222 | 
            +
                "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 223 | 
            +
                "model.layers.26.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 224 | 
            +
                "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 225 | 
            +
                "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 226 | 
            +
                "model.layers.26.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 227 | 
            +
                "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 228 | 
            +
                "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 229 | 
            +
                "model.layers.27.input_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 230 | 
            +
                "model.layers.27.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 231 | 
            +
                "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 232 | 
            +
                "model.layers.27.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 233 | 
            +
                "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
         | 
| 234 | 
            +
                "model.layers.27.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 235 | 
            +
                "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 236 | 
            +
                "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 237 | 
            +
                "model.layers.27.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 238 | 
            +
                "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 239 | 
            +
                "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 240 | 
            +
                "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 241 | 
            +
                "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 242 | 
            +
                "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 243 | 
            +
                "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 244 | 
            +
                "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 245 | 
            +
                "model.layers.28.self_attn.k_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 246 | 
            +
                "model.layers.28.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 247 | 
            +
                "model.layers.28.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 248 | 
            +
                "model.layers.28.self_attn.q_norm.weight": "model-00005-of-00007.safetensors",
         | 
| 249 | 
            +
                "model.layers.28.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 250 | 
            +
                "model.layers.28.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
         | 
| 251 | 
            +
                "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 252 | 
            +
                "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 253 | 
            +
                "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 254 | 
            +
                "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 255 | 
            +
                "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 256 | 
            +
                "model.layers.29.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 257 | 
            +
                "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 258 | 
            +
                "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 259 | 
            +
                "model.layers.29.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 260 | 
            +
                "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 261 | 
            +
                "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 262 | 
            +
                "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 263 | 
            +
                "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 264 | 
            +
                "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 265 | 
            +
                "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 266 | 
            +
                "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 267 | 
            +
                "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 268 | 
            +
                "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 269 | 
            +
                "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 270 | 
            +
                "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00007.safetensors",
         | 
| 271 | 
            +
                "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 272 | 
            +
                "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
         | 
| 273 | 
            +
                "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 274 | 
            +
                "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 275 | 
            +
                "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 276 | 
            +
                "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 277 | 
            +
                "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 278 | 
            +
                "model.layers.30.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 279 | 
            +
                "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 280 | 
            +
                "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 281 | 
            +
                "model.layers.30.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 282 | 
            +
                "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 283 | 
            +
                "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 284 | 
            +
                "model.layers.31.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 285 | 
            +
                "model.layers.31.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 286 | 
            +
                "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 287 | 
            +
                "model.layers.31.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 288 | 
            +
                "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 289 | 
            +
                "model.layers.31.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 290 | 
            +
                "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 291 | 
            +
                "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 292 | 
            +
                "model.layers.31.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 293 | 
            +
                "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 294 | 
            +
                "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 295 | 
            +
                "model.layers.32.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 296 | 
            +
                "model.layers.32.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 297 | 
            +
                "model.layers.32.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 298 | 
            +
                "model.layers.32.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 299 | 
            +
                "model.layers.32.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 300 | 
            +
                "model.layers.32.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 301 | 
            +
                "model.layers.32.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 302 | 
            +
                "model.layers.32.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 303 | 
            +
                "model.layers.32.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 304 | 
            +
                "model.layers.32.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 305 | 
            +
                "model.layers.32.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 306 | 
            +
                "model.layers.33.input_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 307 | 
            +
                "model.layers.33.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 308 | 
            +
                "model.layers.33.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 309 | 
            +
                "model.layers.33.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 310 | 
            +
                "model.layers.33.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
         | 
| 311 | 
            +
                "model.layers.33.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 312 | 
            +
                "model.layers.33.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 313 | 
            +
                "model.layers.33.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 314 | 
            +
                "model.layers.33.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 315 | 
            +
                "model.layers.33.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 316 | 
            +
                "model.layers.33.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 317 | 
            +
                "model.layers.34.input_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 318 | 
            +
                "model.layers.34.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 319 | 
            +
                "model.layers.34.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 320 | 
            +
                "model.layers.34.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 321 | 
            +
                "model.layers.34.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 322 | 
            +
                "model.layers.34.self_attn.k_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 323 | 
            +
                "model.layers.34.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 324 | 
            +
                "model.layers.34.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 325 | 
            +
                "model.layers.34.self_attn.q_norm.weight": "model-00006-of-00007.safetensors",
         | 
| 326 | 
            +
                "model.layers.34.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 327 | 
            +
                "model.layers.34.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
         | 
| 328 | 
            +
                "model.layers.35.input_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 329 | 
            +
                "model.layers.35.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 330 | 
            +
                "model.layers.35.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 331 | 
            +
                "model.layers.35.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 332 | 
            +
                "model.layers.35.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
         | 
| 333 | 
            +
                "model.layers.35.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
         | 
| 334 | 
            +
                "model.layers.35.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 335 | 
            +
                "model.layers.35.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 336 | 
            +
                "model.layers.35.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
         | 
| 337 | 
            +
                "model.layers.35.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 338 | 
            +
                "model.layers.35.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
         | 
| 339 | 
            +
                "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 340 | 
            +
                "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 341 | 
            +
                "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 342 | 
            +
                "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 343 | 
            +
                "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 344 | 
            +
                "model.layers.4.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 345 | 
            +
                "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 346 | 
            +
                "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 347 | 
            +
                "model.layers.4.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 348 | 
            +
                "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 349 | 
            +
                "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 350 | 
            +
                "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 351 | 
            +
                "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 352 | 
            +
                "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 353 | 
            +
                "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 354 | 
            +
                "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 355 | 
            +
                "model.layers.5.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 356 | 
            +
                "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 357 | 
            +
                "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 358 | 
            +
                "model.layers.5.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 359 | 
            +
                "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 360 | 
            +
                "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 361 | 
            +
                "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 362 | 
            +
                "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 363 | 
            +
                "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 364 | 
            +
                "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 365 | 
            +
                "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 366 | 
            +
                "model.layers.6.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 367 | 
            +
                "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 368 | 
            +
                "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 369 | 
            +
                "model.layers.6.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 370 | 
            +
                "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 371 | 
            +
                "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 372 | 
            +
                "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 373 | 
            +
                "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 374 | 
            +
                "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 375 | 
            +
                "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 376 | 
            +
                "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 377 | 
            +
                "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 378 | 
            +
                "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 379 | 
            +
                "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 380 | 
            +
                "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 381 | 
            +
                "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 382 | 
            +
                "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 383 | 
            +
                "model.layers.8.input_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 384 | 
            +
                "model.layers.8.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 385 | 
            +
                "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 386 | 
            +
                "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 387 | 
            +
                "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
         | 
| 388 | 
            +
                "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 389 | 
            +
                "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 390 | 
            +
                "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 391 | 
            +
                "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 392 | 
            +
                "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 393 | 
            +
                "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 394 | 
            +
                "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 395 | 
            +
                "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 396 | 
            +
                "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 397 | 
            +
                "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
         | 
| 398 | 
            +
                "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
         | 
| 399 | 
            +
                "model.layers.9.self_attn.k_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 400 | 
            +
                "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 401 | 
            +
                "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 402 | 
            +
                "model.layers.9.self_attn.q_norm.weight": "model-00002-of-00007.safetensors",
         | 
| 403 | 
            +
                "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 404 | 
            +
                "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
         | 
| 405 | 
            +
                "model.norm.weight": "model-00007-of-00007.safetensors"
         | 
| 406 | 
            +
              }
         | 
| 407 | 
            +
            }
         | 
    	
        special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,31 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "additional_special_tokens": [
         | 
| 3 | 
            +
                "<|im_start|>",
         | 
| 4 | 
            +
                "<|im_end|>",
         | 
| 5 | 
            +
                "<|object_ref_start|>",
         | 
| 6 | 
            +
                "<|object_ref_end|>",
         | 
| 7 | 
            +
                "<|box_start|>",
         | 
| 8 | 
            +
                "<|box_end|>",
         | 
| 9 | 
            +
                "<|quad_start|>",
         | 
| 10 | 
            +
                "<|quad_end|>",
         | 
| 11 | 
            +
                "<|vision_start|>",
         | 
| 12 | 
            +
                "<|vision_end|>",
         | 
| 13 | 
            +
                "<|vision_pad|>",
         | 
| 14 | 
            +
                "<|image_pad|>",
         | 
| 15 | 
            +
                "<|video_pad|>"
         | 
| 16 | 
            +
              ],
         | 
| 17 | 
            +
              "eos_token": {
         | 
| 18 | 
            +
                "content": "<|im_end|>",
         | 
| 19 | 
            +
                "lstrip": false,
         | 
| 20 | 
            +
                "normalized": false,
         | 
| 21 | 
            +
                "rstrip": false,
         | 
| 22 | 
            +
                "single_word": false
         | 
| 23 | 
            +
              },
         | 
| 24 | 
            +
              "pad_token": {
         | 
| 25 | 
            +
                "content": "<|endoftext|>",
         | 
| 26 | 
            +
                "lstrip": false,
         | 
| 27 | 
            +
                "normalized": false,
         | 
| 28 | 
            +
                "rstrip": false,
         | 
| 29 | 
            +
                "single_word": false
         | 
| 30 | 
            +
              }
         | 
| 31 | 
            +
            }
         | 
    	
        tokenizer.json
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
         | 
| 3 | 
            +
            size 11422654
         | 
    	
        tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,239 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "add_bos_token": false,
         | 
| 3 | 
            +
              "add_prefix_space": false,
         | 
| 4 | 
            +
              "added_tokens_decoder": {
         | 
| 5 | 
            +
                "151643": {
         | 
| 6 | 
            +
                  "content": "<|endoftext|>",
         | 
| 7 | 
            +
                  "lstrip": false,
         | 
| 8 | 
            +
                  "normalized": false,
         | 
| 9 | 
            +
                  "rstrip": false,
         | 
| 10 | 
            +
                  "single_word": false,
         | 
| 11 | 
            +
                  "special": true
         | 
| 12 | 
            +
                },
         | 
| 13 | 
            +
                "151644": {
         | 
| 14 | 
            +
                  "content": "<|im_start|>",
         | 
| 15 | 
            +
                  "lstrip": false,
         | 
| 16 | 
            +
                  "normalized": false,
         | 
| 17 | 
            +
                  "rstrip": false,
         | 
| 18 | 
            +
                  "single_word": false,
         | 
| 19 | 
            +
                  "special": true
         | 
| 20 | 
            +
                },
         | 
| 21 | 
            +
                "151645": {
         | 
| 22 | 
            +
                  "content": "<|im_end|>",
         | 
| 23 | 
            +
                  "lstrip": false,
         | 
| 24 | 
            +
                  "normalized": false,
         | 
| 25 | 
            +
                  "rstrip": false,
         | 
| 26 | 
            +
                  "single_word": false,
         | 
| 27 | 
            +
                  "special": true
         | 
| 28 | 
            +
                },
         | 
| 29 | 
            +
                "151646": {
         | 
| 30 | 
            +
                  "content": "<|object_ref_start|>",
         | 
| 31 | 
            +
                  "lstrip": false,
         | 
| 32 | 
            +
                  "normalized": false,
         | 
| 33 | 
            +
                  "rstrip": false,
         | 
| 34 | 
            +
                  "single_word": false,
         | 
| 35 | 
            +
                  "special": true
         | 
| 36 | 
            +
                },
         | 
| 37 | 
            +
                "151647": {
         | 
| 38 | 
            +
                  "content": "<|object_ref_end|>",
         | 
| 39 | 
            +
                  "lstrip": false,
         | 
| 40 | 
            +
                  "normalized": false,
         | 
| 41 | 
            +
                  "rstrip": false,
         | 
| 42 | 
            +
                  "single_word": false,
         | 
| 43 | 
            +
                  "special": true
         | 
| 44 | 
            +
                },
         | 
| 45 | 
            +
                "151648": {
         | 
| 46 | 
            +
                  "content": "<|box_start|>",
         | 
| 47 | 
            +
                  "lstrip": false,
         | 
| 48 | 
            +
                  "normalized": false,
         | 
| 49 | 
            +
                  "rstrip": false,
         | 
| 50 | 
            +
                  "single_word": false,
         | 
| 51 | 
            +
                  "special": true
         | 
| 52 | 
            +
                },
         | 
| 53 | 
            +
                "151649": {
         | 
| 54 | 
            +
                  "content": "<|box_end|>",
         | 
| 55 | 
            +
                  "lstrip": false,
         | 
| 56 | 
            +
                  "normalized": false,
         | 
| 57 | 
            +
                  "rstrip": false,
         | 
| 58 | 
            +
                  "single_word": false,
         | 
| 59 | 
            +
                  "special": true
         | 
| 60 | 
            +
                },
         | 
| 61 | 
            +
                "151650": {
         | 
| 62 | 
            +
                  "content": "<|quad_start|>",
         | 
| 63 | 
            +
                  "lstrip": false,
         | 
| 64 | 
            +
                  "normalized": false,
         | 
| 65 | 
            +
                  "rstrip": false,
         | 
| 66 | 
            +
                  "single_word": false,
         | 
| 67 | 
            +
                  "special": true
         | 
| 68 | 
            +
                },
         | 
| 69 | 
            +
                "151651": {
         | 
| 70 | 
            +
                  "content": "<|quad_end|>",
         | 
| 71 | 
            +
                  "lstrip": false,
         | 
| 72 | 
            +
                  "normalized": false,
         | 
| 73 | 
            +
                  "rstrip": false,
         | 
| 74 | 
            +
                  "single_word": false,
         | 
| 75 | 
            +
                  "special": true
         | 
| 76 | 
            +
                },
         | 
| 77 | 
            +
                "151652": {
         | 
| 78 | 
            +
                  "content": "<|vision_start|>",
         | 
| 79 | 
            +
                  "lstrip": false,
         | 
| 80 | 
            +
                  "normalized": false,
         | 
| 81 | 
            +
                  "rstrip": false,
         | 
| 82 | 
            +
                  "single_word": false,
         | 
| 83 | 
            +
                  "special": true
         | 
| 84 | 
            +
                },
         | 
| 85 | 
            +
                "151653": {
         | 
| 86 | 
            +
                  "content": "<|vision_end|>",
         | 
| 87 | 
            +
                  "lstrip": false,
         | 
| 88 | 
            +
                  "normalized": false,
         | 
| 89 | 
            +
                  "rstrip": false,
         | 
| 90 | 
            +
                  "single_word": false,
         | 
| 91 | 
            +
                  "special": true
         | 
| 92 | 
            +
                },
         | 
| 93 | 
            +
                "151654": {
         | 
| 94 | 
            +
                  "content": "<|vision_pad|>",
         | 
| 95 | 
            +
                  "lstrip": false,
         | 
| 96 | 
            +
                  "normalized": false,
         | 
| 97 | 
            +
                  "rstrip": false,
         | 
| 98 | 
            +
                  "single_word": false,
         | 
| 99 | 
            +
                  "special": true
         | 
| 100 | 
            +
                },
         | 
| 101 | 
            +
                "151655": {
         | 
| 102 | 
            +
                  "content": "<|image_pad|>",
         | 
| 103 | 
            +
                  "lstrip": false,
         | 
| 104 | 
            +
                  "normalized": false,
         | 
| 105 | 
            +
                  "rstrip": false,
         | 
| 106 | 
            +
                  "single_word": false,
         | 
| 107 | 
            +
                  "special": true
         | 
| 108 | 
            +
                },
         | 
| 109 | 
            +
                "151656": {
         | 
| 110 | 
            +
                  "content": "<|video_pad|>",
         | 
| 111 | 
            +
                  "lstrip": false,
         | 
| 112 | 
            +
                  "normalized": false,
         | 
| 113 | 
            +
                  "rstrip": false,
         | 
| 114 | 
            +
                  "single_word": false,
         | 
| 115 | 
            +
                  "special": true
         | 
| 116 | 
            +
                },
         | 
| 117 | 
            +
                "151657": {
         | 
| 118 | 
            +
                  "content": "<tool_call>",
         | 
| 119 | 
            +
                  "lstrip": false,
         | 
| 120 | 
            +
                  "normalized": false,
         | 
| 121 | 
            +
                  "rstrip": false,
         | 
| 122 | 
            +
                  "single_word": false,
         | 
| 123 | 
            +
                  "special": false
         | 
| 124 | 
            +
                },
         | 
| 125 | 
            +
                "151658": {
         | 
| 126 | 
            +
                  "content": "</tool_call>",
         | 
| 127 | 
            +
                  "lstrip": false,
         | 
| 128 | 
            +
                  "normalized": false,
         | 
| 129 | 
            +
                  "rstrip": false,
         | 
| 130 | 
            +
                  "single_word": false,
         | 
| 131 | 
            +
                  "special": false
         | 
| 132 | 
            +
                },
         | 
| 133 | 
            +
                "151659": {
         | 
| 134 | 
            +
                  "content": "<|fim_prefix|>",
         | 
| 135 | 
            +
                  "lstrip": false,
         | 
| 136 | 
            +
                  "normalized": false,
         | 
| 137 | 
            +
                  "rstrip": false,
         | 
| 138 | 
            +
                  "single_word": false,
         | 
| 139 | 
            +
                  "special": false
         | 
| 140 | 
            +
                },
         | 
| 141 | 
            +
                "151660": {
         | 
| 142 | 
            +
                  "content": "<|fim_middle|>",
         | 
| 143 | 
            +
                  "lstrip": false,
         | 
| 144 | 
            +
                  "normalized": false,
         | 
| 145 | 
            +
                  "rstrip": false,
         | 
| 146 | 
            +
                  "single_word": false,
         | 
| 147 | 
            +
                  "special": false
         | 
| 148 | 
            +
                },
         | 
| 149 | 
            +
                "151661": {
         | 
| 150 | 
            +
                  "content": "<|fim_suffix|>",
         | 
| 151 | 
            +
                  "lstrip": false,
         | 
| 152 | 
            +
                  "normalized": false,
         | 
| 153 | 
            +
                  "rstrip": false,
         | 
| 154 | 
            +
                  "single_word": false,
         | 
| 155 | 
            +
                  "special": false
         | 
| 156 | 
            +
                },
         | 
| 157 | 
            +
                "151662": {
         | 
| 158 | 
            +
                  "content": "<|fim_pad|>",
         | 
| 159 | 
            +
                  "lstrip": false,
         | 
| 160 | 
            +
                  "normalized": false,
         | 
| 161 | 
            +
                  "rstrip": false,
         | 
| 162 | 
            +
                  "single_word": false,
         | 
| 163 | 
            +
                  "special": false
         | 
| 164 | 
            +
                },
         | 
| 165 | 
            +
                "151663": {
         | 
| 166 | 
            +
                  "content": "<|repo_name|>",
         | 
| 167 | 
            +
                  "lstrip": false,
         | 
| 168 | 
            +
                  "normalized": false,
         | 
| 169 | 
            +
                  "rstrip": false,
         | 
| 170 | 
            +
                  "single_word": false,
         | 
| 171 | 
            +
                  "special": false
         | 
| 172 | 
            +
                },
         | 
| 173 | 
            +
                "151664": {
         | 
| 174 | 
            +
                  "content": "<|file_sep|>",
         | 
| 175 | 
            +
                  "lstrip": false,
         | 
| 176 | 
            +
                  "normalized": false,
         | 
| 177 | 
            +
                  "rstrip": false,
         | 
| 178 | 
            +
                  "single_word": false,
         | 
| 179 | 
            +
                  "special": false
         | 
| 180 | 
            +
                },
         | 
| 181 | 
            +
                "151665": {
         | 
| 182 | 
            +
                  "content": "<tool_response>",
         | 
| 183 | 
            +
                  "lstrip": false,
         | 
| 184 | 
            +
                  "normalized": false,
         | 
| 185 | 
            +
                  "rstrip": false,
         | 
| 186 | 
            +
                  "single_word": false,
         | 
| 187 | 
            +
                  "special": false
         | 
| 188 | 
            +
                },
         | 
| 189 | 
            +
                "151666": {
         | 
| 190 | 
            +
                  "content": "</tool_response>",
         | 
| 191 | 
            +
                  "lstrip": false,
         | 
| 192 | 
            +
                  "normalized": false,
         | 
| 193 | 
            +
                  "rstrip": false,
         | 
| 194 | 
            +
                  "single_word": false,
         | 
| 195 | 
            +
                  "special": false
         | 
| 196 | 
            +
                },
         | 
| 197 | 
            +
                "151667": {
         | 
| 198 | 
            +
                  "content": "<think>",
         | 
| 199 | 
            +
                  "lstrip": false,
         | 
| 200 | 
            +
                  "normalized": false,
         | 
| 201 | 
            +
                  "rstrip": false,
         | 
| 202 | 
            +
                  "single_word": false,
         | 
| 203 | 
            +
                  "special": false
         | 
| 204 | 
            +
                },
         | 
| 205 | 
            +
                "151668": {
         | 
| 206 | 
            +
                  "content": "</think>",
         | 
| 207 | 
            +
                  "lstrip": false,
         | 
| 208 | 
            +
                  "normalized": false,
         | 
| 209 | 
            +
                  "rstrip": false,
         | 
| 210 | 
            +
                  "single_word": false,
         | 
| 211 | 
            +
                  "special": false
         | 
| 212 | 
            +
                }
         | 
| 213 | 
            +
              },
         | 
| 214 | 
            +
              "additional_special_tokens": [
         | 
| 215 | 
            +
                "<|im_start|>",
         | 
| 216 | 
            +
                "<|im_end|>",
         | 
| 217 | 
            +
                "<|object_ref_start|>",
         | 
| 218 | 
            +
                "<|object_ref_end|>",
         | 
| 219 | 
            +
                "<|box_start|>",
         | 
| 220 | 
            +
                "<|box_end|>",
         | 
| 221 | 
            +
                "<|quad_start|>",
         | 
| 222 | 
            +
                "<|quad_end|>",
         | 
| 223 | 
            +
                "<|vision_start|>",
         | 
| 224 | 
            +
                "<|vision_end|>",
         | 
| 225 | 
            +
                "<|vision_pad|>",
         | 
| 226 | 
            +
                "<|image_pad|>",
         | 
| 227 | 
            +
                "<|video_pad|>"
         | 
| 228 | 
            +
              ],
         | 
| 229 | 
            +
              "bos_token": null,
         | 
| 230 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 231 | 
            +
              "eos_token": "<|im_end|>",
         | 
| 232 | 
            +
              "errors": "replace",
         | 
| 233 | 
            +
              "extra_special_tokens": {},
         | 
| 234 | 
            +
              "model_max_length": 131072,
         | 
| 235 | 
            +
              "pad_token": "<|endoftext|>",
         | 
| 236 | 
            +
              "split_special_tokens": false,
         | 
| 237 | 
            +
              "tokenizer_class": "Qwen2Tokenizer",
         | 
| 238 | 
            +
              "unk_token": null
         | 
| 239 | 
            +
            }
         | 
    	
        training_args.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:fffaee687bcf434da66f399553b6e3a647d1ffa26a027aa9caf4841c6cfe086c
         | 
| 3 | 
            +
            size 6161
         | 
