Text Generation
Transformers
Safetensors
Turkish
English
phi3
causal-lm
turkish
depth-up-scaling
net2net
syko
text-generation-inference
Instructions to use SykoSLM/SykoLLM-V6.0 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use SykoSLM/SykoLLM-V6.0 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="SykoSLM/SykoLLM-V6.0")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("SykoSLM/SykoLLM-V6.0")
model = AutoModelForCausalLM.from_pretrained("SykoSLM/SykoLLM-V6.0")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use SykoSLM/SykoLLM-V6.0 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "SykoSLM/SykoLLM-V6.0"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "SykoSLM/SykoLLM-V6.0",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/SykoSLM/SykoLLM-V6.0
- SGLang
How to use SykoSLM/SykoLLM-V6.0 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "SykoSLM/SykoLLM-V6.0" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "SykoSLM/SykoLLM-V6.0",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "SykoSLM/SykoLLM-V6.0" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "SykoSLM/SykoLLM-V6.0",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use SykoSLM/SykoLLM-V6.0 with Docker Model Runner:
docker model run hf.co/SykoSLM/SykoLLM-V6.0
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.35, | |
| "eval_steps": 500, | |
| "global_step": 2800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00125, | |
| "grad_norm": 0.40701737999916077, | |
| "learning_rate": 6.417e-06, | |
| "loss": 1.9589118957519531, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 0.3704953193664551, | |
| "learning_rate": 1.3547e-05, | |
| "loss": 1.879218864440918, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.00375, | |
| "grad_norm": 0.34090375900268555, | |
| "learning_rate": 2.0677e-05, | |
| "loss": 1.8871658325195313, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.33982428908348083, | |
| "learning_rate": 2.7807e-05, | |
| "loss": 1.8348798751831055, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.00625, | |
| "grad_norm": 0.3448389172554016, | |
| "learning_rate": 3.4937e-05, | |
| "loss": 1.8976055145263673, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 0.3351344168186188, | |
| "learning_rate": 4.2066999999999996e-05, | |
| "loss": 1.8488676071166992, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.00875, | |
| "grad_norm": 0.33170202374458313, | |
| "learning_rate": 4.9197e-05, | |
| "loss": 1.8325592041015626, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.34600478410720825, | |
| "learning_rate": 5.6327e-05, | |
| "loss": 1.8475696563720703, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.01125, | |
| "grad_norm": 0.34344804286956787, | |
| "learning_rate": 6.3457e-05, | |
| "loss": 1.8463781356811524, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 0.32425570487976074, | |
| "learning_rate": 7.0587e-05, | |
| "loss": 1.8811756134033204, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01375, | |
| "grad_norm": 0.33838146924972534, | |
| "learning_rate": 7.7717e-05, | |
| "loss": 1.8498527526855468, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.34978190064430237, | |
| "learning_rate": 8.4847e-05, | |
| "loss": 1.7197338104248048, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.01625, | |
| "grad_norm": 0.3554218113422394, | |
| "learning_rate": 9.1977e-05, | |
| "loss": 1.7990310668945313, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 0.3349857032299042, | |
| "learning_rate": 9.910699999999998e-05, | |
| "loss": 1.8458877563476563, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.01875, | |
| "grad_norm": 0.3333263099193573, | |
| "learning_rate": 0.00010623699999999999, | |
| "loss": 1.8082691192626954, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.3492045998573303, | |
| "learning_rate": 0.000113367, | |
| "loss": 1.7753177642822267, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.02125, | |
| "grad_norm": 0.33766260743141174, | |
| "learning_rate": 0.000120497, | |
| "loss": 1.7588382720947267, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 0.3680027723312378, | |
| "learning_rate": 0.000127627, | |
| "loss": 1.7494930267333983, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.02375, | |
| "grad_norm": 0.35260000824928284, | |
| "learning_rate": 0.000134757, | |
| "loss": 1.758560562133789, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.3592912256717682, | |
| "learning_rate": 0.000141887, | |
| "loss": 1.8017724990844726, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02625, | |
| "grad_norm": 0.34770476818084717, | |
| "learning_rate": 0.00014259953155930407, | |
| "loss": 1.8061519622802735, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 0.358970582485199, | |
| "learning_rate": 0.00014259791226603537, | |
| "loss": 1.8515422821044922, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.02875, | |
| "grad_norm": 0.34490638971328735, | |
| "learning_rate": 0.00014259513636323773, | |
| "loss": 1.8080307006835938, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.3587310016155243, | |
| "learning_rate": 0.00014259120389594238, | |
| "loss": 1.8180580139160156, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03125, | |
| "grad_norm": 0.35348573327064514, | |
| "learning_rate": 0.0001425861149279427, | |
| "loss": 1.822945785522461, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 0.3408539891242981, | |
| "learning_rate": 0.00014257986954179292, | |
| "loss": 1.804990577697754, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.03375, | |
| "grad_norm": 0.35097193717956543, | |
| "learning_rate": 0.00014257246783880696, | |
| "loss": 1.8341880798339845, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.3467462956905365, | |
| "learning_rate": 0.00014256390993905687, | |
| "loss": 1.7296785354614257, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.03625, | |
| "grad_norm": 0.3492400050163269, | |
| "learning_rate": 0.00014255419598137062, | |
| "loss": 1.8266151428222657, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.3718615472316742, | |
| "learning_rate": 0.00014254332612333005, | |
| "loss": 1.7514339447021485, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03875, | |
| "grad_norm": 0.3476354479789734, | |
| "learning_rate": 0.00014253130054126827, | |
| "loss": 1.8226016998291015, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.34655508399009705, | |
| "learning_rate": 0.00014251811943026674, | |
| "loss": 1.8513336181640625, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.04125, | |
| "grad_norm": 0.3519170880317688, | |
| "learning_rate": 0.00014250378300415223, | |
| "loss": 1.864480972290039, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 0.3491443395614624, | |
| "learning_rate": 0.00014248829149549318, | |
| "loss": 1.8030773162841798, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.04375, | |
| "grad_norm": 0.3646671175956726, | |
| "learning_rate": 0.00014247164515559605, | |
| "loss": 1.782710647583008, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.3525862395763397, | |
| "learning_rate": 0.00014245384425450123, | |
| "loss": 1.8301689147949218, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.04625, | |
| "grad_norm": 0.3430674970149994, | |
| "learning_rate": 0.00014243488908097866, | |
| "loss": 1.7636734008789063, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 0.3655545115470886, | |
| "learning_rate": 0.00014241477994252308, | |
| "loss": 1.8431385040283204, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.04875, | |
| "grad_norm": 0.35655322670936584, | |
| "learning_rate": 0.00014239351716534906, | |
| "loss": 1.8405876159667969, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.3450303077697754, | |
| "learning_rate": 0.00014237110109438587, | |
| "loss": 1.7880744934082031, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.05125, | |
| "grad_norm": 0.36362725496292114, | |
| "learning_rate": 0.0001423475320932716, | |
| "loss": 1.803448486328125, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 0.3608654737472534, | |
| "learning_rate": 0.0001423228105443475, | |
| "loss": 1.7959218978881837, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.05375, | |
| "grad_norm": 0.3524814248085022, | |
| "learning_rate": 0.00014229693684865167, | |
| "loss": 1.8105106353759766, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.35871171951293945, | |
| "learning_rate": 0.0001422699114259126, | |
| "loss": 1.7514846801757813, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.05625, | |
| "grad_norm": 0.3381369709968567, | |
| "learning_rate": 0.00014224173471454223, | |
| "loss": 1.811713981628418, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 0.3746880292892456, | |
| "learning_rate": 0.00014221240717162908, | |
| "loss": 1.7895519256591796, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.05875, | |
| "grad_norm": 0.35921189188957214, | |
| "learning_rate": 0.00014218192927293062, | |
| "loss": 1.7877384185791017, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.3727467656135559, | |
| "learning_rate": 0.00014215030151286563, | |
| "loss": 1.8092086791992188, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.06125, | |
| "grad_norm": 0.36004638671875, | |
| "learning_rate": 0.00014211752440450624, | |
| "loss": 1.845526123046875, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.34500977396965027, | |
| "learning_rate": 0.00014208359847956947, | |
| "loss": 1.793890380859375, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06375, | |
| "grad_norm": 0.3571811020374298, | |
| "learning_rate": 0.00014204852428840873, | |
| "loss": 1.8021648406982422, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.3511386513710022, | |
| "learning_rate": 0.0001420123024000048, | |
| "loss": 1.7810476303100586, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.06625, | |
| "grad_norm": 0.3544309139251709, | |
| "learning_rate": 0.00014197493340195673, | |
| "loss": 1.782750701904297, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 0.35211437940597534, | |
| "learning_rate": 0.00014193641790047207, | |
| "loss": 1.8397369384765625, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.06875, | |
| "grad_norm": 0.3561457097530365, | |
| "learning_rate": 0.00014189675652035737, | |
| "loss": 1.806086540222168, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.3514038026332855, | |
| "learning_rate": 0.0001418559499050077, | |
| "loss": 1.7963085174560547, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.07125, | |
| "grad_norm": 0.35221120715141296, | |
| "learning_rate": 0.00014181399871639652, | |
| "loss": 1.777400016784668, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 0.34728357195854187, | |
| "learning_rate": 0.00014177090363506466, | |
| "loss": 1.7832159042358398, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.07375, | |
| "grad_norm": 0.35810062289237976, | |
| "learning_rate": 0.00014172666536010946, | |
| "loss": 1.7859878540039062, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.3402475118637085, | |
| "learning_rate": 0.00014168128460917344, | |
| "loss": 1.8559268951416015, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.07625, | |
| "grad_norm": 0.36799490451812744, | |
| "learning_rate": 0.00014163476211843254, | |
| "loss": 1.8264755249023437, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 0.3646862804889679, | |
| "learning_rate": 0.00014158709864258424, | |
| "loss": 1.800428581237793, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.07875, | |
| "grad_norm": 0.37956395745277405, | |
| "learning_rate": 0.00014153829495483538, | |
| "loss": 1.7767526626586914, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.3566032648086548, | |
| "learning_rate": 0.00014148835184688949, | |
| "loss": 1.8472091674804687, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.08125, | |
| "grad_norm": 0.333779513835907, | |
| "learning_rate": 0.000141437270128934, | |
| "loss": 1.8140777587890624, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 0.3429010212421417, | |
| "learning_rate": 0.0001413850506296272, | |
| "loss": 1.8366750717163085, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.08375, | |
| "grad_norm": 0.3753111660480499, | |
| "learning_rate": 0.00014133169419608456, | |
| "loss": 1.760198211669922, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.35503339767456055, | |
| "learning_rate": 0.0001412772016938653, | |
| "loss": 1.8173086166381835, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.08625, | |
| "grad_norm": 0.358216792345047, | |
| "learning_rate": 0.0001412215740069581, | |
| "loss": 1.7937744140625, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 0.3600156605243683, | |
| "learning_rate": 0.00014116481203776677, | |
| "loss": 1.7986185073852539, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.08875, | |
| "grad_norm": 0.3507816195487976, | |
| "learning_rate": 0.00014110691670709584, | |
| "loss": 1.7555866241455078, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.35459256172180176, | |
| "learning_rate": 0.00014104788895413529, | |
| "loss": 1.795433807373047, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.09125, | |
| "grad_norm": 0.35286569595336914, | |
| "learning_rate": 0.00014098772973644564, | |
| "loss": 1.820347213745117, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 0.3857751786708832, | |
| "learning_rate": 0.00014092644002994218, | |
| "loss": 1.8153291702270509, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.09375, | |
| "grad_norm": 0.3553074598312378, | |
| "learning_rate": 0.00014086402082887924, | |
| "loss": 1.8413051605224608, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.35642898082733154, | |
| "learning_rate": 0.0001408004731458341, | |
| "loss": 1.7815227508544922, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.09625, | |
| "grad_norm": 0.37263238430023193, | |
| "learning_rate": 0.00014073579801169043, | |
| "loss": 1.8360301971435546, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 0.37507593631744385, | |
| "learning_rate": 0.00014066999647562167, | |
| "loss": 1.8166229248046875, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.09875, | |
| "grad_norm": 0.3496163487434387, | |
| "learning_rate": 0.00014060306960507398, | |
| "loss": 1.7876134872436524, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.350668340921402, | |
| "learning_rate": 0.000140535018485749, | |
| "loss": 1.8262884140014648, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.10125, | |
| "grad_norm": 0.36257749795913696, | |
| "learning_rate": 0.00014046584422158602, | |
| "loss": 1.7791305541992188, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 0.357570081949234, | |
| "learning_rate": 0.00014039554793474442, | |
| "loss": 1.8329212188720703, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.10375, | |
| "grad_norm": 0.354640930891037, | |
| "learning_rate": 0.00014032413076558507, | |
| "loss": 1.7825984954833984, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.35969364643096924, | |
| "learning_rate": 0.00014025159387265215, | |
| "loss": 1.7961544036865233, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.10625, | |
| "grad_norm": 0.3408399224281311, | |
| "learning_rate": 0.00014017793843265416, | |
| "loss": 1.8031917572021485, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 0.3505636751651764, | |
| "learning_rate": 0.00014010316564044495, | |
| "loss": 1.8270240783691407, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.10875, | |
| "grad_norm": 0.3612024784088135, | |
| "learning_rate": 0.00014002727670900427, | |
| "loss": 1.8037662506103516, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.3611273467540741, | |
| "learning_rate": 0.00013995027286941813, | |
| "loss": 1.7805574417114258, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.11125, | |
| "grad_norm": 0.370518296957016, | |
| "learning_rate": 0.00013987215537085876, | |
| "loss": 1.83743896484375, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 0.3627995550632477, | |
| "learning_rate": 0.00013979292548056446, | |
| "loss": 1.8568729400634765, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.11375, | |
| "grad_norm": 0.33446118235588074, | |
| "learning_rate": 0.00013971258448381896, | |
| "loss": 1.8121458053588868, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.35702356696128845, | |
| "learning_rate": 0.00013963113368393058, | |
| "loss": 1.8272817611694336, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.11625, | |
| "grad_norm": 0.35480058193206787, | |
| "learning_rate": 0.00013954857440221107, | |
| "loss": 1.8286819458007812, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 0.33891281485557556, | |
| "learning_rate": 0.00013946490797795425, | |
| "loss": 1.7881786346435546, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.11875, | |
| "grad_norm": 0.34998786449432373, | |
| "learning_rate": 0.00013938013576841426, | |
| "loss": 1.8192798614501953, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.36356785893440247, | |
| "learning_rate": 0.0001392942591487834, | |
| "loss": 1.8080211639404298, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.12125, | |
| "grad_norm": 0.3536245822906494, | |
| "learning_rate": 0.00013920727951217003, | |
| "loss": 1.7745712280273438, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1225, | |
| "grad_norm": 0.35819944739341736, | |
| "learning_rate": 0.00013911919826957588, | |
| "loss": 1.8335809707641602, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.12375, | |
| "grad_norm": 0.3673238754272461, | |
| "learning_rate": 0.0001390300168498732, | |
| "loss": 1.7918657302856444, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.37633419036865234, | |
| "learning_rate": 0.0001389397366997814, | |
| "loss": 1.7912788391113281, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.12625, | |
| "grad_norm": 0.36260703206062317, | |
| "learning_rate": 0.00013884835928384387, | |
| "loss": 1.7769220352172852, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.1275, | |
| "grad_norm": 0.3502698242664337, | |
| "learning_rate": 0.00013875588608440397, | |
| "loss": 1.8571086883544923, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.12875, | |
| "grad_norm": 0.37244319915771484, | |
| "learning_rate": 0.0001386623186015812, | |
| "loss": 1.7873695373535157, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.36906760931015015, | |
| "learning_rate": 0.00013856765835324657, | |
| "loss": 1.7982921600341797, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.13125, | |
| "grad_norm": 0.3458193838596344, | |
| "learning_rate": 0.0001384719068749984, | |
| "loss": 1.896946907043457, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.1325, | |
| "grad_norm": 0.3625653088092804, | |
| "learning_rate": 0.00013837506572013695, | |
| "loss": 1.8590087890625, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.13375, | |
| "grad_norm": 0.37704798579216003, | |
| "learning_rate": 0.00013827713645963959, | |
| "loss": 1.7953170776367187, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 0.35103756189346313, | |
| "learning_rate": 0.00013817812068213505, | |
| "loss": 1.864565658569336, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.13625, | |
| "grad_norm": 0.39145445823669434, | |
| "learning_rate": 0.0001380780199938779, | |
| "loss": 1.787282371520996, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 0.3810483515262604, | |
| "learning_rate": 0.00013797683601872218, | |
| "loss": 1.8461406707763672, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.13875, | |
| "grad_norm": 0.36001554131507874, | |
| "learning_rate": 0.00013787457039809542, | |
| "loss": 1.7846809387207032, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.36254000663757324, | |
| "learning_rate": 0.0001377712247909717, | |
| "loss": 1.8589000701904297, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.14125, | |
| "grad_norm": 0.3535791337490082, | |
| "learning_rate": 0.00013766680087384488, | |
| "loss": 1.790989875793457, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.1425, | |
| "grad_norm": 0.36819183826446533, | |
| "learning_rate": 0.00013756130034070147, | |
| "loss": 1.8115760803222656, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.14375, | |
| "grad_norm": 0.35042834281921387, | |
| "learning_rate": 0.00013745472490299298, | |
| "loss": 1.7872331619262696, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 0.36452701687812805, | |
| "learning_rate": 0.0001373470762896083, | |
| "loss": 1.8083602905273437, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.14625, | |
| "grad_norm": 0.35632047057151794, | |
| "learning_rate": 0.00013723835624684556, | |
| "loss": 1.8238039016723633, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.1475, | |
| "grad_norm": 0.36330121755599976, | |
| "learning_rate": 0.00013712856653838384, | |
| "loss": 1.8468303680419922, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.14875, | |
| "grad_norm": 0.37948107719421387, | |
| "learning_rate": 0.0001370177089452546, | |
| "loss": 1.7772663116455079, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.3759608268737793, | |
| "learning_rate": 0.0001369057852658127, | |
| "loss": 1.793960952758789, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.15125, | |
| "grad_norm": 0.3672516644001007, | |
| "learning_rate": 0.00013679279731570733, | |
| "loss": 1.7799537658691407, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.1525, | |
| "grad_norm": 0.3496241569519043, | |
| "learning_rate": 0.00013667874692785244, | |
| "loss": 1.7861103057861327, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.15375, | |
| "grad_norm": 0.3461642265319824, | |
| "learning_rate": 0.00013656363595239708, | |
| "loss": 1.8481361389160156, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 0.33858028054237366, | |
| "learning_rate": 0.0001364474662566954, | |
| "loss": 1.77642822265625, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 0.3424132764339447, | |
| "learning_rate": 0.00013633023972527632, | |
| "loss": 1.7893180847167969, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.1575, | |
| "grad_norm": 0.35095998644828796, | |
| "learning_rate": 0.00013621195825981293, | |
| "loss": 1.7366466522216797, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.15875, | |
| "grad_norm": 0.36417317390441895, | |
| "learning_rate": 0.00013609262377909176, | |
| "loss": 1.839132308959961, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.3565835654735565, | |
| "learning_rate": 0.00013597223821898145, | |
| "loss": 1.757269287109375, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.16125, | |
| "grad_norm": 0.34676891565322876, | |
| "learning_rate": 0.00013585080353240158, | |
| "loss": 1.781381607055664, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 0.3492533564567566, | |
| "learning_rate": 0.00013572832168929085, | |
| "loss": 1.8004392623901366, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.16375, | |
| "grad_norm": 0.33528923988342285, | |
| "learning_rate": 0.0001356047946765751, | |
| "loss": 1.7787307739257812, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 0.35009509325027466, | |
| "learning_rate": 0.00013548022449813522, | |
| "loss": 1.7703327178955077, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.16625, | |
| "grad_norm": 0.38126665353775024, | |
| "learning_rate": 0.00013535461317477446, | |
| "loss": 1.8216169357299805, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.1675, | |
| "grad_norm": 0.3653838038444519, | |
| "learning_rate": 0.00013522796274418575, | |
| "loss": 1.784686279296875, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.16875, | |
| "grad_norm": 0.35842376947402954, | |
| "learning_rate": 0.00013510027526091872, | |
| "loss": 1.818338394165039, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.3575061559677124, | |
| "learning_rate": 0.00013497155279634617, | |
| "loss": 1.8177734375, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.17125, | |
| "grad_norm": 0.36351051926612854, | |
| "learning_rate": 0.00013484179743863064, | |
| "loss": 1.8408927917480469, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.1725, | |
| "grad_norm": 0.37017935514450073, | |
| "learning_rate": 0.0001347110112926905, | |
| "loss": 1.8088676452636718, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.17375, | |
| "grad_norm": 0.35998839139938354, | |
| "learning_rate": 0.00013457919648016573, | |
| "loss": 1.8451946258544922, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.36173009872436523, | |
| "learning_rate": 0.0001344463551393836, | |
| "loss": 1.7784915924072267, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.17625, | |
| "grad_norm": 0.3683062493801117, | |
| "learning_rate": 0.00013431248942532385, | |
| "loss": 1.745309829711914, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.1775, | |
| "grad_norm": 0.3488103151321411, | |
| "learning_rate": 0.00013417760150958392, | |
| "loss": 1.793316650390625, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.17875, | |
| "grad_norm": 0.35314610600471497, | |
| "learning_rate": 0.00013404169358034355, | |
| "loss": 1.7867753982543946, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.3577822744846344, | |
| "learning_rate": 0.0001339047678423294, | |
| "loss": 1.7581512451171875, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.18125, | |
| "grad_norm": 0.3387848436832428, | |
| "learning_rate": 0.00013376682651677918, | |
| "loss": 1.7947473526000977, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1825, | |
| "grad_norm": 0.3571684658527374, | |
| "learning_rate": 0.00013362787184140572, | |
| "loss": 1.7496719360351562, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.18375, | |
| "grad_norm": 0.3472369313240051, | |
| "learning_rate": 0.0001334879060703606, | |
| "loss": 1.7750968933105469, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 0.3559383749961853, | |
| "learning_rate": 0.00013334693147419759, | |
| "loss": 1.8256034851074219, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.18625, | |
| "grad_norm": 0.35892486572265625, | |
| "learning_rate": 0.00013320495033983585, | |
| "loss": 1.7993803024291992, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.3679066300392151, | |
| "learning_rate": 0.0001330619649705228, | |
| "loss": 1.8065261840820312, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.18875, | |
| "grad_norm": 0.36252209544181824, | |
| "learning_rate": 0.0001329179776857968, | |
| "loss": 1.8372112274169923, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.3526136577129364, | |
| "learning_rate": 0.0001327729908214494, | |
| "loss": 1.799185562133789, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.19125, | |
| "grad_norm": 0.3635775148868561, | |
| "learning_rate": 0.0001326270067294877, | |
| "loss": 1.8340118408203125, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.1925, | |
| "grad_norm": 0.36545416712760925, | |
| "learning_rate": 0.00013248002777809586, | |
| "loss": 1.7582477569580077, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.19375, | |
| "grad_norm": 0.37526363134384155, | |
| "learning_rate": 0.00013233205635159695, | |
| "loss": 1.799554443359375, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 0.35140055418014526, | |
| "learning_rate": 0.0001321830948504142, | |
| "loss": 1.84625244140625, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.19625, | |
| "grad_norm": 0.3566315770149231, | |
| "learning_rate": 0.0001320331456910319, | |
| "loss": 1.7883316040039063, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.1975, | |
| "grad_norm": 0.35099372267723083, | |
| "learning_rate": 0.0001318822113059565, | |
| "loss": 1.794087028503418, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.19875, | |
| "grad_norm": 0.35940778255462646, | |
| "learning_rate": 0.00013173029414367693, | |
| "loss": 1.7220880508422851, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.36045801639556885, | |
| "learning_rate": 0.0001315773966686249, | |
| "loss": 1.7802143096923828, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.20125, | |
| "grad_norm": 0.3581635057926178, | |
| "learning_rate": 0.000131423521361135, | |
| "loss": 1.799722671508789, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2025, | |
| "grad_norm": 0.33708855509757996, | |
| "learning_rate": 0.00013126867071740436, | |
| "loss": 1.8053092956542969, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.20375, | |
| "grad_norm": 0.3750436007976532, | |
| "learning_rate": 0.00013111284724945228, | |
| "loss": 1.8074203491210938, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 0.35119321942329407, | |
| "learning_rate": 0.0001309560534850794, | |
| "loss": 1.8175487518310547, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.20625, | |
| "grad_norm": 0.3611745834350586, | |
| "learning_rate": 0.00013079829196782668, | |
| "loss": 1.7702863693237305, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2075, | |
| "grad_norm": 0.3799806833267212, | |
| "learning_rate": 0.00013063956525693424, | |
| "loss": 1.8235919952392579, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.20875, | |
| "grad_norm": 0.33240807056427, | |
| "learning_rate": 0.0001304798759272997, | |
| "loss": 1.768626594543457, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.36028313636779785, | |
| "learning_rate": 0.00013031922656943647, | |
| "loss": 1.829296875, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.21125, | |
| "grad_norm": 0.34874534606933594, | |
| "learning_rate": 0.00013015761978943185, | |
| "loss": 1.8018821716308593, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.2125, | |
| "grad_norm": 0.34944280982017517, | |
| "learning_rate": 0.00012999505820890448, | |
| "loss": 1.8226497650146485, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.21375, | |
| "grad_norm": 0.35128575563430786, | |
| "learning_rate": 0.00012983154446496209, | |
| "loss": 1.7741992950439454, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 0.3564985692501068, | |
| "learning_rate": 0.0001296670812101586, | |
| "loss": 1.7850433349609376, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.21625, | |
| "grad_norm": 0.3676067292690277, | |
| "learning_rate": 0.000129501671112451, | |
| "loss": 1.8290214538574219, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2175, | |
| "grad_norm": 0.3726136386394501, | |
| "learning_rate": 0.00012933531685515627, | |
| "loss": 1.7774532318115235, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.21875, | |
| "grad_norm": 0.3493287265300751, | |
| "learning_rate": 0.00012916802113690766, | |
| "loss": 1.7807361602783203, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.37059202790260315, | |
| "learning_rate": 0.00012899978667161105, | |
| "loss": 1.749721145629883, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.22125, | |
| "grad_norm": 0.356022447347641, | |
| "learning_rate": 0.00012883061618840087, | |
| "loss": 1.8218292236328124, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.2225, | |
| "grad_norm": 0.3568074405193329, | |
| "learning_rate": 0.00012866051243159572, | |
| "loss": 1.8072574615478516, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.22375, | |
| "grad_norm": 0.3749092221260071, | |
| "learning_rate": 0.00012848947816065416, | |
| "loss": 1.8410078048706056, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.35633665323257446, | |
| "learning_rate": 0.00012831751615012955, | |
| "loss": 1.7817327499389648, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.22625, | |
| "grad_norm": 0.3607875108718872, | |
| "learning_rate": 0.00012814462918962533, | |
| "loss": 1.8118452072143554, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.2275, | |
| "grad_norm": 0.34315699338912964, | |
| "learning_rate": 0.00012797082008374967, | |
| "loss": 1.8008819580078126, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.22875, | |
| "grad_norm": 0.358188658952713, | |
| "learning_rate": 0.00012779609165206992, | |
| "loss": 1.8048545837402343, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.3641424775123596, | |
| "learning_rate": 0.000127620446729067, | |
| "loss": 1.8129388809204101, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.23125, | |
| "grad_norm": 0.36388713121414185, | |
| "learning_rate": 0.00012744388816408926, | |
| "loss": 1.7981510162353516, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2325, | |
| "grad_norm": 0.3411344587802887, | |
| "learning_rate": 0.00012726641882130642, | |
| "loss": 1.7846858978271485, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.23375, | |
| "grad_norm": 0.36635443568229675, | |
| "learning_rate": 0.00012708804157966297, | |
| "loss": 1.8334461212158204, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.235, | |
| "grad_norm": 0.3459226191043854, | |
| "learning_rate": 0.00012690875933283154, | |
| "loss": 1.7850067138671875, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.23625, | |
| "grad_norm": 0.3630014657974243, | |
| "learning_rate": 0.00012672857498916595, | |
| "loss": 1.8400045394897462, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.2375, | |
| "grad_norm": 0.3783304691314697, | |
| "learning_rate": 0.000126547491471654, | |
| "loss": 1.7719623565673828, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.23875, | |
| "grad_norm": 0.3790845572948456, | |
| "learning_rate": 0.0001263655117178701, | |
| "loss": 1.8144996643066407, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.35528555512428284, | |
| "learning_rate": 0.0001261826386799276, | |
| "loss": 1.797579002380371, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.24125, | |
| "grad_norm": 0.3462880253791809, | |
| "learning_rate": 0.00012599887532443088, | |
| "loss": 1.7669387817382813, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.2425, | |
| "grad_norm": 0.35499900579452515, | |
| "learning_rate": 0.00012581422463242716, | |
| "loss": 1.782514762878418, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.24375, | |
| "grad_norm": 0.35548484325408936, | |
| "learning_rate": 0.00012562868959935835, | |
| "loss": 1.7927711486816407, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.245, | |
| "grad_norm": 0.36208584904670715, | |
| "learning_rate": 0.00012544227323501222, | |
| "loss": 1.8539527893066405, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.24625, | |
| "grad_norm": 0.3629232347011566, | |
| "learning_rate": 0.0001252549785634738, | |
| "loss": 1.7535400390625, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.2475, | |
| "grad_norm": 0.33926820755004883, | |
| "learning_rate": 0.000125066808623076, | |
| "loss": 1.7788131713867188, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.24875, | |
| "grad_norm": 0.3651394546031952, | |
| "learning_rate": 0.00012487776646635072, | |
| "loss": 1.8248186111450195, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.35856956243515015, | |
| "learning_rate": 0.00012468785515997905, | |
| "loss": 1.7728294372558593, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.25125, | |
| "grad_norm": 0.36707815527915955, | |
| "learning_rate": 0.0001244970777847416, | |
| "loss": 1.797306442260742, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.2525, | |
| "grad_norm": 0.37768349051475525, | |
| "learning_rate": 0.00012430543743546853, | |
| "loss": 1.8138954162597656, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.25375, | |
| "grad_norm": 0.3719421625137329, | |
| "learning_rate": 0.00012411293722098938, | |
| "loss": 1.8046173095703124, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.255, | |
| "grad_norm": 0.35382720828056335, | |
| "learning_rate": 0.00012391958026408258, | |
| "loss": 1.765408706665039, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.25625, | |
| "grad_norm": 0.3717374801635742, | |
| "learning_rate": 0.00012372536970142481, | |
| "loss": 1.794291877746582, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.2575, | |
| "grad_norm": 0.37810182571411133, | |
| "learning_rate": 0.0001235303086835401, | |
| "loss": 1.7855905532836913, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.25875, | |
| "grad_norm": 0.34465938806533813, | |
| "learning_rate": 0.00012333440037474877, | |
| "loss": 1.7502609252929688, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.3537978529930115, | |
| "learning_rate": 0.0001231376479531161, | |
| "loss": 1.8433588027954102, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.26125, | |
| "grad_norm": 0.3481179475784302, | |
| "learning_rate": 0.00012294005461040066, | |
| "loss": 1.778417205810547, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.2625, | |
| "grad_norm": 0.36712074279785156, | |
| "learning_rate": 0.00012274162355200264, | |
| "loss": 1.8297000885009767, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.26375, | |
| "grad_norm": 0.36218199133872986, | |
| "learning_rate": 0.0001225423579969119, | |
| "loss": 1.8048271179199218, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.265, | |
| "grad_norm": 0.3427264988422394, | |
| "learning_rate": 0.00012234226117765565, | |
| "loss": 1.765831756591797, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.26625, | |
| "grad_norm": 0.35128286480903625, | |
| "learning_rate": 0.00012214133634024592, | |
| "loss": 1.8477115631103516, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.2675, | |
| "grad_norm": 0.36919906735420227, | |
| "learning_rate": 0.0001219395867441272, | |
| "loss": 1.7384143829345704, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.26875, | |
| "grad_norm": 0.37480294704437256, | |
| "learning_rate": 0.00012173701566212328, | |
| "loss": 1.776589584350586, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.3442743718624115, | |
| "learning_rate": 0.00012153362638038429, | |
| "loss": 1.7534845352172852, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.27125, | |
| "grad_norm": 0.3617842495441437, | |
| "learning_rate": 0.0001213294221983334, | |
| "loss": 1.8287986755371093, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.2725, | |
| "grad_norm": 0.3468424081802368, | |
| "learning_rate": 0.00012112440642861319, | |
| "loss": 1.7810518264770507, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.27375, | |
| "grad_norm": 0.36655351519584656, | |
| "learning_rate": 0.000120918582397032, | |
| "loss": 1.8189208984375, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 0.35723134875297546, | |
| "learning_rate": 0.00012071195344251006, | |
| "loss": 1.8201839447021484, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.27625, | |
| "grad_norm": 0.36652442812919617, | |
| "learning_rate": 0.00012050452291702508, | |
| "loss": 1.8076786041259765, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.2775, | |
| "grad_norm": 0.3568657338619232, | |
| "learning_rate": 0.00012029629418555812, | |
| "loss": 1.7748506546020508, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.27875, | |
| "grad_norm": 0.34934675693511963, | |
| "learning_rate": 0.00012008727062603888, | |
| "loss": 1.8173185348510743, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.34384509921073914, | |
| "learning_rate": 0.00011987745562929093, | |
| "loss": 1.7502407073974608, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.28125, | |
| "grad_norm": 0.3680790066719055, | |
| "learning_rate": 0.00011966685259897665, | |
| "loss": 1.741659927368164, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.2825, | |
| "grad_norm": 0.37108564376831055, | |
| "learning_rate": 0.00011945546495154214, | |
| "loss": 1.7894527435302734, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.28375, | |
| "grad_norm": 0.37491941452026367, | |
| "learning_rate": 0.00011924329611616168, | |
| "loss": 1.7868507385253907, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.285, | |
| "grad_norm": 0.3443116545677185, | |
| "learning_rate": 0.00011903034953468213, | |
| "loss": 1.7541233062744142, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.28625, | |
| "grad_norm": 0.3643540143966675, | |
| "learning_rate": 0.00011881662866156715, | |
| "loss": 1.8128959655761718, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.2875, | |
| "grad_norm": 0.35639819502830505, | |
| "learning_rate": 0.00011860213696384107, | |
| "loss": 1.7657649993896485, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.28875, | |
| "grad_norm": 0.36442187428474426, | |
| "learning_rate": 0.00011838687792103273, | |
| "loss": 1.792444610595703, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.36035555601119995, | |
| "learning_rate": 0.00011817085502511903, | |
| "loss": 1.7670486450195313, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.29125, | |
| "grad_norm": 0.3552349805831909, | |
| "learning_rate": 0.00011795407178046817, | |
| "loss": 1.8542526245117188, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.2925, | |
| "grad_norm": 0.3693036437034607, | |
| "learning_rate": 0.00011773653170378296, | |
| "loss": 1.6886547088623047, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.29375, | |
| "grad_norm": 0.3605458736419678, | |
| "learning_rate": 0.00011751823832404365, | |
| "loss": 1.7754722595214845, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.295, | |
| "grad_norm": 0.35839903354644775, | |
| "learning_rate": 0.00011729919518245076, | |
| "loss": 1.7882440567016602, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.29625, | |
| "grad_norm": 0.36839786171913147, | |
| "learning_rate": 0.00011707940583236761, | |
| "loss": 1.7781326293945312, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.2975, | |
| "grad_norm": 0.35868513584136963, | |
| "learning_rate": 0.0001168588738392626, | |
| "loss": 1.7871665954589844, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.29875, | |
| "grad_norm": 0.3435186743736267, | |
| "learning_rate": 0.00011663760278065153, | |
| "loss": 1.8193252563476563, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.3949030935764313, | |
| "learning_rate": 0.00011641559624603941, | |
| "loss": 1.7928247451782227, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.30125, | |
| "grad_norm": 0.3681996762752533, | |
| "learning_rate": 0.00011619285783686234, | |
| "loss": 1.7616628646850585, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.3025, | |
| "grad_norm": 0.3694431781768799, | |
| "learning_rate": 0.00011596939116642899, | |
| "loss": 1.8024406433105469, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.30375, | |
| "grad_norm": 0.3637784719467163, | |
| "learning_rate": 0.00011574519985986208, | |
| "loss": 1.757676887512207, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.305, | |
| "grad_norm": 0.3616812229156494, | |
| "learning_rate": 0.00011552028755403952, | |
| "loss": 1.79559326171875, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.30625, | |
| "grad_norm": 0.36502957344055176, | |
| "learning_rate": 0.00011529465789753538, | |
| "loss": 1.7899351119995117, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.3075, | |
| "grad_norm": 0.3788166344165802, | |
| "learning_rate": 0.00011506831455056079, | |
| "loss": 1.8282848358154298, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.30875, | |
| "grad_norm": 0.36333489418029785, | |
| "learning_rate": 0.00011484126118490451, | |
| "loss": 1.766189956665039, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.35034865140914917, | |
| "learning_rate": 0.00011461350148387332, | |
| "loss": 1.7669204711914062, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.31125, | |
| "grad_norm": 0.35153037309646606, | |
| "learning_rate": 0.00011438503914223241, | |
| "loss": 1.7271625518798828, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 0.3732260763645172, | |
| "learning_rate": 0.00011415587786614524, | |
| "loss": 1.7690876007080079, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.31375, | |
| "grad_norm": 0.3613711893558502, | |
| "learning_rate": 0.0001139260213731136, | |
| "loss": 1.7684833526611328, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.315, | |
| "grad_norm": 0.35713133215904236, | |
| "learning_rate": 0.00011369547339191726, | |
| "loss": 1.7643346786499023, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.31625, | |
| "grad_norm": 0.35974639654159546, | |
| "learning_rate": 0.0001134642376625534, | |
| "loss": 1.7887260437011718, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.3175, | |
| "grad_norm": 0.36356088519096375, | |
| "learning_rate": 0.00011323231793617599, | |
| "loss": 1.788846206665039, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.31875, | |
| "grad_norm": 0.3578101098537445, | |
| "learning_rate": 0.00011299971797503495, | |
| "loss": 1.781305694580078, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.35546955466270447, | |
| "learning_rate": 0.00011276644155241517, | |
| "loss": 1.7678417205810546, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.32125, | |
| "grad_norm": 0.3539295792579651, | |
| "learning_rate": 0.00011253249245257516, | |
| "loss": 1.7507053375244142, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.3225, | |
| "grad_norm": 0.35056355595588684, | |
| "learning_rate": 0.00011229787447068576, | |
| "loss": 1.8345399856567384, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.32375, | |
| "grad_norm": 0.3503001034259796, | |
| "learning_rate": 0.00011206259141276858, | |
| "loss": 1.8280166625976562, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 0.3602514863014221, | |
| "learning_rate": 0.0001118266470956342, | |
| "loss": 1.7046276092529298, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.32625, | |
| "grad_norm": 0.3672384023666382, | |
| "learning_rate": 0.00011159004534682027, | |
| "loss": 1.805099868774414, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.3275, | |
| "grad_norm": 0.3589872419834137, | |
| "learning_rate": 0.00011135279000452953, | |
| "loss": 1.7550365447998046, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.32875, | |
| "grad_norm": 0.3497745990753174, | |
| "learning_rate": 0.00011111488491756732, | |
| "loss": 1.758819580078125, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.3647236227989197, | |
| "learning_rate": 0.00011087633394527935, | |
| "loss": 1.765294647216797, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.33125, | |
| "grad_norm": 0.33403027057647705, | |
| "learning_rate": 0.00011063714095748899, | |
| "loss": 1.7979480743408203, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.3325, | |
| "grad_norm": 0.3792349696159363, | |
| "learning_rate": 0.00011039730983443455, | |
| "loss": 1.829258346557617, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.33375, | |
| "grad_norm": 0.3754643201828003, | |
| "learning_rate": 0.00011015684446670626, | |
| "loss": 1.783727264404297, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.335, | |
| "grad_norm": 0.3466981053352356, | |
| "learning_rate": 0.00010991574875518323, | |
| "loss": 1.7687664031982422, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.33625, | |
| "grad_norm": 0.3535688519477844, | |
| "learning_rate": 0.00010967402661097012, | |
| "loss": 1.8189085006713868, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.3375, | |
| "grad_norm": 0.36101067066192627, | |
| "learning_rate": 0.0001094316819553337, | |
| "loss": 1.752197265625, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.33875, | |
| "grad_norm": 0.36568474769592285, | |
| "learning_rate": 0.0001091887187196393, | |
| "loss": 1.7754268646240234, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.3312813639640808, | |
| "learning_rate": 0.00010894514084528695, | |
| "loss": 1.75748291015625, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.34125, | |
| "grad_norm": 0.3573434054851532, | |
| "learning_rate": 0.00010870095228364743, | |
| "loss": 1.7631900787353516, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.3425, | |
| "grad_norm": 0.35645684599876404, | |
| "learning_rate": 0.00010845615699599832, | |
| "loss": 1.747064971923828, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.34375, | |
| "grad_norm": 0.3608238101005554, | |
| "learning_rate": 0.00010821075895345951, | |
| "loss": 1.772369384765625, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.345, | |
| "grad_norm": 0.37147653102874756, | |
| "learning_rate": 0.00010796476213692903, | |
| "loss": 1.8682558059692382, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.34625, | |
| "grad_norm": 0.3562459349632263, | |
| "learning_rate": 0.0001077181705370183, | |
| "loss": 1.7756576538085938, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.3475, | |
| "grad_norm": 0.3861102759838104, | |
| "learning_rate": 0.00010747098815398739, | |
| "loss": 1.797110366821289, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.34875, | |
| "grad_norm": 0.3438943326473236, | |
| "learning_rate": 0.0001072232189976802, | |
| "loss": 1.7463438034057617, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.3862653374671936, | |
| "learning_rate": 0.00010697486708745942, | |
| "loss": 1.781214141845703, | |
| "step": 2800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.750651595063296e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |