{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 20,
  "global_step": 8786,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0022765430693491933,
      "grad_norm": 0.469247430562973,
      "learning_rate": 0.0002,
      "loss": 1.9469,
      "step": 20
    },
    {
      "epoch": 0.004553086138698387,
      "grad_norm": 0.6239348649978638,
      "learning_rate": 0.0002,
      "loss": 1.556,
      "step": 40
    },
    {
      "epoch": 0.006829629208047579,
      "grad_norm": 0.4587397277355194,
      "learning_rate": 0.0002,
      "loss": 1.4108,
      "step": 60
    },
    {
      "epoch": 0.009106172277396773,
      "grad_norm": 0.42919760942459106,
      "learning_rate": 0.0002,
      "loss": 1.3352,
      "step": 80
    },
    {
      "epoch": 0.011382715346745967,
      "grad_norm": 0.46492573618888855,
      "learning_rate": 0.0002,
      "loss": 1.3388,
      "step": 100
    },
    {
      "epoch": 0.013659258416095159,
      "grad_norm": 0.453070729970932,
      "learning_rate": 0.0002,
      "loss": 1.2295,
      "step": 120
    },
    {
      "epoch": 0.015935801485444354,
      "grad_norm": 0.4760678708553314,
      "learning_rate": 0.0002,
      "loss": 1.2493,
      "step": 140
    },
    {
      "epoch": 0.018212344554793546,
      "grad_norm": 0.4545675814151764,
      "learning_rate": 0.0002,
      "loss": 1.215,
      "step": 160
    },
    {
      "epoch": 0.020488887624142738,
      "grad_norm": 0.4772235155105591,
      "learning_rate": 0.0002,
      "loss": 1.2173,
      "step": 180
    },
    {
      "epoch": 0.022765430693491934,
      "grad_norm": 0.4403541088104248,
      "learning_rate": 0.0002,
      "loss": 1.1058,
      "step": 200
    },
    {
      "epoch": 0.025041973762841126,
      "grad_norm": 0.511401355266571,
      "learning_rate": 0.0002,
      "loss": 1.1049,
      "step": 220
    },
    {
      "epoch": 0.027318516832190318,
      "grad_norm": 0.3809013366699219,
      "learning_rate": 0.0002,
      "loss": 1.0498,
      "step": 240
    },
    {
      "epoch": 0.029595059901539513,
      "grad_norm": 0.3980010449886322,
      "learning_rate": 0.0002,
      "loss": 0.9842,
      "step": 260
    },
    {
      "epoch": 0.03187160297088871,
      "grad_norm": 0.5747793316841125,
      "learning_rate": 0.0002,
      "loss": 1.0988,
      "step": 280
    },
    {
      "epoch": 0.0341481460402379,
      "grad_norm": 0.46827971935272217,
      "learning_rate": 0.0002,
      "loss": 1.0367,
      "step": 300
    },
    {
      "epoch": 0.03642468910958709,
      "grad_norm": 0.4702209532260895,
      "learning_rate": 0.0002,
      "loss": 1.066,
      "step": 320
    },
    {
      "epoch": 0.038701232178936285,
      "grad_norm": 0.5084996223449707,
      "learning_rate": 0.0002,
      "loss": 1.0652,
      "step": 340
    },
    {
      "epoch": 0.040977775248285477,
      "grad_norm": 0.3944012522697449,
      "learning_rate": 0.0002,
      "loss": 0.9642,
      "step": 360
    },
    {
      "epoch": 0.04325431831763467,
      "grad_norm": 0.40287718176841736,
      "learning_rate": 0.0002,
      "loss": 0.9431,
      "step": 380
    },
    {
      "epoch": 0.04553086138698387,
      "grad_norm": 0.4629077613353729,
      "learning_rate": 0.0002,
      "loss": 0.9615,
      "step": 400
    },
    {
      "epoch": 0.04780740445633306,
      "grad_norm": 0.44827452301979065,
      "learning_rate": 0.0002,
      "loss": 0.9434,
      "step": 420
    },
    {
      "epoch": 0.05008394752568225,
      "grad_norm": 0.41644710302352905,
      "learning_rate": 0.0002,
      "loss": 0.9241,
      "step": 440
    },
    {
      "epoch": 0.05236049059503144,
      "grad_norm": 0.4760611057281494,
      "learning_rate": 0.0002,
      "loss": 0.8475,
      "step": 460
    },
    {
      "epoch": 0.054637033664380635,
      "grad_norm": 0.45987364649772644,
      "learning_rate": 0.0002,
      "loss": 0.898,
      "step": 480
    },
    {
      "epoch": 0.056913576733729834,
      "grad_norm": 0.4840068817138672,
      "learning_rate": 0.0002,
      "loss": 0.9611,
      "step": 500
    },
    {
      "epoch": 0.059190119803079026,
      "grad_norm": 0.40314286947250366,
      "learning_rate": 0.0002,
      "loss": 0.8884,
      "step": 520
    },
    {
      "epoch": 0.06146666287242822,
      "grad_norm": 0.5458106398582458,
      "learning_rate": 0.0002,
      "loss": 0.8939,
      "step": 540
    },
    {
      "epoch": 0.06374320594177742,
      "grad_norm": 0.5420896410942078,
      "learning_rate": 0.0002,
      "loss": 0.8265,
      "step": 560
    },
    {
      "epoch": 0.0660197490111266,
      "grad_norm": 0.5356529355049133,
      "learning_rate": 0.0002,
      "loss": 0.8432,
      "step": 580
    },
    {
      "epoch": 0.0682962920804758,
      "grad_norm": 0.5064826011657715,
      "learning_rate": 0.0002,
      "loss": 0.8272,
      "step": 600
    },
    {
      "epoch": 0.07057283514982499,
      "grad_norm": 0.4143005311489105,
      "learning_rate": 0.0002,
      "loss": 0.7854,
      "step": 620
    },
    {
      "epoch": 0.07284937821917419,
      "grad_norm": 0.3817225396633148,
      "learning_rate": 0.0002,
      "loss": 0.8219,
      "step": 640
    },
    {
      "epoch": 0.07512592128852338,
      "grad_norm": 0.5336936712265015,
      "learning_rate": 0.0002,
      "loss": 0.7977,
      "step": 660
    },
    {
      "epoch": 0.07740246435787257,
      "grad_norm": 0.5397001504898071,
      "learning_rate": 0.0002,
      "loss": 0.8117,
      "step": 680
    },
    {
      "epoch": 0.07967900742722177,
      "grad_norm": 0.4968530535697937,
      "learning_rate": 0.0002,
      "loss": 0.7527,
      "step": 700
    },
    {
      "epoch": 0.08195555049657095,
      "grad_norm": 0.4084935784339905,
      "learning_rate": 0.0002,
      "loss": 0.651,
      "step": 720
    },
    {
      "epoch": 0.08423209356592015,
      "grad_norm": 0.48406732082366943,
      "learning_rate": 0.0002,
      "loss": 0.7352,
      "step": 740
    },
    {
      "epoch": 0.08650863663526934,
      "grad_norm": 0.5246301293373108,
      "learning_rate": 0.0002,
      "loss": 0.7785,
      "step": 760
    },
    {
      "epoch": 0.08878517970461854,
      "grad_norm": 0.5729619264602661,
      "learning_rate": 0.0002,
      "loss": 0.7646,
      "step": 780
    },
    {
      "epoch": 0.09106172277396773,
      "grad_norm": 0.5675190687179565,
      "learning_rate": 0.0002,
      "loss": 0.7784,
      "step": 800
    },
    {
      "epoch": 0.09333826584331692,
      "grad_norm": 0.4682878255844116,
      "learning_rate": 0.0002,
      "loss": 0.7284,
      "step": 820
    },
    {
      "epoch": 0.09561480891266612,
      "grad_norm": 0.5388545393943787,
      "learning_rate": 0.0002,
      "loss": 0.6959,
      "step": 840
    },
    {
      "epoch": 0.0978913519820153,
      "grad_norm": 0.48806509375572205,
      "learning_rate": 0.0002,
      "loss": 0.7585,
      "step": 860
    },
    {
      "epoch": 0.1001678950513645,
      "grad_norm": 0.4149261713027954,
      "learning_rate": 0.0002,
      "loss": 0.6978,
      "step": 880
    },
    {
      "epoch": 0.1024444381207137,
      "grad_norm": 0.4971105754375458,
      "learning_rate": 0.0002,
      "loss": 0.7103,
      "step": 900
    },
    {
      "epoch": 0.10472098119006289,
      "grad_norm": 0.5066735744476318,
      "learning_rate": 0.0002,
      "loss": 0.6854,
      "step": 920
    },
    {
      "epoch": 0.10699752425941209,
      "grad_norm": 0.4922661781311035,
      "learning_rate": 0.0002,
      "loss": 0.6231,
      "step": 940
    },
    {
      "epoch": 0.10927406732876127,
      "grad_norm": 0.5949555039405823,
      "learning_rate": 0.0002,
      "loss": 0.6813,
      "step": 960
    },
    {
      "epoch": 0.11155061039811047,
      "grad_norm": 0.581446647644043,
      "learning_rate": 0.0002,
      "loss": 0.6174,
      "step": 980
    },
    {
      "epoch": 0.11382715346745967,
      "grad_norm": 0.6152529716491699,
      "learning_rate": 0.0002,
      "loss": 0.6405,
      "step": 1000
    },
    {
      "epoch": 0.11610369653680885,
      "grad_norm": 0.5986836552619934,
      "learning_rate": 0.0002,
      "loss": 0.5776,
      "step": 1020
    },
    {
      "epoch": 0.11838023960615805,
      "grad_norm": 0.4255094528198242,
      "learning_rate": 0.0002,
      "loss": 0.6576,
      "step": 1040
    },
    {
      "epoch": 0.12065678267550724,
      "grad_norm": 0.4563849866390228,
      "learning_rate": 0.0002,
      "loss": 0.6647,
      "step": 1060
    },
    {
      "epoch": 0.12293332574485644,
      "grad_norm": 0.593227744102478,
      "learning_rate": 0.0002,
      "loss": 0.6043,
      "step": 1080
    },
    {
      "epoch": 0.12520986881420562,
      "grad_norm": 0.47059598565101624,
      "learning_rate": 0.0002,
      "loss": 0.591,
      "step": 1100
    },
    {
      "epoch": 0.12748641188355483,
      "grad_norm": 0.5013225674629211,
      "learning_rate": 0.0002,
      "loss": 0.5947,
      "step": 1120
    },
    {
      "epoch": 0.12976295495290402,
      "grad_norm": 0.46772757172584534,
      "learning_rate": 0.0002,
      "loss": 0.6292,
      "step": 1140
    },
    {
      "epoch": 0.1320394980222532,
      "grad_norm": 0.5844313502311707,
      "learning_rate": 0.0002,
      "loss": 0.6128,
      "step": 1160
    },
    {
      "epoch": 0.1343160410916024,
      "grad_norm": 0.5295489430427551,
      "learning_rate": 0.0002,
      "loss": 0.6064,
      "step": 1180
    },
    {
      "epoch": 0.1365925841609516,
      "grad_norm": 0.4482004642486572,
      "learning_rate": 0.0002,
      "loss": 0.5899,
      "step": 1200
    },
    {
      "epoch": 0.1388691272303008,
      "grad_norm": 0.6281692981719971,
      "learning_rate": 0.0002,
      "loss": 0.6109,
      "step": 1220
    },
    {
      "epoch": 0.14114567029964997,
      "grad_norm": 0.4718242585659027,
      "learning_rate": 0.0002,
      "loss": 0.5857,
      "step": 1240
    },
    {
      "epoch": 0.14342221336899919,
      "grad_norm": 0.5219341516494751,
      "learning_rate": 0.0002,
      "loss": 0.5581,
      "step": 1260
    },
    {
      "epoch": 0.14569875643834837,
      "grad_norm": 0.47050580382347107,
      "learning_rate": 0.0002,
      "loss": 0.6368,
      "step": 1280
    },
    {
      "epoch": 0.14797529950769756,
      "grad_norm": 0.5425338745117188,
      "learning_rate": 0.0002,
      "loss": 0.5626,
      "step": 1300
    },
    {
      "epoch": 0.15025184257704677,
      "grad_norm": 0.4944934844970703,
      "learning_rate": 0.0002,
      "loss": 0.5337,
      "step": 1320
    },
    {
      "epoch": 0.15252838564639595,
      "grad_norm": 0.5921599864959717,
      "learning_rate": 0.0002,
      "loss": 0.5672,
      "step": 1340
    },
    {
      "epoch": 0.15480492871574514,
      "grad_norm": 0.4866751730442047,
      "learning_rate": 0.0002,
      "loss": 0.5305,
      "step": 1360
    },
    {
      "epoch": 0.15708147178509432,
      "grad_norm": 0.62166827917099,
      "learning_rate": 0.0002,
      "loss": 0.5737,
      "step": 1380
    },
    {
      "epoch": 0.15935801485444354,
      "grad_norm": 0.5006982684135437,
      "learning_rate": 0.0002,
      "loss": 0.5542,
      "step": 1400
    },
    {
      "epoch": 0.16163455792379272,
      "grad_norm": 0.6090095043182373,
      "learning_rate": 0.0002,
      "loss": 0.5215,
      "step": 1420
    },
    {
      "epoch": 0.1639111009931419,
      "grad_norm": 0.4260309636592865,
      "learning_rate": 0.0002,
      "loss": 0.5535,
      "step": 1440
    },
    {
      "epoch": 0.16618764406249112,
      "grad_norm": 0.48657718300819397,
      "learning_rate": 0.0002,
      "loss": 0.5441,
      "step": 1460
    },
    {
      "epoch": 0.1684641871318403,
      "grad_norm": 0.43275007605552673,
      "learning_rate": 0.0002,
      "loss": 0.5161,
      "step": 1480
    },
    {
      "epoch": 0.1707407302011895,
      "grad_norm": 0.4225006699562073,
      "learning_rate": 0.0002,
      "loss": 0.512,
      "step": 1500
    },
    {
      "epoch": 0.17301727327053867,
      "grad_norm": 0.5176346302032471,
      "learning_rate": 0.0002,
      "loss": 0.5384,
      "step": 1520
    },
    {
      "epoch": 0.1752938163398879,
      "grad_norm": 0.6492679715156555,
      "learning_rate": 0.0002,
      "loss": 0.4981,
      "step": 1540
    },
    {
      "epoch": 0.17757035940923707,
      "grad_norm": 0.5511758327484131,
      "learning_rate": 0.0002,
      "loss": 0.5289,
      "step": 1560
    },
    {
      "epoch": 0.17984690247858626,
      "grad_norm": 0.5211341977119446,
      "learning_rate": 0.0002,
      "loss": 0.5002,
      "step": 1580
    },
    {
      "epoch": 0.18212344554793547,
      "grad_norm": 0.5488260984420776,
      "learning_rate": 0.0002,
      "loss": 0.5178,
      "step": 1600
    },
    {
      "epoch": 0.18439998861728465,
      "grad_norm": 0.6779264211654663,
      "learning_rate": 0.0002,
      "loss": 0.5155,
      "step": 1620
    },
    {
      "epoch": 0.18667653168663384,
      "grad_norm": 0.502919614315033,
      "learning_rate": 0.0002,
      "loss": 0.4923,
      "step": 1640
    },
    {
      "epoch": 0.18895307475598305,
      "grad_norm": 0.4989205300807953,
      "learning_rate": 0.0002,
      "loss": 0.4825,
      "step": 1660
    },
    {
      "epoch": 0.19122961782533224,
      "grad_norm": 0.5155315399169922,
      "learning_rate": 0.0002,
      "loss": 0.4796,
      "step": 1680
    },
    {
      "epoch": 0.19350616089468142,
      "grad_norm": 0.5648865699768066,
      "learning_rate": 0.0002,
      "loss": 0.4985,
      "step": 1700
    },
    {
      "epoch": 0.1957827039640306,
      "grad_norm": 0.606176495552063,
      "learning_rate": 0.0002,
      "loss": 0.4819,
      "step": 1720
    },
    {
      "epoch": 0.19805924703337982,
      "grad_norm": 0.5440786480903625,
      "learning_rate": 0.0002,
      "loss": 0.5213,
      "step": 1740
    },
    {
      "epoch": 0.200335790102729,
      "grad_norm": 0.43152502179145813,
      "learning_rate": 0.0002,
      "loss": 0.4429,
      "step": 1760
    },
    {
      "epoch": 0.2026123331720782,
      "grad_norm": 0.5701313614845276,
      "learning_rate": 0.0002,
      "loss": 0.4486,
      "step": 1780
    },
    {
      "epoch": 0.2048888762414274,
      "grad_norm": 0.565666913986206,
      "learning_rate": 0.0002,
      "loss": 0.4561,
      "step": 1800
    },
    {
      "epoch": 0.2071654193107766,
      "grad_norm": 0.5725598931312561,
      "learning_rate": 0.0002,
      "loss": 0.4757,
      "step": 1820
    },
    {
      "epoch": 0.20944196238012577,
      "grad_norm": 0.4642520248889923,
      "learning_rate": 0.0002,
      "loss": 0.438,
      "step": 1840
    },
    {
      "epoch": 0.21171850544947496,
      "grad_norm": 0.6077229976654053,
      "learning_rate": 0.0002,
      "loss": 0.4295,
      "step": 1860
    },
    {
      "epoch": 0.21399504851882417,
      "grad_norm": 0.6314090490341187,
      "learning_rate": 0.0002,
      "loss": 0.449,
      "step": 1880
    },
    {
      "epoch": 0.21627159158817336,
      "grad_norm": 0.4416756331920624,
      "learning_rate": 0.0002,
      "loss": 0.4554,
      "step": 1900
    },
    {
      "epoch": 0.21854813465752254,
      "grad_norm": 0.5278882384300232,
      "learning_rate": 0.0002,
      "loss": 0.4554,
      "step": 1920
    },
    {
      "epoch": 0.22082467772687175,
      "grad_norm": 0.45619043707847595,
      "learning_rate": 0.0002,
      "loss": 0.4868,
      "step": 1940
    },
    {
      "epoch": 0.22310122079622094,
      "grad_norm": 0.5881581902503967,
      "learning_rate": 0.0002,
      "loss": 0.4672,
      "step": 1960
    },
    {
      "epoch": 0.22537776386557012,
      "grad_norm": 0.5379284024238586,
      "learning_rate": 0.0002,
      "loss": 0.4531,
      "step": 1980
    },
    {
      "epoch": 0.22765430693491934,
      "grad_norm": 0.5562624931335449,
      "learning_rate": 0.0002,
      "loss": 0.464,
      "step": 2000
    },
    {
      "epoch": 0.22993085000426852,
      "grad_norm": 0.554499626159668,
      "learning_rate": 0.0002,
      "loss": 0.446,
      "step": 2020
    },
    {
      "epoch": 0.2322073930736177,
      "grad_norm": 0.509219229221344,
      "learning_rate": 0.0002,
      "loss": 0.4417,
      "step": 2040
    },
    {
      "epoch": 0.2344839361429669,
      "grad_norm": 0.5206849575042725,
      "learning_rate": 0.0002,
      "loss": 0.4118,
      "step": 2060
    },
    {
      "epoch": 0.2367604792123161,
      "grad_norm": 0.548729658126831,
      "learning_rate": 0.0002,
      "loss": 0.4067,
      "step": 2080
    },
    {
      "epoch": 0.2390370222816653,
      "grad_norm": 0.4220084846019745,
      "learning_rate": 0.0002,
      "loss": 0.428,
      "step": 2100
    },
    {
      "epoch": 0.24131356535101448,
      "grad_norm": 0.5507292747497559,
      "learning_rate": 0.0002,
      "loss": 0.4176,
      "step": 2120
    },
    {
      "epoch": 0.2435901084203637,
      "grad_norm": 0.5605701208114624,
      "learning_rate": 0.0002,
      "loss": 0.4661,
      "step": 2140
    },
    {
      "epoch": 0.24586665148971287,
      "grad_norm": 0.43142881989479065,
      "learning_rate": 0.0002,
      "loss": 0.4197,
      "step": 2160
    },
    {
      "epoch": 0.24814319455906206,
      "grad_norm": 0.47790080308914185,
      "learning_rate": 0.0002,
      "loss": 0.4568,
      "step": 2180
    },
    {
      "epoch": 0.25041973762841124,
      "grad_norm": 0.6048968434333801,
      "learning_rate": 0.0002,
      "loss": 0.4199,
      "step": 2200
    },
    {
      "epoch": 0.25269628069776046,
      "grad_norm": 0.4925907850265503,
      "learning_rate": 0.0002,
      "loss": 0.4325,
      "step": 2220
    },
    {
      "epoch": 0.25497282376710967,
      "grad_norm": 0.5463051199913025,
      "learning_rate": 0.0002,
      "loss": 0.4549,
      "step": 2240
    },
    {
      "epoch": 0.2572493668364588,
      "grad_norm": 0.4631319046020508,
      "learning_rate": 0.0002,
      "loss": 0.3977,
      "step": 2260
    },
    {
      "epoch": 0.25952590990580804,
      "grad_norm": 0.4965234398841858,
      "learning_rate": 0.0002,
      "loss": 0.4285,
      "step": 2280
    },
    {
      "epoch": 0.2618024529751572,
      "grad_norm": 0.5436238646507263,
      "learning_rate": 0.0002,
      "loss": 0.4039,
      "step": 2300
    },
    {
      "epoch": 0.2640789960445064,
      "grad_norm": 0.5218191742897034,
      "learning_rate": 0.0002,
      "loss": 0.4092,
      "step": 2320
    },
    {
      "epoch": 0.2663555391138556,
      "grad_norm": 0.5417261719703674,
      "learning_rate": 0.0002,
      "loss": 0.3825,
      "step": 2340
    },
    {
      "epoch": 0.2686320821832048,
      "grad_norm": 0.6126281023025513,
      "learning_rate": 0.0002,
      "loss": 0.4391,
      "step": 2360
    },
    {
      "epoch": 0.270908625252554,
      "grad_norm": 0.4734433889389038,
      "learning_rate": 0.0002,
      "loss": 0.4151,
      "step": 2380
    },
    {
      "epoch": 0.2731851683219032,
      "grad_norm": 0.4501429796218872,
      "learning_rate": 0.0002,
      "loss": 0.4178,
      "step": 2400
    },
    {
      "epoch": 0.27546171139125236,
      "grad_norm": 0.5258509516716003,
      "learning_rate": 0.0002,
      "loss": 0.4007,
      "step": 2420
    },
    {
      "epoch": 0.2777382544606016,
      "grad_norm": 0.47874951362609863,
      "learning_rate": 0.0002,
      "loss": 0.4245,
      "step": 2440
    },
    {
      "epoch": 0.2800147975299508,
      "grad_norm": 0.528533399105072,
      "learning_rate": 0.0002,
      "loss": 0.3794,
      "step": 2460
    },
    {
      "epoch": 0.28229134059929994,
      "grad_norm": 0.46465063095092773,
      "learning_rate": 0.0002,
      "loss": 0.4019,
      "step": 2480
    },
    {
      "epoch": 0.28456788366864916,
      "grad_norm": 0.5217177867889404,
      "learning_rate": 0.0002,
      "loss": 0.4104,
      "step": 2500
    },
    {
      "epoch": 0.28684442673799837,
      "grad_norm": 0.510036289691925,
      "learning_rate": 0.0002,
      "loss": 0.389,
      "step": 2520
    },
    {
      "epoch": 0.2891209698073475,
      "grad_norm": 0.6968228220939636,
      "learning_rate": 0.0002,
      "loss": 0.4152,
      "step": 2540
    },
    {
      "epoch": 0.29139751287669674,
      "grad_norm": 0.4529867470264435,
      "learning_rate": 0.0002,
      "loss": 0.3987,
      "step": 2560
    },
    {
      "epoch": 0.29367405594604595,
      "grad_norm": 0.5680263638496399,
      "learning_rate": 0.0002,
      "loss": 0.3828,
      "step": 2580
    },
    {
      "epoch": 0.2959505990153951,
      "grad_norm": 0.4892405867576599,
      "learning_rate": 0.0002,
      "loss": 0.4006,
      "step": 2600
    },
    {
      "epoch": 0.2982271420847443,
      "grad_norm": 0.47588276863098145,
      "learning_rate": 0.0002,
      "loss": 0.4197,
      "step": 2620
    },
    {
      "epoch": 0.30050368515409354,
      "grad_norm": 0.5624070167541504,
      "learning_rate": 0.0002,
      "loss": 0.3997,
      "step": 2640
    },
    {
      "epoch": 0.3027802282234427,
      "grad_norm": 0.5434039831161499,
      "learning_rate": 0.0002,
      "loss": 0.3977,
      "step": 2660
    },
    {
      "epoch": 0.3050567712927919,
      "grad_norm": 0.5572277903556824,
      "learning_rate": 0.0002,
      "loss": 0.3966,
      "step": 2680
    },
    {
      "epoch": 0.30733331436214106,
      "grad_norm": 0.5533374547958374,
      "learning_rate": 0.0002,
      "loss": 0.3803,
      "step": 2700
    },
    {
      "epoch": 0.3096098574314903,
      "grad_norm": 0.40596967935562134,
      "learning_rate": 0.0002,
      "loss": 0.3682,
      "step": 2720
    },
    {
      "epoch": 0.3118864005008395,
      "grad_norm": 0.4737823009490967,
      "learning_rate": 0.0002,
      "loss": 0.3761,
      "step": 2740
    },
    {
      "epoch": 0.31416294357018865,
      "grad_norm": 0.4295174777507782,
      "learning_rate": 0.0002,
      "loss": 0.4035,
      "step": 2760
    },
    {
      "epoch": 0.31643948663953786,
      "grad_norm": 0.5348454713821411,
      "learning_rate": 0.0002,
      "loss": 0.404,
      "step": 2780
    },
    {
      "epoch": 0.31871602970888707,
      "grad_norm": 0.4819965362548828,
      "learning_rate": 0.0002,
      "loss": 0.3929,
      "step": 2800
    },
    {
      "epoch": 0.32099257277823623,
      "grad_norm": 0.5920088291168213,
      "learning_rate": 0.0002,
      "loss": 0.3798,
      "step": 2820
    },
    {
      "epoch": 0.32326911584758544,
      "grad_norm": 0.4936531186103821,
      "learning_rate": 0.0002,
      "loss": 0.3995,
      "step": 2840
    },
    {
      "epoch": 0.32554565891693465,
      "grad_norm": 0.5252315998077393,
      "learning_rate": 0.0002,
      "loss": 0.3842,
      "step": 2860
    },
    {
      "epoch": 0.3278222019862838,
      "grad_norm": 0.5818414688110352,
      "learning_rate": 0.0002,
      "loss": 0.3533,
      "step": 2880
    },
    {
      "epoch": 0.330098745055633,
      "grad_norm": 0.44053876399993896,
      "learning_rate": 0.0002,
      "loss": 0.3402,
      "step": 2900
    },
    {
      "epoch": 0.33237528812498224,
      "grad_norm": 0.5421345233917236,
      "learning_rate": 0.0002,
      "loss": 0.3542,
      "step": 2920
    },
    {
      "epoch": 0.3346518311943314,
      "grad_norm": 0.4642751216888428,
      "learning_rate": 0.0002,
      "loss": 0.3755,
      "step": 2940
    },
    {
      "epoch": 0.3369283742636806,
      "grad_norm": 0.5137833952903748,
      "learning_rate": 0.0002,
      "loss": 0.3602,
      "step": 2960
    },
    {
      "epoch": 0.3392049173330298,
      "grad_norm": 0.5032792687416077,
      "learning_rate": 0.0002,
      "loss": 0.3451,
      "step": 2980
    },
    {
      "epoch": 0.341481460402379,
      "grad_norm": 0.4932720363140106,
      "learning_rate": 0.0002,
      "loss": 0.384,
      "step": 3000
    },
    {
      "epoch": 0.3437580034717282,
      "grad_norm": 0.49986231327056885,
      "learning_rate": 0.0002,
      "loss": 0.3826,
      "step": 3020
    },
    {
      "epoch": 0.34603454654107735,
      "grad_norm": 0.6325618624687195,
      "learning_rate": 0.0002,
      "loss": 0.3582,
      "step": 3040
    },
    {
      "epoch": 0.34831108961042656,
      "grad_norm": 0.5402369499206543,
      "learning_rate": 0.0002,
      "loss": 0.3706,
      "step": 3060
    },
    {
      "epoch": 0.3505876326797758,
      "grad_norm": 0.4967012107372284,
      "learning_rate": 0.0002,
      "loss": 0.3456,
      "step": 3080
    },
    {
      "epoch": 0.35286417574912493,
      "grad_norm": 0.4491735100746155,
      "learning_rate": 0.0002,
      "loss": 0.347,
      "step": 3100
    },
    {
      "epoch": 0.35514071881847414,
      "grad_norm": 0.9062516093254089,
      "learning_rate": 0.0002,
      "loss": 0.3617,
      "step": 3120
    },
    {
      "epoch": 0.35741726188782336,
      "grad_norm": 0.5253359079360962,
      "learning_rate": 0.0002,
      "loss": 0.3512,
      "step": 3140
    },
    {
      "epoch": 0.3596938049571725,
      "grad_norm": 0.4836867153644562,
      "learning_rate": 0.0002,
      "loss": 0.3585,
      "step": 3160
    },
    {
      "epoch": 0.3619703480265217,
      "grad_norm": 0.49537473917007446,
      "learning_rate": 0.0002,
      "loss": 0.364,
      "step": 3180
    },
    {
      "epoch": 0.36424689109587094,
      "grad_norm": 0.6098095178604126,
      "learning_rate": 0.0002,
      "loss": 0.3455,
      "step": 3200
    },
    {
      "epoch": 0.3665234341652201,
      "grad_norm": 0.5926884412765503,
      "learning_rate": 0.0002,
      "loss": 0.3406,
      "step": 3220
    },
    {
      "epoch": 0.3687999772345693,
      "grad_norm": 0.5868669152259827,
      "learning_rate": 0.0002,
      "loss": 0.3643,
      "step": 3240
    },
    {
      "epoch": 0.3710765203039185,
      "grad_norm": 0.42670106887817383,
      "learning_rate": 0.0002,
      "loss": 0.344,
      "step": 3260
    },
    {
      "epoch": 0.3733530633732677,
      "grad_norm": 0.5992838740348816,
      "learning_rate": 0.0002,
      "loss": 0.3588,
      "step": 3280
    },
    {
      "epoch": 0.3756296064426169,
      "grad_norm": 0.4388341009616852,
      "learning_rate": 0.0002,
      "loss": 0.3375,
      "step": 3300
    },
    {
      "epoch": 0.3779061495119661,
      "grad_norm": 0.596488893032074,
      "learning_rate": 0.0002,
      "loss": 0.3425,
      "step": 3320
    },
    {
      "epoch": 0.38018269258131526,
      "grad_norm": 0.4572538137435913,
      "learning_rate": 0.0002,
      "loss": 0.3711,
      "step": 3340
    },
    {
      "epoch": 0.3824592356506645,
      "grad_norm": 0.5661656856536865,
      "learning_rate": 0.0002,
      "loss": 0.3415,
      "step": 3360
    },
    {
      "epoch": 0.38473577872001363,
      "grad_norm": 0.45082923769950867,
      "learning_rate": 0.0002,
      "loss": 0.3495,
      "step": 3380
    },
    {
      "epoch": 0.38701232178936285,
      "grad_norm": 0.4995211660861969,
      "learning_rate": 0.0002,
      "loss": 0.3311,
      "step": 3400
    },
    {
      "epoch": 0.38928886485871206,
      "grad_norm": 0.5004004240036011,
      "learning_rate": 0.0002,
      "loss": 0.3506,
      "step": 3420
    },
    {
      "epoch": 0.3915654079280612,
      "grad_norm": 0.5676460266113281,
      "learning_rate": 0.0002,
      "loss": 0.3383,
      "step": 3440
    },
    {
      "epoch": 0.39384195099741043,
      "grad_norm": 0.4805515706539154,
      "learning_rate": 0.0002,
      "loss": 0.3382,
      "step": 3460
    },
    {
      "epoch": 0.39611849406675964,
      "grad_norm": 0.47675764560699463,
      "learning_rate": 0.0002,
      "loss": 0.3021,
      "step": 3480
    },
    {
      "epoch": 0.3983950371361088,
      "grad_norm": 0.6285260915756226,
      "learning_rate": 0.0002,
      "loss": 0.3467,
      "step": 3500
    },
    {
      "epoch": 0.400671580205458,
      "grad_norm": 0.5657575130462646,
      "learning_rate": 0.0002,
      "loss": 0.3382,
      "step": 3520
    },
    {
      "epoch": 0.4029481232748072,
      "grad_norm": 0.6148316860198975,
      "learning_rate": 0.0002,
      "loss": 0.3396,
      "step": 3540
    },
    {
      "epoch": 0.4052246663441564,
      "grad_norm": 0.5819992423057556,
      "learning_rate": 0.0002,
      "loss": 0.3373,
      "step": 3560
    },
    {
      "epoch": 0.4075012094135056,
      "grad_norm": 0.6080338954925537,
      "learning_rate": 0.0002,
      "loss": 0.3463,
      "step": 3580
    },
    {
      "epoch": 0.4097777524828548,
      "grad_norm": 0.6103864312171936,
      "learning_rate": 0.0002,
      "loss": 0.3441,
      "step": 3600
    },
    {
      "epoch": 0.41205429555220396,
      "grad_norm": 0.5234800577163696,
      "learning_rate": 0.0002,
      "loss": 0.3272,
      "step": 3620
    },
    {
      "epoch": 0.4143308386215532,
      "grad_norm": 0.5393822193145752,
      "learning_rate": 0.0002,
      "loss": 0.3308,
      "step": 3640
    },
    {
      "epoch": 0.4166073816909024,
      "grad_norm": 0.4853431284427643,
      "learning_rate": 0.0002,
      "loss": 0.3152,
      "step": 3660
    },
    {
      "epoch": 0.41888392476025155,
      "grad_norm": 0.5507264733314514,
      "learning_rate": 0.0002,
      "loss": 0.3229,
      "step": 3680
    },
    {
      "epoch": 0.42116046782960076,
      "grad_norm": 0.44306129217147827,
      "learning_rate": 0.0002,
      "loss": 0.3389,
      "step": 3700
    },
    {
      "epoch": 0.4234370108989499,
      "grad_norm": 0.4574294984340668,
      "learning_rate": 0.0002,
      "loss": 0.3516,
      "step": 3720
    },
    {
      "epoch": 0.42571355396829913,
      "grad_norm": 0.5367994904518127,
      "learning_rate": 0.0002,
      "loss": 0.3576,
      "step": 3740
    },
    {
      "epoch": 0.42799009703764834,
      "grad_norm": 0.5044491291046143,
      "learning_rate": 0.0002,
      "loss": 0.3449,
      "step": 3760
    },
    {
      "epoch": 0.4302666401069975,
      "grad_norm": 0.41715556383132935,
      "learning_rate": 0.0002,
      "loss": 0.3128,
      "step": 3780
    },
    {
      "epoch": 0.4325431831763467,
      "grad_norm": 0.4355817437171936,
      "learning_rate": 0.0002,
      "loss": 0.3131,
      "step": 3800
    },
    {
      "epoch": 0.4348197262456959,
      "grad_norm": 0.5237382650375366,
      "learning_rate": 0.0002,
      "loss": 0.3281,
      "step": 3820
    },
    {
      "epoch": 0.4370962693150451,
      "grad_norm": 0.6210081577301025,
      "learning_rate": 0.0002,
      "loss": 0.3195,
      "step": 3840
    },
    {
      "epoch": 0.4393728123843943,
      "grad_norm": 0.5145352482795715,
      "learning_rate": 0.0002,
      "loss": 0.3107,
      "step": 3860
    },
    {
      "epoch": 0.4416493554537435,
      "grad_norm": 0.5554608106613159,
      "learning_rate": 0.0002,
      "loss": 0.3418,
      "step": 3880
    },
    {
      "epoch": 0.44392589852309267,
      "grad_norm": 0.4971628487110138,
      "learning_rate": 0.0002,
      "loss": 0.3293,
      "step": 3900
    },
    {
      "epoch": 0.4462024415924419,
      "grad_norm": 0.49732130765914917,
      "learning_rate": 0.0002,
      "loss": 0.3138,
      "step": 3920
    },
    {
      "epoch": 0.4484789846617911,
      "grad_norm": 0.5883257985115051,
      "learning_rate": 0.0002,
      "loss": 0.3357,
      "step": 3940
    },
    {
      "epoch": 0.45075552773114025,
      "grad_norm": 0.5349528193473816,
      "learning_rate": 0.0002,
      "loss": 0.3381,
      "step": 3960
    },
    {
      "epoch": 0.45303207080048946,
      "grad_norm": 0.5360047221183777,
      "learning_rate": 0.0002,
      "loss": 0.3116,
      "step": 3980
    },
    {
      "epoch": 0.4553086138698387,
      "grad_norm": 0.4889732003211975,
      "learning_rate": 0.0002,
      "loss": 0.3154,
      "step": 4000
    },
    {
      "epoch": 0.45758515693918783,
      "grad_norm": 0.4912421703338623,
      "learning_rate": 0.0002,
      "loss": 0.3054,
      "step": 4020
    },
    {
      "epoch": 0.45986170000853704,
      "grad_norm": 0.4449983835220337,
      "learning_rate": 0.0002,
      "loss": 0.3079,
      "step": 4040
    },
    {
      "epoch": 0.46213824307788626,
      "grad_norm": 0.4488675892353058,
      "learning_rate": 0.0002,
      "loss": 0.3027,
      "step": 4060
    },
    {
      "epoch": 0.4644147861472354,
      "grad_norm": 0.5412561893463135,
      "learning_rate": 0.0002,
      "loss": 0.2932,
      "step": 4080
    },
    {
      "epoch": 0.4666913292165846,
      "grad_norm": 0.41218650341033936,
      "learning_rate": 0.0002,
      "loss": 0.3087,
      "step": 4100
    },
    {
      "epoch": 0.4689678722859338,
      "grad_norm": 0.5233949422836304,
      "learning_rate": 0.0002,
      "loss": 0.3157,
      "step": 4120
    },
    {
      "epoch": 0.471244415355283,
      "grad_norm": 0.5676075220108032,
      "learning_rate": 0.0002,
      "loss": 0.3267,
      "step": 4140
    },
    {
      "epoch": 0.4735209584246322,
      "grad_norm": 0.5336834788322449,
      "learning_rate": 0.0002,
      "loss": 0.3185,
      "step": 4160
    },
    {
      "epoch": 0.47579750149398137,
      "grad_norm": 0.5505925416946411,
      "learning_rate": 0.0002,
      "loss": 0.3116,
      "step": 4180
    },
    {
      "epoch": 0.4780740445633306,
      "grad_norm": 0.5440223813056946,
      "learning_rate": 0.0002,
      "loss": 0.3234,
      "step": 4200
    },
    {
      "epoch": 0.4803505876326798,
      "grad_norm": 0.46334293484687805,
      "learning_rate": 0.0002,
      "loss": 0.3209,
      "step": 4220
    },
    {
      "epoch": 0.48262713070202895,
      "grad_norm": 0.452364444732666,
      "learning_rate": 0.0002,
      "loss": 0.3056,
      "step": 4240
    },
    {
      "epoch": 0.48490367377137816,
      "grad_norm": 0.5037956833839417,
      "learning_rate": 0.0002,
      "loss": 0.3141,
      "step": 4260
    },
    {
      "epoch": 0.4871802168407274,
      "grad_norm": 0.4308939278125763,
      "learning_rate": 0.0002,
      "loss": 0.2948,
      "step": 4280
    },
    {
      "epoch": 0.48945675991007653,
      "grad_norm": 0.45019960403442383,
      "learning_rate": 0.0002,
      "loss": 0.3142,
      "step": 4300
    },
    {
      "epoch": 0.49173330297942575,
      "grad_norm": 0.4351404011249542,
      "learning_rate": 0.0002,
      "loss": 0.31,
      "step": 4320
    },
    {
      "epoch": 0.49400984604877496,
      "grad_norm": 0.38306841254234314,
      "learning_rate": 0.0002,
      "loss": 0.2889,
      "step": 4340
    },
    {
      "epoch": 0.4962863891181241,
      "grad_norm": 0.545360803604126,
      "learning_rate": 0.0002,
      "loss": 0.311,
      "step": 4360
    },
    {
      "epoch": 0.49856293218747333,
      "grad_norm": 0.44942232966423035,
      "learning_rate": 0.0002,
      "loss": 0.2899,
      "step": 4380
    },
    {
      "epoch": 0.5008394752568225,
      "grad_norm": 0.46564239263534546,
      "learning_rate": 0.0002,
      "loss": 0.3013,
      "step": 4400
    },
    {
      "epoch": 0.5031160183261717,
      "grad_norm": 0.5398554801940918,
      "learning_rate": 0.0002,
      "loss": 0.3104,
      "step": 4420
    },
    {
      "epoch": 0.5053925613955209,
      "grad_norm": 0.47367504239082336,
      "learning_rate": 0.0002,
      "loss": 0.2945,
      "step": 4440
    },
    {
      "epoch": 0.5076691044648701,
      "grad_norm": 0.45659711956977844,
      "learning_rate": 0.0002,
      "loss": 0.304,
      "step": 4460
    },
    {
      "epoch": 0.5099456475342193,
      "grad_norm": 0.4942033290863037,
      "learning_rate": 0.0002,
      "loss": 0.2969,
      "step": 4480
    },
    {
      "epoch": 0.5122221906035684,
      "grad_norm": 0.46578243374824524,
      "learning_rate": 0.0002,
      "loss": 0.2935,
      "step": 4500
    },
    {
      "epoch": 0.5144987336729177,
      "grad_norm": 0.6523891687393188,
      "learning_rate": 0.0002,
      "loss": 0.2823,
      "step": 4520
    },
    {
      "epoch": 0.5167752767422669,
      "grad_norm": 0.4787238538265228,
      "learning_rate": 0.0002,
      "loss": 0.3148,
      "step": 4540
    },
    {
      "epoch": 0.5190518198116161,
      "grad_norm": 0.46825891733169556,
      "learning_rate": 0.0002,
      "loss": 0.3089,
      "step": 4560
    },
    {
      "epoch": 0.5213283628809653,
      "grad_norm": 0.46605536341667175,
      "learning_rate": 0.0002,
      "loss": 0.3012,
      "step": 4580
    },
    {
      "epoch": 0.5236049059503144,
      "grad_norm": 0.5826888680458069,
      "learning_rate": 0.0002,
      "loss": 0.3043,
      "step": 4600
    },
    {
      "epoch": 0.5258814490196636,
      "grad_norm": 0.48641151189804077,
      "learning_rate": 0.0002,
      "loss": 0.2952,
      "step": 4620
    },
    {
      "epoch": 0.5281579920890128,
      "grad_norm": 0.5396175384521484,
      "learning_rate": 0.0002,
      "loss": 0.2926,
      "step": 4640
    },
    {
      "epoch": 0.530434535158362,
      "grad_norm": 0.5584241151809692,
      "learning_rate": 0.0002,
      "loss": 0.3048,
      "step": 4660
    },
    {
      "epoch": 0.5327110782277112,
      "grad_norm": 0.5832685232162476,
      "learning_rate": 0.0002,
      "loss": 0.2948,
      "step": 4680
    },
    {
      "epoch": 0.5349876212970605,
      "grad_norm": 0.4676337242126465,
      "learning_rate": 0.0002,
      "loss": 0.3043,
      "step": 4700
    },
    {
      "epoch": 0.5372641643664096,
      "grad_norm": 0.4440428614616394,
      "learning_rate": 0.0002,
      "loss": 0.288,
      "step": 4720
    },
    {
      "epoch": 0.5395407074357588,
      "grad_norm": 0.49934279918670654,
      "learning_rate": 0.0002,
      "loss": 0.2882,
      "step": 4740
    },
    {
      "epoch": 0.541817250505108,
      "grad_norm": 0.5172054171562195,
      "learning_rate": 0.0002,
      "loss": 0.3225,
      "step": 4760
    },
    {
      "epoch": 0.5440937935744572,
      "grad_norm": 0.4527619183063507,
      "learning_rate": 0.0002,
      "loss": 0.2869,
      "step": 4780
    },
    {
      "epoch": 0.5463703366438064,
      "grad_norm": 0.548918604850769,
      "learning_rate": 0.0002,
      "loss": 0.3105,
      "step": 4800
    },
    {
      "epoch": 0.5486468797131556,
      "grad_norm": 0.48801419138908386,
      "learning_rate": 0.0002,
      "loss": 0.2835,
      "step": 4820
    },
    {
      "epoch": 0.5509234227825047,
      "grad_norm": 0.49810609221458435,
      "learning_rate": 0.0002,
      "loss": 0.3227,
      "step": 4840
    },
    {
      "epoch": 0.5531999658518539,
      "grad_norm": 0.49763086438179016,
      "learning_rate": 0.0002,
      "loss": 0.2786,
      "step": 4860
    },
    {
      "epoch": 0.5554765089212031,
      "grad_norm": 0.48815059661865234,
      "learning_rate": 0.0002,
      "loss": 0.2802,
      "step": 4880
    },
    {
      "epoch": 0.5577530519905524,
      "grad_norm": 0.3571115732192993,
      "learning_rate": 0.0002,
      "loss": 0.2796,
      "step": 4900
    },
    {
      "epoch": 0.5600295950599016,
      "grad_norm": 0.6448425650596619,
      "learning_rate": 0.0002,
      "loss": 0.2844,
      "step": 4920
    },
    {
      "epoch": 0.5623061381292508,
      "grad_norm": 0.49660468101501465,
      "learning_rate": 0.0002,
      "loss": 0.2892,
      "step": 4940
    },
    {
      "epoch": 0.5645826811985999,
      "grad_norm": 0.47702720761299133,
      "learning_rate": 0.0002,
      "loss": 0.3111,
      "step": 4960
    },
    {
      "epoch": 0.5668592242679491,
      "grad_norm": 0.5281921029090881,
      "learning_rate": 0.0002,
      "loss": 0.2908,
      "step": 4980
    },
    {
      "epoch": 0.5691357673372983,
      "grad_norm": 0.6427987813949585,
      "learning_rate": 0.0002,
      "loss": 0.2848,
      "step": 5000
    },
    {
      "epoch": 0.5714123104066475,
      "grad_norm": 0.5437233448028564,
      "learning_rate": 0.0002,
      "loss": 0.3023,
      "step": 5020
    },
    {
      "epoch": 0.5736888534759967,
      "grad_norm": 0.517444372177124,
      "learning_rate": 0.0002,
      "loss": 0.2876,
      "step": 5040
    },
    {
      "epoch": 0.5759653965453458,
      "grad_norm": 0.5197298526763916,
      "learning_rate": 0.0002,
      "loss": 0.304,
      "step": 5060
    },
    {
      "epoch": 0.578241939614695,
      "grad_norm": 0.3452152907848358,
      "learning_rate": 0.0002,
      "loss": 0.2794,
      "step": 5080
    },
    {
      "epoch": 0.5805184826840443,
      "grad_norm": 0.5630306601524353,
      "learning_rate": 0.0002,
      "loss": 0.2979,
      "step": 5100
    },
    {
      "epoch": 0.5827950257533935,
      "grad_norm": 0.5696737170219421,
      "learning_rate": 0.0002,
      "loss": 0.3035,
      "step": 5120
    },
    {
      "epoch": 0.5850715688227427,
      "grad_norm": 0.5024551153182983,
      "learning_rate": 0.0002,
      "loss": 0.2717,
      "step": 5140
    },
    {
      "epoch": 0.5873481118920919,
      "grad_norm": 0.4166383147239685,
      "learning_rate": 0.0002,
      "loss": 0.3065,
      "step": 5160
    },
    {
      "epoch": 0.589624654961441,
      "grad_norm": 0.36780408024787903,
      "learning_rate": 0.0002,
      "loss": 0.2864,
      "step": 5180
    },
    {
      "epoch": 0.5919011980307902,
      "grad_norm": 0.436526894569397,
      "learning_rate": 0.0002,
      "loss": 0.2764,
      "step": 5200
    },
    {
      "epoch": 0.5941777411001394,
      "grad_norm": 0.43115249276161194,
      "learning_rate": 0.0002,
      "loss": 0.2791,
      "step": 5220
    },
    {
      "epoch": 0.5964542841694886,
      "grad_norm": 0.359739750623703,
      "learning_rate": 0.0002,
      "loss": 0.3108,
      "step": 5240
    },
    {
      "epoch": 0.5987308272388379,
      "grad_norm": 0.4555259644985199,
      "learning_rate": 0.0002,
      "loss": 0.2623,
      "step": 5260
    },
    {
      "epoch": 0.6010073703081871,
      "grad_norm": 0.4587076008319855,
      "learning_rate": 0.0002,
      "loss": 0.293,
      "step": 5280
    },
    {
      "epoch": 0.6032839133775362,
      "grad_norm": 0.5236973166465759,
      "learning_rate": 0.0002,
      "loss": 0.2888,
      "step": 5300
    },
    {
      "epoch": 0.6055604564468854,
      "grad_norm": 0.46685513854026794,
      "learning_rate": 0.0002,
      "loss": 0.2731,
      "step": 5320
    },
    {
      "epoch": 0.6078369995162346,
      "grad_norm": 0.5701884627342224,
      "learning_rate": 0.0002,
      "loss": 0.28,
      "step": 5340
    },
    {
      "epoch": 0.6101135425855838,
      "grad_norm": 0.5002717971801758,
      "learning_rate": 0.0002,
      "loss": 0.2777,
      "step": 5360
    },
    {
      "epoch": 0.612390085654933,
      "grad_norm": 0.5896885395050049,
      "learning_rate": 0.0002,
      "loss": 0.3048,
      "step": 5380
    },
    {
      "epoch": 0.6146666287242821,
      "grad_norm": 0.49014943838119507,
      "learning_rate": 0.0002,
      "loss": 0.2642,
      "step": 5400
    },
    {
      "epoch": 0.6169431717936313,
      "grad_norm": 0.5924846529960632,
      "learning_rate": 0.0002,
      "loss": 0.2943,
      "step": 5420
    },
    {
      "epoch": 0.6192197148629806,
      "grad_norm": 0.49827829003334045,
      "learning_rate": 0.0002,
      "loss": 0.2879,
      "step": 5440
    },
    {
      "epoch": 0.6214962579323298,
      "grad_norm": 0.45312178134918213,
      "learning_rate": 0.0002,
      "loss": 0.2728,
      "step": 5460
    },
    {
      "epoch": 0.623772801001679,
      "grad_norm": 0.3595191538333893,
      "learning_rate": 0.0002,
      "loss": 0.2713,
      "step": 5480
    },
    {
      "epoch": 0.6260493440710282,
      "grad_norm": 0.6547619104385376,
      "learning_rate": 0.0002,
      "loss": 0.2855,
      "step": 5500
    },
    {
      "epoch": 0.6283258871403773,
      "grad_norm": 0.4659534692764282,
      "learning_rate": 0.0002,
      "loss": 0.2908,
      "step": 5520
    },
    {
      "epoch": 0.6306024302097265,
      "grad_norm": 0.4027460813522339,
      "learning_rate": 0.0002,
      "loss": 0.2651,
      "step": 5540
    },
    {
      "epoch": 0.6328789732790757,
      "grad_norm": 0.36129653453826904,
      "learning_rate": 0.0002,
      "loss": 0.2915,
      "step": 5560
    },
    {
      "epoch": 0.6351555163484249,
      "grad_norm": 0.5963912010192871,
      "learning_rate": 0.0002,
      "loss": 0.2968,
      "step": 5580
    },
    {
      "epoch": 0.6374320594177741,
      "grad_norm": 0.49669450521469116,
      "learning_rate": 0.0002,
      "loss": 0.2965,
      "step": 5600
    },
    {
      "epoch": 0.6397086024871234,
      "grad_norm": 0.5784302353858948,
      "learning_rate": 0.0002,
      "loss": 0.2626,
      "step": 5620
    },
    {
      "epoch": 0.6419851455564725,
      "grad_norm": 0.5651645660400391,
      "learning_rate": 0.0002,
      "loss": 0.2738,
      "step": 5640
    },
    {
      "epoch": 0.6442616886258217,
      "grad_norm": 0.45475292205810547,
      "learning_rate": 0.0002,
      "loss": 0.2653,
      "step": 5660
    },
    {
      "epoch": 0.6465382316951709,
      "grad_norm": 0.4691898822784424,
      "learning_rate": 0.0002,
      "loss": 0.2634,
      "step": 5680
    },
    {
      "epoch": 0.6488147747645201,
      "grad_norm": 0.4604431092739105,
      "learning_rate": 0.0002,
      "loss": 0.2838,
      "step": 5700
    },
    {
      "epoch": 0.6510913178338693,
      "grad_norm": 0.506804883480072,
      "learning_rate": 0.0002,
      "loss": 0.2657,
      "step": 5720
    },
    {
      "epoch": 0.6533678609032184,
      "grad_norm": 0.5051881670951843,
      "learning_rate": 0.0002,
      "loss": 0.2976,
      "step": 5740
    },
    {
      "epoch": 0.6556444039725676,
      "grad_norm": 0.4780672788619995,
      "learning_rate": 0.0002,
      "loss": 0.2828,
      "step": 5760
    },
    {
      "epoch": 0.6579209470419168,
      "grad_norm": 0.4695095121860504,
      "learning_rate": 0.0002,
      "loss": 0.2685,
      "step": 5780
    },
    {
      "epoch": 0.660197490111266,
      "grad_norm": 0.4259052276611328,
      "learning_rate": 0.0002,
      "loss": 0.2635,
      "step": 5800
    },
    {
      "epoch": 0.6624740331806153,
      "grad_norm": 0.5684182643890381,
      "learning_rate": 0.0002,
      "loss": 0.2879,
      "step": 5820
    },
    {
      "epoch": 0.6647505762499645,
      "grad_norm": 0.42193594574928284,
      "learning_rate": 0.0002,
      "loss": 0.2678,
      "step": 5840
    },
    {
      "epoch": 0.6670271193193136,
      "grad_norm": 0.5095034241676331,
      "learning_rate": 0.0002,
      "loss": 0.2677,
      "step": 5860
    },
    {
      "epoch": 0.6693036623886628,
      "grad_norm": 0.46626052260398865,
      "learning_rate": 0.0002,
      "loss": 0.2906,
      "step": 5880
    },
    {
      "epoch": 0.671580205458012,
      "grad_norm": 0.5086765289306641,
      "learning_rate": 0.0002,
      "loss": 0.2775,
      "step": 5900
    },
    {
      "epoch": 0.6738567485273612,
      "grad_norm": 0.44444966316223145,
      "learning_rate": 0.0002,
      "loss": 0.2764,
      "step": 5920
    },
    {
      "epoch": 0.6761332915967104,
      "grad_norm": 0.4477381706237793,
      "learning_rate": 0.0002,
      "loss": 0.2729,
      "step": 5940
    },
    {
      "epoch": 0.6784098346660596,
      "grad_norm": 0.46984028816223145,
      "learning_rate": 0.0002,
      "loss": 0.273,
      "step": 5960
    },
    {
      "epoch": 0.6806863777354087,
      "grad_norm": 0.417084276676178,
      "learning_rate": 0.0002,
      "loss": 0.2744,
      "step": 5980
    },
    {
      "epoch": 0.682962920804758,
      "grad_norm": 0.4144213795661926,
      "learning_rate": 0.0002,
      "loss": 0.2704,
      "step": 6000
    },
    {
      "epoch": 0.6852394638741072,
      "grad_norm": 0.5844799876213074,
      "learning_rate": 0.0002,
      "loss": 0.2635,
      "step": 6020
    },
    {
      "epoch": 0.6875160069434564,
      "grad_norm": 0.39512693881988525,
      "learning_rate": 0.0002,
      "loss": 0.2471,
      "step": 6040
    },
    {
      "epoch": 0.6897925500128056,
      "grad_norm": 0.5299990773200989,
      "learning_rate": 0.0002,
      "loss": 0.2648,
      "step": 6060
    },
    {
      "epoch": 0.6920690930821547,
      "grad_norm": 0.4980265498161316,
      "learning_rate": 0.0002,
      "loss": 0.2725,
      "step": 6080
    },
    {
      "epoch": 0.6943456361515039,
      "grad_norm": 0.4003869891166687,
      "learning_rate": 0.0002,
      "loss": 0.2768,
      "step": 6100
    },
    {
      "epoch": 0.6966221792208531,
      "grad_norm": 0.5103460550308228,
      "learning_rate": 0.0002,
      "loss": 0.2638,
      "step": 6120
    },
    {
      "epoch": 0.6988987222902023,
      "grad_norm": 0.737101137638092,
      "learning_rate": 0.0002,
      "loss": 0.2779,
      "step": 6140
    },
    {
      "epoch": 0.7011752653595515,
      "grad_norm": 0.4731826186180115,
      "learning_rate": 0.0002,
      "loss": 0.2691,
      "step": 6160
    },
    {
      "epoch": 0.7034518084289008,
      "grad_norm": 0.5234053730964661,
      "learning_rate": 0.0002,
      "loss": 0.2739,
      "step": 6180
    },
    {
      "epoch": 0.7057283514982499,
      "grad_norm": 0.5235525369644165,
      "learning_rate": 0.0002,
      "loss": 0.2754,
      "step": 6200
    },
    {
      "epoch": 0.7080048945675991,
      "grad_norm": 0.4453619122505188,
      "learning_rate": 0.0002,
      "loss": 0.2833,
      "step": 6220
    },
    {
      "epoch": 0.7102814376369483,
      "grad_norm": 0.4025666117668152,
      "learning_rate": 0.0002,
      "loss": 0.2713,
      "step": 6240
    },
    {
      "epoch": 0.7125579807062975,
      "grad_norm": 0.35240331292152405,
      "learning_rate": 0.0002,
      "loss": 0.2786,
      "step": 6260
    },
    {
      "epoch": 0.7148345237756467,
      "grad_norm": 0.4521905779838562,
      "learning_rate": 0.0002,
      "loss": 0.2639,
      "step": 6280
    },
    {
      "epoch": 0.7171110668449959,
      "grad_norm": 0.5230519771575928,
      "learning_rate": 0.0002,
      "loss": 0.2517,
      "step": 6300
    },
    {
      "epoch": 0.719387609914345,
      "grad_norm": 0.5415637493133545,
      "learning_rate": 0.0002,
      "loss": 0.2739,
      "step": 6320
    },
    {
      "epoch": 0.7216641529836942,
      "grad_norm": 0.4067966341972351,
      "learning_rate": 0.0002,
      "loss": 0.2751,
      "step": 6340
    },
    {
      "epoch": 0.7239406960530435,
      "grad_norm": 0.4670214354991913,
      "learning_rate": 0.0002,
      "loss": 0.2644,
      "step": 6360
    },
    {
      "epoch": 0.7262172391223927,
      "grad_norm": 0.5316203236579895,
      "learning_rate": 0.0002,
      "loss": 0.2746,
      "step": 6380
    },
    {
      "epoch": 0.7284937821917419,
      "grad_norm": 0.46312493085861206,
      "learning_rate": 0.0002,
      "loss": 0.2539,
      "step": 6400
    },
    {
      "epoch": 0.730770325261091,
      "grad_norm": 0.465279221534729,
      "learning_rate": 0.0002,
      "loss": 0.2742,
      "step": 6420
    },
    {
      "epoch": 0.7330468683304402,
      "grad_norm": 0.5096962451934814,
      "learning_rate": 0.0002,
      "loss": 0.2546,
      "step": 6440
    },
    {
      "epoch": 0.7353234113997894,
      "grad_norm": 0.4525590240955353,
      "learning_rate": 0.0002,
      "loss": 0.2694,
      "step": 6460
    },
    {
      "epoch": 0.7375999544691386,
      "grad_norm": 0.5033881664276123,
      "learning_rate": 0.0002,
      "loss": 0.2627,
      "step": 6480
    },
    {
      "epoch": 0.7398764975384878,
      "grad_norm": 0.44053900241851807,
      "learning_rate": 0.0002,
      "loss": 0.258,
      "step": 6500
    },
    {
      "epoch": 0.742153040607837,
      "grad_norm": 0.4677462875843048,
      "learning_rate": 0.0002,
      "loss": 0.2659,
      "step": 6520
    },
    {
      "epoch": 0.7444295836771861,
      "grad_norm": 0.5687553882598877,
      "learning_rate": 0.0002,
      "loss": 0.271,
      "step": 6540
    },
    {
      "epoch": 0.7467061267465354,
      "grad_norm": 0.4980468451976776,
      "learning_rate": 0.0002,
      "loss": 0.265,
      "step": 6560
    },
    {
      "epoch": 0.7489826698158846,
      "grad_norm": 0.5155619382858276,
      "learning_rate": 0.0002,
      "loss": 0.2491,
      "step": 6580
    },
    {
      "epoch": 0.7512592128852338,
      "grad_norm": 0.5364673733711243,
      "learning_rate": 0.0002,
      "loss": 0.2564,
      "step": 6600
    },
    {
      "epoch": 0.753535755954583,
      "grad_norm": 0.421838641166687,
      "learning_rate": 0.0002,
      "loss": 0.267,
      "step": 6620
    },
    {
      "epoch": 0.7558122990239322,
      "grad_norm": 0.46299833059310913,
      "learning_rate": 0.0002,
      "loss": 0.2461,
      "step": 6640
    },
    {
      "epoch": 0.7580888420932813,
      "grad_norm": 0.3832832872867584,
      "learning_rate": 0.0002,
      "loss": 0.265,
      "step": 6660
    },
    {
      "epoch": 0.7603653851626305,
      "grad_norm": 0.5560947060585022,
      "learning_rate": 0.0002,
      "loss": 0.253,
      "step": 6680
    },
    {
      "epoch": 0.7626419282319797,
      "grad_norm": 0.4832628667354584,
      "learning_rate": 0.0002,
      "loss": 0.2515,
      "step": 6700
    },
    {
      "epoch": 0.764918471301329,
      "grad_norm": 0.44354599714279175,
      "learning_rate": 0.0002,
      "loss": 0.2687,
      "step": 6720
    },
    {
      "epoch": 0.7671950143706782,
      "grad_norm": 0.3746070861816406,
      "learning_rate": 0.0002,
      "loss": 0.2481,
      "step": 6740
    },
    {
      "epoch": 0.7694715574400273,
      "grad_norm": 0.3048388659954071,
      "learning_rate": 0.0002,
      "loss": 0.269,
      "step": 6760
    },
    {
      "epoch": 0.7717481005093765,
      "grad_norm": 0.46471843123435974,
      "learning_rate": 0.0002,
      "loss": 0.2642,
      "step": 6780
    },
    {
      "epoch": 0.7740246435787257,
      "grad_norm": 0.44309428334236145,
      "learning_rate": 0.0002,
      "loss": 0.2565,
      "step": 6800
    },
    {
      "epoch": 0.7763011866480749,
      "grad_norm": 0.4174291789531708,
      "learning_rate": 0.0002,
      "loss": 0.262,
      "step": 6820
    },
    {
      "epoch": 0.7785777297174241,
      "grad_norm": 0.42592549324035645,
      "learning_rate": 0.0002,
      "loss": 0.2608,
      "step": 6840
    },
    {
      "epoch": 0.7808542727867733,
      "grad_norm": 0.4378054141998291,
      "learning_rate": 0.0002,
      "loss": 0.2765,
      "step": 6860
    },
    {
      "epoch": 0.7831308158561224,
      "grad_norm": 0.4560708701610565,
      "learning_rate": 0.0002,
      "loss": 0.2381,
      "step": 6880
    },
    {
      "epoch": 0.7854073589254716,
      "grad_norm": 0.4595545828342438,
      "learning_rate": 0.0002,
      "loss": 0.2561,
      "step": 6900
    },
    {
      "epoch": 0.7876839019948209,
      "grad_norm": 0.45213592052459717,
      "learning_rate": 0.0002,
      "loss": 0.2645,
      "step": 6920
    },
    {
      "epoch": 0.7899604450641701,
      "grad_norm": 0.4857342839241028,
      "learning_rate": 0.0002,
      "loss": 0.2687,
      "step": 6940
    },
    {
      "epoch": 0.7922369881335193,
      "grad_norm": 0.4939437508583069,
      "learning_rate": 0.0002,
      "loss": 0.2642,
      "step": 6960
    },
    {
      "epoch": 0.7945135312028685,
      "grad_norm": 0.46244382858276367,
      "learning_rate": 0.0002,
      "loss": 0.2536,
      "step": 6980
    },
    {
      "epoch": 0.7967900742722176,
      "grad_norm": 0.5876993536949158,
      "learning_rate": 0.0002,
      "loss": 0.2492,
      "step": 7000
    },
    {
      "epoch": 0.7990666173415668,
      "grad_norm": 0.5170072913169861,
      "learning_rate": 0.0002,
      "loss": 0.2548,
      "step": 7020
    },
    {
      "epoch": 0.801343160410916,
      "grad_norm": 0.394380658864975,
      "learning_rate": 0.0002,
      "loss": 0.2524,
      "step": 7040
    },
    {
      "epoch": 0.8036197034802652,
      "grad_norm": 0.4716455340385437,
      "learning_rate": 0.0002,
      "loss": 0.2573,
      "step": 7060
    },
    {
      "epoch": 0.8058962465496144,
      "grad_norm": 0.34525179862976074,
      "learning_rate": 0.0002,
      "loss": 0.246,
      "step": 7080
    },
    {
      "epoch": 0.8081727896189635,
      "grad_norm": 0.5030418038368225,
      "learning_rate": 0.0002,
      "loss": 0.2596,
      "step": 7100
    },
    {
      "epoch": 0.8104493326883128,
      "grad_norm": 0.5586132407188416,
      "learning_rate": 0.0002,
      "loss": 0.2568,
      "step": 7120
    },
    {
      "epoch": 0.812725875757662,
      "grad_norm": 0.47025129199028015,
      "learning_rate": 0.0002,
      "loss": 0.265,
      "step": 7140
    },
    {
      "epoch": 0.8150024188270112,
      "grad_norm": 0.5654832720756531,
      "learning_rate": 0.0002,
      "loss": 0.2468,
      "step": 7160
    },
    {
      "epoch": 0.8172789618963604,
      "grad_norm": 0.4701017141342163,
      "learning_rate": 0.0002,
      "loss": 0.2538,
      "step": 7180
    },
    {
      "epoch": 0.8195555049657096,
      "grad_norm": 0.47270438075065613,
      "learning_rate": 0.0002,
      "loss": 0.2529,
      "step": 7200
    },
    {
      "epoch": 0.8218320480350587,
      "grad_norm": 0.39433714747428894,
      "learning_rate": 0.0002,
      "loss": 0.2445,
      "step": 7220
    },
    {
      "epoch": 0.8241085911044079,
      "grad_norm": 0.4521467685699463,
      "learning_rate": 0.0002,
      "loss": 0.2556,
      "step": 7240
    },
    {
      "epoch": 0.8263851341737571,
      "grad_norm": 0.28483667969703674,
      "learning_rate": 0.0002,
      "loss": 0.2451,
      "step": 7260
    },
    {
      "epoch": 0.8286616772431064,
      "grad_norm": 0.4298310875892639,
      "learning_rate": 0.0002,
      "loss": 0.2599,
      "step": 7280
    },
    {
      "epoch": 0.8309382203124556,
      "grad_norm": 0.39677906036376953,
      "learning_rate": 0.0002,
      "loss": 0.2539,
      "step": 7300
    },
    {
      "epoch": 0.8332147633818048,
      "grad_norm": 0.5800175666809082,
      "learning_rate": 0.0002,
      "loss": 0.2463,
      "step": 7320
    },
    {
      "epoch": 0.8354913064511539,
      "grad_norm": 0.42742472887039185,
      "learning_rate": 0.0002,
      "loss": 0.2593,
      "step": 7340
    },
    {
      "epoch": 0.8377678495205031,
      "grad_norm": 0.5521807670593262,
      "learning_rate": 0.0002,
      "loss": 0.253,
      "step": 7360
    },
    {
      "epoch": 0.8400443925898523,
      "grad_norm": 0.5068047046661377,
      "learning_rate": 0.0002,
| "loss": 0.2503, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.8423209356592015, | |
| "grad_norm": 0.4325120151042938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2466, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.8445974787285507, | |
| "grad_norm": 0.5130394101142883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2521, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.8468740217978998, | |
| "grad_norm": 0.5091120600700378, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2429, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.849150564867249, | |
| "grad_norm": 0.4635036289691925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.235, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.8514271079365983, | |
| "grad_norm": 0.3827108144760132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2487, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.8537036510059475, | |
| "grad_norm": 0.3880899250507355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2469, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.8559801940752967, | |
| "grad_norm": 0.408933162689209, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2499, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.8582567371446459, | |
| "grad_norm": 0.5049706101417542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2418, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.860533280213995, | |
| "grad_norm": 0.43551701307296753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2478, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 0.8628098232833442, | |
| "grad_norm": 0.5024411678314209, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2538, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 0.8650863663526934, | |
| "grad_norm": 0.36361223459243774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2536, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.8673629094220426, | |
| "grad_norm": 0.4526277482509613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.242, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 0.8696394524913919, | |
| "grad_norm": 0.5677676200866699, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2572, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 0.8719159955607411, | |
| "grad_norm": 0.4915711283683777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2562, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 0.8741925386300902, | |
| "grad_norm": 0.36850452423095703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2523, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.8764690816994394, | |
| "grad_norm": 0.38313761353492737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2596, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.8787456247687886, | |
| "grad_norm": 0.5384640097618103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2455, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 0.8810221678381378, | |
| "grad_norm": 0.5308900475502014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2439, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 0.883298710907487, | |
| "grad_norm": 0.5488154292106628, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2428, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 0.8855752539768362, | |
| "grad_norm": 0.5271242260932922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2372, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 0.8878517970461853, | |
| "grad_norm": 0.46171802282333374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2506, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.8901283401155345, | |
| "grad_norm": 0.45436665415763855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2414, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 0.8924048831848838, | |
| "grad_norm": 0.4920847415924072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2669, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 0.894681426254233, | |
| "grad_norm": 0.5913518071174622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2552, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 0.8969579693235822, | |
| "grad_norm": 0.6011972427368164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2533, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 0.8992345123929313, | |
| "grad_norm": 0.4650927186012268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2448, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.9015110554622805, | |
| "grad_norm": 0.5828790664672852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2381, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 0.9037875985316297, | |
| "grad_norm": 0.5178338885307312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2619, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 0.9060641416009789, | |
| "grad_norm": 0.5147708058357239, | |
| "learning_rate": 0.0002, | |
| "loss": 0.258, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 0.9083406846703281, | |
| "grad_norm": 0.45790836215019226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2474, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 0.9106172277396773, | |
| "grad_norm": 0.3837074935436249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2356, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.9128937708090265, | |
| "grad_norm": 0.4466090500354767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.237, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 0.9151703138783757, | |
| "grad_norm": 0.5893344283103943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2399, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 0.9174468569477249, | |
| "grad_norm": 0.49547362327575684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2526, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 0.9197234000170741, | |
| "grad_norm": 0.47068551182746887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2631, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 0.9219999430864233, | |
| "grad_norm": 0.3512951135635376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2395, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.9242764861557725, | |
| "grad_norm": 0.3996793031692505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2424, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 0.9265530292251216, | |
| "grad_norm": 0.5782022476196289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2549, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 0.9288295722944708, | |
| "grad_norm": 0.450860857963562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2465, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 0.93110611536382, | |
| "grad_norm": 0.4679816663265228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2326, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 0.9333826584331693, | |
| "grad_norm": 0.5497337579727173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2457, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.9356592015025185, | |
| "grad_norm": 0.3775748312473297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2331, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 0.9379357445718676, | |
| "grad_norm": 0.5428327918052673, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2399, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 0.9402122876412168, | |
| "grad_norm": 0.4089830219745636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.246, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 0.942488830710566, | |
| "grad_norm": 0.5781340003013611, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2451, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 0.9447653737799152, | |
| "grad_norm": 0.5869989395141602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2541, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.9470419168492644, | |
| "grad_norm": 0.47708019614219666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2559, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 0.9493184599186136, | |
| "grad_norm": 0.5445525050163269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2466, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 0.9515950029879627, | |
| "grad_norm": 0.480214387178421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.236, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 0.953871546057312, | |
| "grad_norm": 0.5392053127288818, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2383, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 0.9561480891266612, | |
| "grad_norm": 0.4515858292579651, | |
| "learning_rate": 0.0002, | |
| "loss": 0.238, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.9584246321960104, | |
| "grad_norm": 0.5461826324462891, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2442, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 0.9607011752653596, | |
| "grad_norm": 0.44309332966804504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2622, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 0.9629777183347088, | |
| "grad_norm": 0.5409505367279053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2303, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 0.9652542614040579, | |
| "grad_norm": 0.3868342638015747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2624, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 0.9675308044734071, | |
| "grad_norm": 0.38888975977897644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.246, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.9698073475427563, | |
| "grad_norm": 0.38946032524108887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2503, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 0.9720838906121055, | |
| "grad_norm": 0.42425817251205444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2556, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 0.9743604336814548, | |
| "grad_norm": 0.41515296697616577, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2437, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 0.9766369767508039, | |
| "grad_norm": 0.4085826575756073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2293, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 0.9789135198201531, | |
| "grad_norm": 0.3404542803764343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.242, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.9811900628895023, | |
| "grad_norm": 0.43266579508781433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2513, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 0.9834666059588515, | |
| "grad_norm": 0.42724549770355225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2384, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 0.9857431490282007, | |
| "grad_norm": 0.5089221596717834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2409, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 0.9880196920975499, | |
| "grad_norm": 0.519223690032959, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2353, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 0.990296235166899, | |
| "grad_norm": 0.5701056122779846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2486, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.9925727782362482, | |
| "grad_norm": 0.4519595503807068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2374, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 0.9948493213055974, | |
| "grad_norm": 0.4883946180343628, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2441, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 0.9971258643749467, | |
| "grad_norm": 0.6918900012969971, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2403, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 0.9994024074442959, | |
| "grad_norm": 0.4810091555118561, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2334, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.30941203236579895, | |
| "eval_runtime": 408.7196, | |
| "eval_samples_per_second": 7.083, | |
| "eval_steps_per_second": 0.886, | |
| "step": 8786 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 13000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 77, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.923169198364426e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
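
For readers who want to inspect this state offline, a minimal sketch follows. It is not part of the log itself: the filename `trainer_state.json` is an assumption (the name the Trainer writes inside each `checkpoint-*` directory), and the script relies only on fields actually present above (`log_history`, `loss`, `eval_loss`, `step`, `epoch`).

```python
import json

# Minimal sketch (an assumption, not part of the log): load a Trainer state
# file like the one above and summarize the loss curve. Hugging Face Trainer
# writes "trainer_state.json" into each checkpoint-* directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

first, last = train_logs[0], train_logs[-1]
print(f"train loss {first['loss']:.4f} (step {first['step']}) -> "
      f"{last['loss']:.4f} (step {last['step']})")

if eval_logs:
    ev = eval_logs[-1]
    tail = train_logs[-10:]
    recent = sum(e["loss"] for e in tail) / len(tail)
    # A gap between eval loss and recent train loss (here ~0.31 vs ~0.24)
    # is one rough signal of overfitting worth watching in epoch 2.
    print(f"eval loss at epoch {ev['epoch']}: {ev['eval_loss']:.4f} "
          f"(recent train avg {recent:.4f})")
```

On this file the script reports a train loss falling from 1.9469 at step 20 to 0.2334 at step 8780, with an end-of-epoch eval loss of 0.3094; whether that eval/train gap widens is something to check against the checkpoints saved every 77 steps as training continues toward `max_steps` 13000.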