{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 20,
  "global_step": 12178,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0016424069473813874,
      "grad_norm": 0.4667005240917206,
      "learning_rate": 0.0002,
      "loss": 1.9661,
      "step": 20
    },
    {
      "epoch": 0.003284813894762775,
      "grad_norm": 0.5031771063804626,
      "learning_rate": 0.0002,
      "loss": 1.602,
      "step": 40
    },
    {
      "epoch": 0.004927220842144162,
      "grad_norm": 0.4090685546398163,
      "learning_rate": 0.0002,
      "loss": 1.4703,
      "step": 60
    },
    {
      "epoch": 0.00656962778952555,
      "grad_norm": 0.4099690020084381,
      "learning_rate": 0.0002,
      "loss": 1.3652,
      "step": 80
    },
    {
      "epoch": 0.008212034736906937,
      "grad_norm": 0.4610142111778259,
      "learning_rate": 0.0002,
      "loss": 1.4386,
      "step": 100
    },
    {
      "epoch": 0.009854441684288324,
      "grad_norm": 0.3908289968967438,
      "learning_rate": 0.0002,
      "loss": 1.3151,
      "step": 120
    },
    {
      "epoch": 0.011496848631669712,
      "grad_norm": 0.4541659951210022,
      "learning_rate": 0.0002,
      "loss": 1.1233,
      "step": 140
    },
    {
      "epoch": 0.0131392555790511,
      "grad_norm": 0.43324407935142517,
      "learning_rate": 0.0002,
      "loss": 1.1266,
      "step": 160
    },
    {
      "epoch": 0.014781662526432487,
      "grad_norm": 0.3396519720554352,
      "learning_rate": 0.0002,
      "loss": 1.1004,
      "step": 180
    },
    {
      "epoch": 0.016424069473813873,
      "grad_norm": 0.5125846266746521,
      "learning_rate": 0.0002,
      "loss": 1.1258,
      "step": 200
    },
    {
      "epoch": 0.01806647642119526,
      "grad_norm": 0.4572688937187195,
      "learning_rate": 0.0002,
      "loss": 1.1796,
      "step": 220
    },
    {
      "epoch": 0.01970888336857665,
      "grad_norm": 0.434186190366745,
      "learning_rate": 0.0002,
      "loss": 1.1016,
      "step": 240
    },
    {
      "epoch": 0.021351290315958036,
      "grad_norm": 0.5205552577972412,
      "learning_rate": 0.0002,
      "loss": 1.0419,
      "step": 260
    },
    {
      "epoch": 0.022993697263339424,
      "grad_norm": 0.3958785831928253,
      "learning_rate": 0.0002,
      "loss": 0.9515,
      "step": 280
    },
    {
      "epoch": 0.02463610421072081,
      "grad_norm": 0.46327391266822815,
      "learning_rate": 0.0002,
      "loss": 1.0079,
      "step": 300
    },
    {
      "epoch": 0.0262785111581022,
      "grad_norm": 0.39861008524894714,
      "learning_rate": 0.0002,
      "loss": 0.9755,
      "step": 320
    },
    {
      "epoch": 0.027920918105483587,
      "grad_norm": 0.42074650526046753,
      "learning_rate": 0.0002,
      "loss": 0.9435,
      "step": 340
    },
    {
      "epoch": 0.029563325052864974,
      "grad_norm": 0.41754183173179626,
      "learning_rate": 0.0002,
      "loss": 0.9376,
      "step": 360
    },
    {
      "epoch": 0.031205732000246362,
      "grad_norm": 0.3933572769165039,
      "learning_rate": 0.0002,
      "loss": 0.9489,
      "step": 380
    },
    {
      "epoch": 0.032848138947627746,
      "grad_norm": 0.4244033992290497,
      "learning_rate": 0.0002,
      "loss": 0.9759,
      "step": 400
    },
    {
      "epoch": 0.034490545895009134,
      "grad_norm": 0.3638761639595032,
      "learning_rate": 0.0002,
      "loss": 0.9371,
      "step": 420
    },
    {
      "epoch": 0.03613295284239052,
      "grad_norm": 0.4706399738788605,
      "learning_rate": 0.0002,
      "loss": 0.8464,
      "step": 440
    },
    {
      "epoch": 0.03777535978977191,
      "grad_norm": 0.4349803328514099,
      "learning_rate": 0.0002,
      "loss": 0.8918,
      "step": 460
    },
    {
      "epoch": 0.0394177667371533,
      "grad_norm": 0.3831111490726471,
      "learning_rate": 0.0002,
      "loss": 0.8366,
      "step": 480
    },
    {
      "epoch": 0.041060173684534684,
      "grad_norm": 0.4122432470321655,
      "learning_rate": 0.0002,
      "loss": 0.8444,
      "step": 500
    },
    {
      "epoch": 0.04270258063191607,
      "grad_norm": 0.3296256959438324,
      "learning_rate": 0.0002,
      "loss": 0.8301,
      "step": 520
    },
    {
      "epoch": 0.04434498757929746,
      "grad_norm": 0.3447166979312897,
      "learning_rate": 0.0002,
      "loss": 0.857,
      "step": 540
    },
    {
      "epoch": 0.04598739452667885,
      "grad_norm": 0.4408610761165619,
      "learning_rate": 0.0002,
      "loss": 0.8356,
      "step": 560
    },
    {
      "epoch": 0.047629801474060235,
      "grad_norm": 0.4657248854637146,
      "learning_rate": 0.0002,
      "loss": 0.7525,
      "step": 580
    },
    {
      "epoch": 0.04927220842144162,
      "grad_norm": 0.35138434171676636,
      "learning_rate": 0.0002,
      "loss": 0.7486,
      "step": 600
    },
    {
      "epoch": 0.05091461536882301,
      "grad_norm": 0.4687822461128235,
      "learning_rate": 0.0002,
      "loss": 0.8169,
      "step": 620
    },
    {
      "epoch": 0.0525570223162044,
      "grad_norm": 0.465108186006546,
      "learning_rate": 0.0002,
      "loss": 0.738,
      "step": 640
    },
    {
      "epoch": 0.054199429263585785,
      "grad_norm": 0.3954925835132599,
      "learning_rate": 0.0002,
      "loss": 0.7627,
      "step": 660
    },
    {
      "epoch": 0.05584183621096717,
      "grad_norm": 0.5010778307914734,
      "learning_rate": 0.0002,
      "loss": 0.7273,
      "step": 680
    },
    {
      "epoch": 0.05748424315834856,
      "grad_norm": 0.6221648454666138,
      "learning_rate": 0.0002,
      "loss": 0.7506,
      "step": 700
    },
    {
      "epoch": 0.05912665010572995,
      "grad_norm": 0.4075715243816376,
      "learning_rate": 0.0002,
      "loss": 0.7587,
      "step": 720
    },
    {
      "epoch": 0.060769057053111336,
      "grad_norm": 0.4346787631511688,
      "learning_rate": 0.0002,
      "loss": 0.7627,
      "step": 740
    },
    {
      "epoch": 0.062411464000492724,
      "grad_norm": 0.4146323800086975,
      "learning_rate": 0.0002,
      "loss": 0.6642,
      "step": 760
    },
    {
      "epoch": 0.06405387094787411,
      "grad_norm": 0.4093219041824341,
      "learning_rate": 0.0002,
      "loss": 0.7148,
      "step": 780
    },
    {
      "epoch": 0.06569627789525549,
      "grad_norm": 0.4016498327255249,
      "learning_rate": 0.0002,
      "loss": 0.6522,
      "step": 800
    },
    {
      "epoch": 0.06733868484263689,
      "grad_norm": 0.436252236366272,
      "learning_rate": 0.0002,
      "loss": 0.6884,
      "step": 820
    },
    {
      "epoch": 0.06898109179001827,
      "grad_norm": 0.4362093508243561,
      "learning_rate": 0.0002,
      "loss": 0.7185,
      "step": 840
    },
    {
      "epoch": 0.07062349873739966,
      "grad_norm": 0.42092448472976685,
      "learning_rate": 0.0002,
      "loss": 0.6702,
      "step": 860
    },
    {
      "epoch": 0.07226590568478104,
      "grad_norm": 0.4649953842163086,
      "learning_rate": 0.0002,
      "loss": 0.6753,
      "step": 880
    },
    {
      "epoch": 0.07390831263216244,
      "grad_norm": 0.4321405589580536,
      "learning_rate": 0.0002,
      "loss": 0.6578,
      "step": 900
    },
    {
      "epoch": 0.07555071957954382,
      "grad_norm": 0.5045340657234192,
      "learning_rate": 0.0002,
      "loss": 0.6993,
      "step": 920
    },
    {
      "epoch": 0.07719312652692521,
      "grad_norm": 0.5063377022743225,
      "learning_rate": 0.0002,
      "loss": 0.6654,
      "step": 940
    },
    {
      "epoch": 0.0788355334743066,
      "grad_norm": 0.41710513830184937,
      "learning_rate": 0.0002,
      "loss": 0.6264,
      "step": 960
    },
    {
      "epoch": 0.08047794042168799,
      "grad_norm": 0.4204249083995819,
      "learning_rate": 0.0002,
      "loss": 0.6683,
      "step": 980
    },
    {
      "epoch": 0.08212034736906937,
      "grad_norm": 0.44983726739883423,
      "learning_rate": 0.0002,
      "loss": 0.6592,
      "step": 1000
    },
    {
      "epoch": 0.08376275431645076,
      "grad_norm": 0.5991094708442688,
      "learning_rate": 0.0002,
      "loss": 0.6197,
      "step": 1020
    },
    {
      "epoch": 0.08540516126383214,
      "grad_norm": 0.3672972619533539,
      "learning_rate": 0.0002,
      "loss": 0.5656,
      "step": 1040
    },
    {
      "epoch": 0.08704756821121354,
      "grad_norm": 0.503656804561615,
      "learning_rate": 0.0002,
      "loss": 0.6017,
      "step": 1060
    },
    {
      "epoch": 0.08868997515859492,
      "grad_norm": 0.49204686284065247,
      "learning_rate": 0.0002,
      "loss": 0.6421,
      "step": 1080
    },
    {
      "epoch": 0.09033238210597631,
      "grad_norm": 0.45617127418518066,
      "learning_rate": 0.0002,
      "loss": 0.6176,
      "step": 1100
    },
    {
      "epoch": 0.0919747890533577,
      "grad_norm": 0.49607595801353455,
      "learning_rate": 0.0002,
      "loss": 0.5595,
      "step": 1120
    },
    {
      "epoch": 0.09361719600073909,
      "grad_norm": 0.39171984791755676,
      "learning_rate": 0.0002,
      "loss": 0.5479,
      "step": 1140
    },
    {
      "epoch": 0.09525960294812047,
      "grad_norm": 0.4964667558670044,
      "learning_rate": 0.0002,
      "loss": 0.5937,
      "step": 1160
    },
    {
      "epoch": 0.09690200989550186,
      "grad_norm": 0.40392565727233887,
      "learning_rate": 0.0002,
      "loss": 0.5888,
      "step": 1180
    },
    {
      "epoch": 0.09854441684288325,
      "grad_norm": 0.4721887409687042,
      "learning_rate": 0.0002,
      "loss": 0.5345,
      "step": 1200
    },
    {
      "epoch": 0.10018682379026464,
      "grad_norm": 0.4130144417285919,
      "learning_rate": 0.0002,
      "loss": 0.599,
      "step": 1220
    },
    {
      "epoch": 0.10182923073764602,
      "grad_norm": 0.4222985506057739,
      "learning_rate": 0.0002,
      "loss": 0.5762,
      "step": 1240
    },
    {
      "epoch": 0.1034716376850274,
      "grad_norm": 0.47171750664711,
      "learning_rate": 0.0002,
      "loss": 0.5619,
      "step": 1260
    },
    {
      "epoch": 0.1051140446324088,
      "grad_norm": 0.40906137228012085,
      "learning_rate": 0.0002,
      "loss": 0.5137,
      "step": 1280
    },
    {
      "epoch": 0.10675645157979018,
      "grad_norm": 0.43774527311325073,
      "learning_rate": 0.0002,
      "loss": 0.5888,
      "step": 1300
    },
    {
      "epoch": 0.10839885852717157,
      "grad_norm": 0.5423911213874817,
      "learning_rate": 0.0002,
      "loss": 0.5409,
      "step": 1320
    },
    {
      "epoch": 0.11004126547455295,
      "grad_norm": 0.4405030906200409,
      "learning_rate": 0.0002,
      "loss": 0.5248,
      "step": 1340
    },
    {
      "epoch": 0.11168367242193435,
      "grad_norm": 0.4299491345882416,
      "learning_rate": 0.0002,
      "loss": 0.5196,
      "step": 1360
    },
    {
      "epoch": 0.11332607936931573,
      "grad_norm": 0.5445800423622131,
      "learning_rate": 0.0002,
      "loss": 0.5524,
      "step": 1380
    },
    {
      "epoch": 0.11496848631669712,
      "grad_norm": 0.42257580161094666,
      "learning_rate": 0.0002,
      "loss": 0.5266,
      "step": 1400
    },
    {
      "epoch": 0.1166108932640785,
      "grad_norm": 0.4614318907260895,
      "learning_rate": 0.0002,
      "loss": 0.5593,
      "step": 1420
    },
    {
      "epoch": 0.1182533002114599,
      "grad_norm": 0.5021907687187195,
      "learning_rate": 0.0002,
      "loss": 0.5183,
      "step": 1440
    },
    {
      "epoch": 0.11989570715884128,
      "grad_norm": 0.39399659633636475,
      "learning_rate": 0.0002,
      "loss": 0.516,
      "step": 1460
    },
    {
      "epoch": 0.12153811410622267,
      "grad_norm": 0.5128427743911743,
      "learning_rate": 0.0002,
      "loss": 0.5067,
      "step": 1480
    },
    {
      "epoch": 0.12318052105360405,
      "grad_norm": 0.41359153389930725,
      "learning_rate": 0.0002,
      "loss": 0.508,
      "step": 1500
    },
    {
      "epoch": 0.12482292800098545,
      "grad_norm": 0.5723029375076294,
      "learning_rate": 0.0002,
      "loss": 0.4955,
      "step": 1520
    },
    {
      "epoch": 0.12646533494836684,
      "grad_norm": 0.4619792699813843,
      "learning_rate": 0.0002,
      "loss": 0.5398,
      "step": 1540
    },
    {
      "epoch": 0.12810774189574822,
      "grad_norm": 0.5200566649436951,
      "learning_rate": 0.0002,
      "loss": 0.5213,
      "step": 1560
    },
    {
      "epoch": 0.1297501488431296,
      "grad_norm": 0.4156297445297241,
      "learning_rate": 0.0002,
      "loss": 0.4895,
      "step": 1580
    },
    {
      "epoch": 0.13139255579051098,
      "grad_norm": 0.43649184703826904,
      "learning_rate": 0.0002,
      "loss": 0.4809,
      "step": 1600
    },
    {
      "epoch": 0.1330349627378924,
      "grad_norm": 0.38926875591278076,
      "learning_rate": 0.0002,
      "loss": 0.4819,
      "step": 1620
    },
    {
      "epoch": 0.13467736968527377,
      "grad_norm": 0.45897549390792847,
      "learning_rate": 0.0002,
      "loss": 0.4619,
      "step": 1640
    },
    {
      "epoch": 0.13631977663265515,
      "grad_norm": 0.4487549364566803,
      "learning_rate": 0.0002,
      "loss": 0.4737,
      "step": 1660
    },
    {
      "epoch": 0.13796218358003653,
      "grad_norm": 0.36948007345199585,
      "learning_rate": 0.0002,
      "loss": 0.4576,
      "step": 1680
    },
    {
      "epoch": 0.13960459052741794,
      "grad_norm": 0.38834378123283386,
      "learning_rate": 0.0002,
      "loss": 0.4464,
      "step": 1700
    },
    {
      "epoch": 0.14124699747479932,
      "grad_norm": 0.5436655879020691,
      "learning_rate": 0.0002,
      "loss": 0.4616,
      "step": 1720
    },
    {
      "epoch": 0.1428894044221807,
      "grad_norm": 0.3576355278491974,
      "learning_rate": 0.0002,
      "loss": 0.4669,
      "step": 1740
    },
    {
      "epoch": 0.14453181136956209,
      "grad_norm": 0.4736698269844055,
      "learning_rate": 0.0002,
      "loss": 0.4788,
      "step": 1760
    },
    {
      "epoch": 0.1461742183169435,
      "grad_norm": 0.4074772596359253,
      "learning_rate": 0.0002,
      "loss": 0.4214,
      "step": 1780
    },
    {
      "epoch": 0.14781662526432487,
      "grad_norm": 0.4454910457134247,
      "learning_rate": 0.0002,
      "loss": 0.4407,
      "step": 1800
    },
    {
      "epoch": 0.14945903221170626,
      "grad_norm": 0.4039610028266907,
      "learning_rate": 0.0002,
      "loss": 0.4585,
      "step": 1820
    },
    {
      "epoch": 0.15110143915908764,
      "grad_norm": 0.4431604743003845,
      "learning_rate": 0.0002,
      "loss": 0.4483,
      "step": 1840
    },
    {
      "epoch": 0.15274384610646902,
      "grad_norm": 0.4190782606601715,
      "learning_rate": 0.0002,
      "loss": 0.4516,
      "step": 1860
    },
    {
      "epoch": 0.15438625305385043,
      "grad_norm": 0.2951456606388092,
      "learning_rate": 0.0002,
      "loss": 0.4584,
      "step": 1880
    },
    {
      "epoch": 0.1560286600012318,
      "grad_norm": 0.4400006830692291,
      "learning_rate": 0.0002,
      "loss": 0.4533,
      "step": 1900
    },
    {
      "epoch": 0.1576710669486132,
      "grad_norm": 0.3839446008205414,
      "learning_rate": 0.0002,
      "loss": 0.4489,
      "step": 1920
    },
    {
      "epoch": 0.15931347389599457,
      "grad_norm": 0.41484808921813965,
      "learning_rate": 0.0002,
      "loss": 0.422,
      "step": 1940
    },
    {
      "epoch": 0.16095588084337598,
      "grad_norm": 0.5211725831031799,
      "learning_rate": 0.0002,
      "loss": 0.4379,
      "step": 1960
    },
    {
      "epoch": 0.16259828779075736,
      "grad_norm": 0.3866327106952667,
      "learning_rate": 0.0002,
      "loss": 0.4279,
      "step": 1980
    },
    {
      "epoch": 0.16424069473813874,
      "grad_norm": 0.3327186107635498,
      "learning_rate": 0.0002,
      "loss": 0.417,
      "step": 2000
    },
    {
      "epoch": 0.16588310168552012,
      "grad_norm": 0.46427205204963684,
      "learning_rate": 0.0002,
      "loss": 0.4411,
      "step": 2020
    },
    {
      "epoch": 0.16752550863290153,
      "grad_norm": 0.4826524257659912,
      "learning_rate": 0.0002,
      "loss": 0.4359,
      "step": 2040
    },
    {
      "epoch": 0.1691679155802829,
      "grad_norm": 0.4641328454017639,
      "learning_rate": 0.0002,
      "loss": 0.4691,
      "step": 2060
    },
    {
      "epoch": 0.1708103225276643,
      "grad_norm": 0.525749683380127,
      "learning_rate": 0.0002,
      "loss": 0.4297,
      "step": 2080
    },
    {
      "epoch": 0.17245272947504567,
      "grad_norm": 0.45604804158210754,
      "learning_rate": 0.0002,
      "loss": 0.4411,
      "step": 2100
    },
    {
      "epoch": 0.17409513642242708,
      "grad_norm": 0.3894326984882355,
      "learning_rate": 0.0002,
      "loss": 0.4098,
      "step": 2120
    },
    {
      "epoch": 0.17573754336980846,
      "grad_norm": 0.34401944279670715,
      "learning_rate": 0.0002,
      "loss": 0.406,
      "step": 2140
    },
    {
      "epoch": 0.17737995031718984,
      "grad_norm": 0.3576812148094177,
      "learning_rate": 0.0002,
      "loss": 0.4024,
      "step": 2160
    },
    {
      "epoch": 0.17902235726457122,
      "grad_norm": 0.4276871979236603,
      "learning_rate": 0.0002,
      "loss": 0.4085,
      "step": 2180
    },
    {
      "epoch": 0.18066476421195263,
      "grad_norm": 0.49007973074913025,
      "learning_rate": 0.0002,
      "loss": 0.4104,
      "step": 2200
    },
    {
      "epoch": 0.182307171159334,
      "grad_norm": 0.4573257267475128,
      "learning_rate": 0.0002,
      "loss": 0.4041,
      "step": 2220
    },
    {
      "epoch": 0.1839495781067154,
      "grad_norm": 0.4118468463420868,
      "learning_rate": 0.0002,
      "loss": 0.3984,
      "step": 2240
    },
    {
      "epoch": 0.18559198505409677,
      "grad_norm": 0.357284277677536,
      "learning_rate": 0.0002,
      "loss": 0.4212,
      "step": 2260
    },
    {
      "epoch": 0.18723439200147818,
      "grad_norm": 0.4252781867980957,
      "learning_rate": 0.0002,
      "loss": 0.3924,
      "step": 2280
    },
    {
      "epoch": 0.18887679894885956,
      "grad_norm": 0.40546557307243347,
      "learning_rate": 0.0002,
      "loss": 0.398,
      "step": 2300
    },
    {
      "epoch": 0.19051920589624094,
      "grad_norm": 0.4305673837661743,
      "learning_rate": 0.0002,
      "loss": 0.398,
      "step": 2320
    },
    {
      "epoch": 0.19216161284362232,
      "grad_norm": 0.40348726511001587,
      "learning_rate": 0.0002,
      "loss": 0.4031,
      "step": 2340
    },
    {
      "epoch": 0.19380401979100373,
      "grad_norm": 0.48159924149513245,
      "learning_rate": 0.0002,
      "loss": 0.3926,
      "step": 2360
    },
    {
      "epoch": 0.1954464267383851,
      "grad_norm": 0.5939348936080933,
      "learning_rate": 0.0002,
      "loss": 0.3963,
      "step": 2380
    },
    {
      "epoch": 0.1970888336857665,
      "grad_norm": 0.42593804001808167,
      "learning_rate": 0.0002,
      "loss": 0.3925,
      "step": 2400
    },
    {
      "epoch": 0.19873124063314787,
      "grad_norm": 0.515277624130249,
      "learning_rate": 0.0002,
      "loss": 0.3753,
      "step": 2420
    },
    {
      "epoch": 0.20037364758052928,
      "grad_norm": 0.43423864245414734,
      "learning_rate": 0.0002,
      "loss": 0.396,
      "step": 2440
    },
    {
      "epoch": 0.20201605452791066,
      "grad_norm": 0.3857817053794861,
      "learning_rate": 0.0002,
      "loss": 0.3834,
      "step": 2460
    },
    {
      "epoch": 0.20365846147529204,
      "grad_norm": 0.3945648670196533,
      "learning_rate": 0.0002,
      "loss": 0.3768,
      "step": 2480
    },
    {
      "epoch": 0.20530086842267342,
      "grad_norm": 0.46411946415901184,
      "learning_rate": 0.0002,
      "loss": 0.3852,
      "step": 2500
    },
    {
      "epoch": 0.2069432753700548,
      "grad_norm": 0.3779551684856415,
      "learning_rate": 0.0002,
      "loss": 0.3767,
      "step": 2520
    },
    {
      "epoch": 0.2085856823174362,
      "grad_norm": 0.4743368625640869,
      "learning_rate": 0.0002,
      "loss": 0.4253,
      "step": 2540
    },
    {
      "epoch": 0.2102280892648176,
      "grad_norm": 0.4278275668621063,
      "learning_rate": 0.0002,
      "loss": 0.3558,
      "step": 2560
    },
    {
      "epoch": 0.21187049621219897,
      "grad_norm": 0.42412903904914856,
      "learning_rate": 0.0002,
      "loss": 0.3934,
      "step": 2580
    },
    {
      "epoch": 0.21351290315958035,
      "grad_norm": 7.02437162399292,
      "learning_rate": 0.0002,
      "loss": 0.3972,
      "step": 2600
    },
    {
      "epoch": 0.21515531010696176,
      "grad_norm": 0.46447402238845825,
      "learning_rate": 0.0002,
      "loss": 0.3742,
      "step": 2620
    },
    {
      "epoch": 0.21679771705434314,
      "grad_norm": 0.4078330993652344,
      "learning_rate": 0.0002,
      "loss": 0.3954,
      "step": 2640
    },
    {
      "epoch": 0.21844012400172452,
      "grad_norm": 0.39751455187797546,
      "learning_rate": 0.0002,
      "loss": 0.36,
      "step": 2660
    },
    {
      "epoch": 0.2200825309491059,
      "grad_norm": 0.4075968265533447,
      "learning_rate": 0.0002,
      "loss": 0.3894,
      "step": 2680
    },
    {
      "epoch": 0.2217249378964873,
      "grad_norm": 0.39630162715911865,
      "learning_rate": 0.0002,
      "loss": 0.3748,
      "step": 2700
    },
    {
      "epoch": 0.2233673448438687,
      "grad_norm": 0.42885056138038635,
      "learning_rate": 0.0002,
      "loss": 0.3496,
      "step": 2720
    },
    {
      "epoch": 0.22500975179125007,
      "grad_norm": 0.4635525941848755,
      "learning_rate": 0.0002,
      "loss": 0.3494,
      "step": 2740
    },
    {
      "epoch": 0.22665215873863145,
      "grad_norm": 0.48458898067474365,
      "learning_rate": 0.0002,
      "loss": 0.387,
      "step": 2760
    },
    {
      "epoch": 0.22829456568601286,
      "grad_norm": 0.49742501974105835,
      "learning_rate": 0.0002,
      "loss": 0.3717,
      "step": 2780
    },
    {
      "epoch": 0.22993697263339424,
      "grad_norm": 0.4279645085334778,
      "learning_rate": 0.0002,
      "loss": 0.3537,
      "step": 2800
    },
    {
      "epoch": 0.23157937958077562,
      "grad_norm": 0.5221889615058899,
      "learning_rate": 0.0002,
      "loss": 0.3676,
      "step": 2820
    },
    {
      "epoch": 0.233221786528157,
      "grad_norm": 0.5390656590461731,
      "learning_rate": 0.0002,
      "loss": 0.3439,
      "step": 2840
    },
    {
      "epoch": 0.2348641934755384,
      "grad_norm": 0.4269630014896393,
      "learning_rate": 0.0002,
      "loss": 0.3663,
      "step": 2860
    },
    {
      "epoch": 0.2365066004229198,
      "grad_norm": 0.37411990761756897,
      "learning_rate": 0.0002,
      "loss": 0.3779,
      "step": 2880
    },
    {
      "epoch": 0.23814900737030117,
      "grad_norm": 0.3186222016811371,
      "learning_rate": 0.0002,
      "loss": 0.3513,
      "step": 2900
    },
    {
      "epoch": 0.23979141431768256,
      "grad_norm": 0.33270496129989624,
      "learning_rate": 0.0002,
      "loss": 0.3534,
      "step": 2920
    },
    {
      "epoch": 0.24143382126506396,
      "grad_norm": 0.4496273100376129,
      "learning_rate": 0.0002,
      "loss": 0.3588,
      "step": 2940
    },
    {
      "epoch": 0.24307622821244534,
      "grad_norm": 0.35411253571510315,
      "learning_rate": 0.0002,
      "loss": 0.3466,
      "step": 2960
    },
    {
      "epoch": 0.24471863515982673,
      "grad_norm": 0.4333256185054779,
      "learning_rate": 0.0002,
      "loss": 0.3555,
      "step": 2980
    },
    {
      "epoch": 0.2463610421072081,
      "grad_norm": 0.3264130651950836,
      "learning_rate": 0.0002,
      "loss": 0.3345,
      "step": 3000
    },
    {
      "epoch": 0.24800344905458951,
      "grad_norm": 0.3925504684448242,
      "learning_rate": 0.0002,
      "loss": 0.3559,
      "step": 3020
    },
    {
      "epoch": 0.2496458560019709,
      "grad_norm": 0.4186360836029053,
      "learning_rate": 0.0002,
      "loss": 0.3458,
      "step": 3040
    },
    {
      "epoch": 0.2512882629493523,
      "grad_norm": 0.4656223952770233,
      "learning_rate": 0.0002,
      "loss": 0.349,
      "step": 3060
    },
    {
      "epoch": 0.2529306698967337,
      "grad_norm": 0.4535064399242401,
      "learning_rate": 0.0002,
      "loss": 0.3474,
      "step": 3080
    },
    {
      "epoch": 0.25457307684411506,
      "grad_norm": 0.37564146518707275,
      "learning_rate": 0.0002,
      "loss": 0.3454,
      "step": 3100
    },
    {
      "epoch": 0.25621548379149645,
      "grad_norm": 0.36363497376441956,
      "learning_rate": 0.0002,
      "loss": 0.3515,
      "step": 3120
    },
    {
      "epoch": 0.2578578907388778,
      "grad_norm": 0.380750447511673,
      "learning_rate": 0.0002,
      "loss": 0.3653,
      "step": 3140
    },
    {
      "epoch": 0.2595002976862592,
      "grad_norm": 0.3188472092151642,
      "learning_rate": 0.0002,
      "loss": 0.3596,
      "step": 3160
    },
    {
      "epoch": 0.2611427046336406,
      "grad_norm": 0.4478905200958252,
      "learning_rate": 0.0002,
      "loss": 0.3567,
      "step": 3180
    },
    {
      "epoch": 0.26278511158102197,
      "grad_norm": 0.4925800859928131,
      "learning_rate": 0.0002,
      "loss": 0.3466,
      "step": 3200
    },
    {
      "epoch": 0.26442751852840335,
      "grad_norm": 0.3702840209007263,
      "learning_rate": 0.0002,
      "loss": 0.3327,
      "step": 3220
    },
    {
      "epoch": 0.2660699254757848,
      "grad_norm": 0.35024309158325195,
      "learning_rate": 0.0002,
      "loss": 0.3524,
      "step": 3240
    },
    {
      "epoch": 0.26771233242316617,
      "grad_norm": 0.4079764783382416,
      "learning_rate": 0.0002,
      "loss": 0.338,
      "step": 3260
    },
    {
      "epoch": 0.26935473937054755,
      "grad_norm": 0.4466266632080078,
      "learning_rate": 0.0002,
      "loss": 0.3465,
      "step": 3280
    },
    {
      "epoch": 0.2709971463179289,
      "grad_norm": 0.4438311457633972,
      "learning_rate": 0.0002,
      "loss": 0.3396,
      "step": 3300
    },
    {
      "epoch": 0.2726395532653103,
      "grad_norm": 0.37101468443870544,
      "learning_rate": 0.0002,
      "loss": 0.3392,
      "step": 3320
    },
    {
      "epoch": 0.2742819602126917,
      "grad_norm": 0.41411712765693665,
      "learning_rate": 0.0002,
      "loss": 0.3341,
      "step": 3340
    },
    {
      "epoch": 0.27592436716007307,
      "grad_norm": 0.47411611676216125,
      "learning_rate": 0.0002,
      "loss": 0.3355,
      "step": 3360
    },
    {
      "epoch": 0.27756677410745445,
      "grad_norm": 0.4871801733970642,
      "learning_rate": 0.0002,
      "loss": 0.3627,
      "step": 3380
    },
    {
      "epoch": 0.2792091810548359,
      "grad_norm": 0.47128844261169434,
      "learning_rate": 0.0002,
      "loss": 0.324,
      "step": 3400
    },
    {
      "epoch": 0.28085158800221727,
      "grad_norm": 0.4556843042373657,
      "learning_rate": 0.0002,
      "loss": 0.3443,
      "step": 3420
    },
    {
      "epoch": 0.28249399494959865,
      "grad_norm": 0.3775945007801056,
      "learning_rate": 0.0002,
      "loss": 0.3401,
      "step": 3440
    },
    {
      "epoch": 0.28413640189698003,
      "grad_norm": 0.377316415309906,
      "learning_rate": 0.0002,
      "loss": 0.3478,
      "step": 3460
    },
    {
      "epoch": 0.2857788088443614,
      "grad_norm": 0.336944580078125,
      "learning_rate": 0.0002,
      "loss": 0.3382,
      "step": 3480
    },
    {
      "epoch": 0.2874212157917428,
      "grad_norm": 0.4296940863132477,
      "learning_rate": 0.0002,
      "loss": 0.3361,
      "step": 3500
    },
    {
      "epoch": 0.28906362273912417,
      "grad_norm": 0.4638020396232605,
      "learning_rate": 0.0002,
      "loss": 0.3583,
      "step": 3520
    },
    {
      "epoch": 0.29070602968650555,
      "grad_norm": 0.4074634313583374,
      "learning_rate": 0.0002,
      "loss": 0.3601,
      "step": 3540
    },
    {
      "epoch": 0.292348436633887,
      "grad_norm": 0.3634164035320282,
      "learning_rate": 0.0002,
      "loss": 0.3216,
      "step": 3560
    },
    {
      "epoch": 0.29399084358126837,
      "grad_norm": 0.43480202555656433,
      "learning_rate": 0.0002,
      "loss": 0.33,
      "step": 3580
    },
    {
      "epoch": 0.29563325052864975,
      "grad_norm": 0.42778658866882324,
      "learning_rate": 0.0002,
      "loss": 0.3408,
      "step": 3600
    },
    {
      "epoch": 0.29727565747603113,
      "grad_norm": 0.3778844177722931,
      "learning_rate": 0.0002,
      "loss": 0.3309,
      "step": 3620
    },
    {
      "epoch": 0.2989180644234125,
      "grad_norm": 0.33491814136505127,
      "learning_rate": 0.0002,
      "loss": 0.3011,
      "step": 3640
    },
    {
      "epoch": 0.3005604713707939,
      "grad_norm": 0.5079118609428406,
      "learning_rate": 0.0002,
      "loss": 0.3079,
      "step": 3660
    },
    {
      "epoch": 0.30220287831817527,
      "grad_norm": 0.3751799166202545,
      "learning_rate": 0.0002,
      "loss": 0.3286,
      "step": 3680
    },
    {
      "epoch": 0.30384528526555665,
      "grad_norm": 0.4447515904903412,
      "learning_rate": 0.0002,
      "loss": 0.2991,
      "step": 3700
    },
    {
      "epoch": 0.30548769221293803,
      "grad_norm": 0.33741819858551025,
      "learning_rate": 0.0002,
      "loss": 0.3169,
      "step": 3720
    },
    {
      "epoch": 0.30713009916031947,
      "grad_norm": 0.3624327480792999,
      "learning_rate": 0.0002,
      "loss": 0.3213,
      "step": 3740
    },
    {
      "epoch": 0.30877250610770085,
      "grad_norm": 0.5299442410469055,
      "learning_rate": 0.0002,
      "loss": 0.3476,
      "step": 3760
    },
    {
      "epoch": 0.31041491305508223,
      "grad_norm": 0.3178050220012665,
      "learning_rate": 0.0002,
      "loss": 0.329,
      "step": 3780
    },
    {
      "epoch": 0.3120573200024636,
      "grad_norm": 0.3178127408027649,
      "learning_rate": 0.0002,
      "loss": 0.3046,
      "step": 3800
    },
    {
      "epoch": 0.313699726949845,
      "grad_norm": 0.4366089403629303,
      "learning_rate": 0.0002,
      "loss": 0.3179,
      "step": 3820
    },
    {
      "epoch": 0.3153421338972264,
      "grad_norm": 0.47534024715423584,
      "learning_rate": 0.0002,
      "loss": 0.3377,
      "step": 3840
    },
    {
      "epoch": 0.31698454084460775,
      "grad_norm": 0.4247181713581085,
      "learning_rate": 0.0002,
      "loss": 0.311,
      "step": 3860
    },
    {
      "epoch": 0.31862694779198913,
      "grad_norm": 0.5085952877998352,
      "learning_rate": 0.0002,
      "loss": 0.3197,
      "step": 3880
    },
    {
      "epoch": 0.32026935473937057,
      "grad_norm": 0.3649958372116089,
      "learning_rate": 0.0002,
      "loss": 0.3243,
      "step": 3900
    },
    {
      "epoch": 0.32191176168675195,
      "grad_norm": 0.43816304206848145,
      "learning_rate": 0.0002,
      "loss": 0.3232,
      "step": 3920
    },
    {
      "epoch": 0.32355416863413333,
      "grad_norm": 0.32603034377098083,
      "learning_rate": 0.0002,
      "loss": 0.3155,
      "step": 3940
    },
    {
      "epoch": 0.3251965755815147,
      "grad_norm": 0.4867421090602875,
      "learning_rate": 0.0002,
      "loss": 0.3102,
      "step": 3960
    },
    {
      "epoch": 0.3268389825288961,
      "grad_norm": 0.3843926191329956,
      "learning_rate": 0.0002,
      "loss": 0.3035,
      "step": 3980
    },
    {
      "epoch": 0.3284813894762775,
      "grad_norm": 0.49313676357269287,
      "learning_rate": 0.0002,
      "loss": 0.322,
      "step": 4000
    },
    {
      "epoch": 0.33012379642365886,
      "grad_norm": 0.4102085530757904,
      "learning_rate": 0.0002,
      "loss": 0.3206,
      "step": 4020
    },
    {
      "epoch": 0.33176620337104024,
      "grad_norm": 0.47901496291160583,
      "learning_rate": 0.0002,
      "loss": 0.3131,
      "step": 4040
    },
    {
      "epoch": 0.33340861031842167,
      "grad_norm": 0.40674644708633423,
      "learning_rate": 0.0002,
      "loss": 0.3091,
      "step": 4060
    },
    {
      "epoch": 0.33505101726580305,
      "grad_norm": 0.44038107991218567,
      "learning_rate": 0.0002,
      "loss": 0.3116,
      "step": 4080
    },
    {
      "epoch": 0.33669342421318443,
      "grad_norm": 0.3919316828250885,
      "learning_rate": 0.0002,
      "loss": 0.3077,
      "step": 4100
    },
    {
      "epoch": 0.3383358311605658,
      "grad_norm": 0.38622769713401794,
      "learning_rate": 0.0002,
      "loss": 0.302,
      "step": 4120
    },
    {
      "epoch": 0.3399782381079472,
      "grad_norm": 0.4685916602611542,
      "learning_rate": 0.0002,
      "loss": 0.3234,
      "step": 4140
    },
    {
      "epoch": 0.3416206450553286,
      "grad_norm": 0.3348797559738159,
      "learning_rate": 0.0002,
      "loss": 0.3205,
      "step": 4160
    },
    {
      "epoch": 0.34326305200270996,
      "grad_norm": 0.4265504777431488,
      "learning_rate": 0.0002,
      "loss": 0.3101,
      "step": 4180
    },
    {
      "epoch": 0.34490545895009134,
      "grad_norm": 0.4005930423736572,
      "learning_rate": 0.0002,
      "loss": 0.3096,
      "step": 4200
    },
    {
      "epoch": 0.3465478658974728,
      "grad_norm": 0.4154227674007416,
      "learning_rate": 0.0002,
      "loss": 0.3188,
      "step": 4220
    },
    {
      "epoch": 0.34819027284485415,
      "grad_norm": 0.30359068512916565,
      "learning_rate": 0.0002,
      "loss": 0.2966,
      "step": 4240
    },
    {
      "epoch": 0.34983267979223553,
      "grad_norm": 0.35363709926605225,
      "learning_rate": 0.0002,
      "loss": 0.3189,
      "step": 4260
    },
    {
      "epoch": 0.3514750867396169,
      "grad_norm": 0.43156126141548157,
      "learning_rate": 0.0002,
      "loss": 0.2951,
      "step": 4280
    },
    {
      "epoch": 0.3531174936869983,
      "grad_norm": 0.4593096077442169,
      "learning_rate": 0.0002,
      "loss": 0.3048,
      "step": 4300
    },
    {
      "epoch": 0.3547599006343797,
      "grad_norm": 0.49352073669433594,
      "learning_rate": 0.0002,
      "loss": 0.301,
      "step": 4320
    },
    {
      "epoch": 0.35640230758176106,
      "grad_norm": 0.4053367078304291,
      "learning_rate": 0.0002,
      "loss": 0.311,
      "step": 4340
    },
    {
      "epoch": 0.35804471452914244,
      "grad_norm": 0.3465437889099121,
      "learning_rate": 0.0002,
      "loss": 0.3186,
      "step": 4360
    },
    {
      "epoch": 0.3596871214765238,
      "grad_norm": 0.4525587558746338,
      "learning_rate": 0.0002,
      "loss": 0.3126,
      "step": 4380
    },
    {
      "epoch": 0.36132952842390526,
      "grad_norm": 0.4213342070579529,
      "learning_rate": 0.0002,
      "loss": 0.3041,
      "step": 4400
    },
    {
      "epoch": 0.36297193537128664,
      "grad_norm": 0.37421244382858276,
      "learning_rate": 0.0002,
      "loss": 0.3295,
      "step": 4420
    },
    {
      "epoch": 0.364614342318668,
      "grad_norm": 0.4033282697200775,
      "learning_rate": 0.0002,
      "loss": 0.3031,
      "step": 4440
    },
    {
      "epoch": 0.3662567492660494,
      "grad_norm": 0.45873841643333435,
      "learning_rate": 0.0002,
      "loss": 0.2819,
      "step": 4460
    },
    {
      "epoch": 0.3678991562134308,
      "grad_norm": 0.36195841431617737,
      "learning_rate": 0.0002,
      "loss": 0.2908,
      "step": 4480
    },
    {
      "epoch": 0.36954156316081216,
      "grad_norm": 0.39707615971565247,
      "learning_rate": 0.0002,
      "loss": 0.3023,
      "step": 4500
    },
    {
      "epoch": 0.37118397010819354,
      "grad_norm": 0.3999727666378021,
      "learning_rate": 0.0002,
      "loss": 0.31,
      "step": 4520
    },
    {
      "epoch": 0.3728263770555749,
      "grad_norm": 0.36880913376808167,
      "learning_rate": 0.0002,
      "loss": 0.3017,
      "step": 4540
    },
    {
      "epoch": 0.37446878400295636,
      "grad_norm": 0.36656180024147034,
      "learning_rate": 0.0002,
      "loss": 0.3129,
      "step": 4560
    },
    {
      "epoch": 0.37611119095033774,
      "grad_norm": 0.4566299021244049,
      "learning_rate": 0.0002,
      "loss": 0.3039,
      "step": 4580
    },
    {
      "epoch": 0.3777535978977191,
      "grad_norm": 0.3202304542064667,
      "learning_rate": 0.0002,
      "loss": 0.2827,
      "step": 4600
    },
    {
      "epoch": 0.3793960048451005,
      "grad_norm": 0.4553089439868927,
      "learning_rate": 0.0002,
      "loss": 0.3401,
      "step": 4620
    },
    {
      "epoch": 0.3810384117924819,
      "grad_norm": 0.40536269545555115,
      "learning_rate": 0.0002,
      "loss": 0.3038,
      "step": 4640
    },
    {
      "epoch": 0.38268081873986326,
      "grad_norm": 0.36675453186035156,
      "learning_rate": 0.0002,
      "loss": 0.3198,
      "step": 4660
    },
    {
      "epoch": 0.38432322568724464,
      "grad_norm": 0.41660359501838684,
      "learning_rate": 0.0002,
      "loss": 0.2904,
      "step": 4680
    },
    {
      "epoch": 0.385965632634626,
      "grad_norm": 0.2889881134033203,
      "learning_rate": 0.0002,
      "loss": 0.3076,
      "step": 4700
    },
    {
      "epoch": 0.38760803958200746,
      "grad_norm": 0.3077252507209778,
      "learning_rate": 0.0002,
      "loss": 0.3087,
      "step": 4720
    },
    {
      "epoch": 0.38925044652938884,
      "grad_norm": 0.43053752183914185,
      "learning_rate": 0.0002,
      "loss": 0.2994,
      "step": 4740
    },
    {
      "epoch": 0.3908928534767702,
      "grad_norm": 0.39978402853012085,
      "learning_rate": 0.0002,
      "loss": 0.2825,
      "step": 4760
    },
    {
      "epoch": 0.3925352604241516,
      "grad_norm": 0.39721283316612244,
      "learning_rate": 0.0002,
      "loss": 0.3002,
      "step": 4780
    },
    {
      "epoch": 0.394177667371533,
      "grad_norm": 0.4234716296195984,
      "learning_rate": 0.0002,
      "loss": 0.281,
      "step": 4800
    },
    {
      "epoch": 0.39582007431891436,
      "grad_norm": 0.41390299797058105,
      "learning_rate": 0.0002,
      "loss": 0.3015,
      "step": 4820
    },
    {
      "epoch": 0.39746248126629574,
      "grad_norm": 0.8412930369377136,
      "learning_rate": 0.0002,
      "loss": 0.3034,
      "step": 4840
    },
    {
      "epoch": 0.3991048882136771,
      "grad_norm": 0.4165583848953247,
      "learning_rate": 0.0002,
      "loss": 0.2844,
      "step": 4860
    },
    {
      "epoch": 0.40074729516105856,
      "grad_norm": 0.4212113618850708,
      "learning_rate": 0.0002,
      "loss": 0.2847,
      "step": 4880
    },
    {
      "epoch": 0.40238970210843994,
      "grad_norm": 0.46880143880844116,
      "learning_rate": 0.0002,
      "loss": 0.2877,
      "step": 4900
    },
    {
      "epoch": 0.4040321090558213,
      "grad_norm": 0.33470281958580017,
      "learning_rate": 0.0002,
      "loss": 0.3006,
      "step": 4920
    },
    {
      "epoch": 0.4056745160032027,
      "grad_norm": 0.41939905285835266,
      "learning_rate": 0.0002,
      "loss": 0.3014,
      "step": 4940
    },
    {
      "epoch": 0.4073169229505841,
      "grad_norm": 0.4031718671321869,
      "learning_rate": 0.0002,
      "loss": 0.2959,
      "step": 4960
    },
    {
      "epoch": 0.40895932989796546,
      "grad_norm": 0.3611488938331604,
      "learning_rate": 0.0002,
      "loss": 0.3175,
      "step": 4980
    },
    {
      "epoch": 0.41060173684534684,
      "grad_norm": 0.38445645570755005,
      "learning_rate": 0.0002,
      "loss": 0.2897,
      "step": 5000
    },
    {
      "epoch": 0.4122441437927282,
      "grad_norm": 0.3903651833534241,
      "learning_rate": 0.0002,
      "loss": 0.2716,
      "step": 5020
    },
    {
      "epoch": 0.4138865507401096,
      "grad_norm": 0.39842015504837036,
      "learning_rate": 0.0002,
      "loss": 0.2987,
      "step": 5040
    },
    {
      "epoch": 0.41552895768749104,
      "grad_norm": 0.4211498200893402,
      "learning_rate": 0.0002,
      "loss": 0.3027,
      "step": 5060
    },
    {
      "epoch": 0.4171713646348724,
      "grad_norm": 0.4767220914363861,
      "learning_rate": 0.0002,
      "loss": 0.2897,
      "step": 5080
    },
    {
      "epoch": 0.4188137715822538,
      "grad_norm": 0.4871378242969513,
      "learning_rate": 0.0002,
      "loss": 0.2874,
      "step": 5100
    },
    {
      "epoch": 0.4204561785296352,
      "grad_norm": 0.3960734009742737,
      "learning_rate": 0.0002,
      "loss": 0.2903,
      "step": 5120
    },
    {
      "epoch": 0.42209858547701656,
      "grad_norm": 0.3350552022457123,
      "learning_rate": 0.0002,
      "loss": 0.2835,
      "step": 5140
    },
    {
      "epoch": 0.42374099242439794,
      "grad_norm": 0.34975695610046387,
      "learning_rate": 0.0002,
      "loss": 0.3025,
      "step": 5160
    },
    {
      "epoch": 0.4253833993717793,
      "grad_norm": 0.3886794149875641,
      "learning_rate": 0.0002,
      "loss": 0.289,
      "step": 5180
    },
    {
      "epoch": 0.4270258063191607,
      "grad_norm": 0.4114588797092438,
      "learning_rate": 0.0002,
      "loss": 0.2802,
      "step": 5200
    },
    {
      "epoch": 0.42866821326654214,
      "grad_norm": 0.4368172585964203,
      "learning_rate": 0.0002,
      "loss": 0.2918,
      "step": 5220
    },
    {
      "epoch": 0.4303106202139235,
      "grad_norm": 0.2889314889907837,
      "learning_rate": 0.0002,
      "loss": 0.2854,
      "step": 5240
    },
    {
      "epoch": 0.4319530271613049,
      "grad_norm": 0.3999134600162506,
      "learning_rate": 0.0002,
      "loss": 0.2955,
      "step": 5260
    },
    {
      "epoch": 0.4335954341086863,
      "grad_norm": 0.32143938541412354,
      "learning_rate": 0.0002,
      "loss": 0.2836,
      "step": 5280
    },
    {
      "epoch": 0.43523784105606766,
      "grad_norm": 0.4069638252258301,
      "learning_rate": 0.0002,
      "loss": 0.2854,
      "step": 5300
    },
    {
      "epoch": 0.43688024800344905,
      "grad_norm": 0.46609416604042053,
      "learning_rate": 0.0002,
      "loss": 0.2777,
      "step": 5320
    },
    {
      "epoch": 0.4385226549508304,
      "grad_norm": 0.35112160444259644,
      "learning_rate": 0.0002,
      "loss": 0.2896,
      "step": 5340
    },
    {
      "epoch": 0.4401650618982118,
      "grad_norm": 0.4243420660495758,
      "learning_rate": 0.0002,
      "loss": 0.2743,
      "step": 5360
    },
    {
      "epoch": 0.44180746884559324,
      "grad_norm": 0.45615971088409424,
      "learning_rate": 0.0002,
      "loss": 0.2699,
      "step": 5380
    },
    {
      "epoch": 0.4434498757929746,
      "grad_norm": 0.4836295247077942,
      "learning_rate": 0.0002,
      "loss": 0.2932,
      "step": 5400
    },
    {
      "epoch": 0.445092282740356,
      "grad_norm": 0.41774359345436096,
      "learning_rate": 0.0002,
      "loss": 0.2869,
      "step": 5420
    },
    {
      "epoch": 0.4467346896877374,
      "grad_norm": 0.3904239535331726,
      "learning_rate": 0.0002,
      "loss": 0.2798,
      "step": 5440
    },
    {
      "epoch": 0.44837709663511877,
      "grad_norm": 0.3867247700691223,
      "learning_rate": 0.0002,
      "loss": 0.2668,
      "step": 5460
    },
    {
      "epoch": 0.45001950358250015,
      "grad_norm": 0.33975329995155334,
      "learning_rate": 0.0002,
      "loss": 0.2805,
      "step": 5480
    },
    {
      "epoch": 0.4516619105298815,
      "grad_norm": 0.30403727293014526,
      "learning_rate": 0.0002,
      "loss": 0.2747,
      "step": 5500
    },
    {
      "epoch": 0.4533043174772629,
      "grad_norm": 0.4227672219276428,
      "learning_rate": 0.0002,
      "loss": 0.2699,
      "step": 5520
    },
    {
      "epoch": 0.4549467244246443,
      "grad_norm": 0.38823801279067993,
      "learning_rate": 0.0002,
      "loss": 0.256,
      "step": 5540
    },
    {
      "epoch": 0.4565891313720257,
      "grad_norm": 0.3460341691970825,
      "learning_rate": 0.0002,
      "loss": 0.2768,
      "step": 5560
    },
    {
      "epoch": 0.4582315383194071,
      "grad_norm": 0.40843436121940613,
      "learning_rate": 0.0002,
      "loss": 0.2829,
      "step": 5580
    },
    {
      "epoch": 0.4598739452667885,
      "grad_norm": 0.411004900932312,
      "learning_rate": 0.0002,
      "loss": 0.2849,
      "step": 5600
    },
    {
      "epoch": 0.46151635221416987,
      "grad_norm": 0.5354210138320923,
      "learning_rate": 0.0002,
      "loss": 0.298,
      "step": 5620
    },
    {
      "epoch": 0.46315875916155125,
      "grad_norm": 0.3296845555305481,
      "learning_rate": 0.0002,
      "loss": 0.2571,
      "step": 5640
    },
    {
      "epoch": 0.46480116610893263,
      "grad_norm": 0.404950350522995,
      "learning_rate": 0.0002,
      "loss": 0.2843,
      "step": 5660
    },
    {
      "epoch": 0.466443573056314,
      "grad_norm": 0.3697005808353424,
      "learning_rate": 0.0002,
      "loss": 0.2655,
      "step": 5680
    },
    {
      "epoch": 0.4680859800036954,
      "grad_norm": 0.3465549945831299,
      "learning_rate": 0.0002,
      "loss": 0.282,
      "step": 5700
    },
    {
      "epoch": 0.4697283869510768,
      "grad_norm": 0.4802212119102478,
      "learning_rate": 0.0002,
      "loss": 0.2672,
      "step": 5720
    },
    {
      "epoch": 0.4713707938984582,
      "grad_norm": 0.3909721076488495,
      "learning_rate": 0.0002,
      "loss": 0.2704,
      "step": 5740
    },
    {
      "epoch": 0.4730132008458396,
      "grad_norm": 0.41303369402885437,
      "learning_rate": 0.0002,
      "loss": 0.2797,
      "step": 5760
    },
    {
      "epoch": 0.47465560779322097,
      "grad_norm": 0.32934170961380005,
      "learning_rate": 0.0002,
      "loss": 0.2903,
      "step": 5780
    },
    {
      "epoch": 0.47629801474060235,
      "grad_norm": 0.375072181224823,
      "learning_rate": 0.0002,
      "loss": 0.2752,
      "step": 5800
    },
    {
      "epoch": 0.47794042168798373,
      "grad_norm": 0.35390418767929077,
      "learning_rate": 0.0002,
      "loss": 0.2755,
      "step": 5820
    },
    {
      "epoch": 0.4795828286353651,
      "grad_norm": 0.3856378197669983,
      "learning_rate": 0.0002,
      "loss": 0.2699,
      "step": 5840
    },
    {
      "epoch": 0.4812252355827465,
      "grad_norm": 0.2624310851097107,
      "learning_rate": 0.0002,
      "loss": 0.2654,
      "step": 5860
    },
    {
      "epoch": 0.4828676425301279,
      "grad_norm": 0.43709930777549744,
      "learning_rate": 0.0002,
      "loss": 0.2768,
      "step": 5880
    },
    {
      "epoch": 0.4845100494775093,
      "grad_norm": 0.3971209228038788,
      "learning_rate": 0.0002,
      "loss": 0.2728,
      "step": 5900
    },
    {
      "epoch": 0.4861524564248907,
      "grad_norm": 0.3937450647354126,
      "learning_rate": 0.0002,
      "loss": 0.2836,
      "step": 5920
    },
    {
      "epoch": 0.48779486337227207,
      "grad_norm": 0.3925333023071289,
      "learning_rate": 0.0002,
      "loss": 0.2653,
      "step": 5940
    },
    {
      "epoch": 0.48943727031965345,
      "grad_norm": 0.3056396245956421,
      "learning_rate": 0.0002,
      "loss": 0.2593,
      "step": 5960
    },
    {
      "epoch": 0.49107967726703483,
      "grad_norm": 0.349110871553421,
      "learning_rate": 0.0002,
      "loss": 0.2872,
      "step": 5980
    },
    {
      "epoch": 0.4927220842144162,
      "grad_norm": 0.37678685784339905,
      "learning_rate": 0.0002,
      "loss": 0.2779,
      "step": 6000
    },
    {
      "epoch": 0.4943644911617976,
      "grad_norm": 0.37364938855171204,
      "learning_rate": 0.0002,
      "loss": 0.2612,
      "step": 6020
    },
    {
      "epoch": 0.49600689810917903,
      "grad_norm": 0.3885985016822815,
      "learning_rate": 0.0002,
      "loss": 0.2701,
      "step": 6040
    },
    {
      "epoch": 0.4976493050565604,
      "grad_norm": 0.4726998507976532,
      "learning_rate": 0.0002,
      "loss": 0.258,
      "step": 6060
    },
    {
      "epoch": 0.4992917120039418,
      "grad_norm": 0.3752720355987549,
      "learning_rate": 0.0002,
      "loss": 0.2873,
      "step": 6080
    },
    {
      "epoch": 0.5009341189513231,
      "grad_norm": 0.5174003839492798,
      "learning_rate": 0.0002,
      "loss": 0.2677,
      "step": 6100
    },
    {
      "epoch": 0.5025765258987046,
      "grad_norm": 0.39343810081481934,
      "learning_rate": 0.0002,
      "loss": 0.2498,
      "step": 6120
    },
    {
      "epoch": 0.504218932846086,
      "grad_norm": 0.3367049992084503,
      "learning_rate": 0.0002,
      "loss": 0.2555,
      "step": 6140
    },
    {
      "epoch": 0.5058613397934674,
      "grad_norm": 0.3384205400943756,
      "learning_rate": 0.0002,
      "loss": 0.2865,
      "step": 6160
    },
    {
      "epoch": 0.5075037467408487,
      "grad_norm": 0.37642723321914673,
      "learning_rate": 0.0002,
      "loss": 0.2677,
      "step": 6180
    },
    {
      "epoch": 0.5091461536882301,
      "grad_norm": 0.31989771127700806,
      "learning_rate": 0.0002,
      "loss": 0.2675,
      "step": 6200
    },
    {
      "epoch": 0.5107885606356115,
      "grad_norm": 0.30809977650642395,
      "learning_rate": 0.0002,
      "loss": 0.2562,
      "step": 6220
    },
    {
      "epoch": 0.5124309675829929,
      "grad_norm": 0.3463954031467438,
      "learning_rate": 0.0002,
      "loss": 0.2576,
      "step": 6240
    },
    {
      "epoch": 0.5140733745303743,
      "grad_norm": 0.3789072036743164,
      "learning_rate": 0.0002,
      "loss": 0.2679,
      "step": 6260
    },
    {
      "epoch": 0.5157157814777557,
      "grad_norm": 0.458978533744812,
      "learning_rate": 0.0002,
      "loss": 0.2596,
      "step": 6280
    },
    {
      "epoch": 0.517358188425137,
      "grad_norm": 0.3515280783176422,
      "learning_rate": 0.0002,
      "loss": 0.2629,
      "step": 6300
    },
    {
      "epoch": 0.5190005953725184,
      "grad_norm": 0.42611977458000183,
      "learning_rate": 0.0002,
      "loss": 0.2674,
      "step": 6320
    },
    {
      "epoch": 0.5206430023198998,
      "grad_norm": 0.3865070641040802,
      "learning_rate": 0.0002,
      "loss": 0.2714,
      "step": 6340
    },
    {
      "epoch": 0.5222854092672812,
      "grad_norm": 0.3559401333332062,
      "learning_rate": 0.0002,
      "loss": 0.2751,
      "step": 6360
    },
    {
      "epoch": 0.5239278162146626,
      "grad_norm": 0.3181537389755249,
      "learning_rate": 0.0002,
      "loss": 0.2724,
      "step": 6380
    },
    {
      "epoch": 0.5255702231620439,
      "grad_norm": 0.37673598527908325,
      "learning_rate": 0.0002,
      "loss": 0.2711,
      "step": 6400
    },
    {
      "epoch": 0.5272126301094253,
      "grad_norm": 0.44122573733329773,
      "learning_rate": 0.0002,
      "loss": 0.2617,
      "step": 6420
    },
    {
      "epoch": 0.5288550370568067,
      "grad_norm": 0.4779141843318939,
      "learning_rate": 0.0002,
      "loss": 0.2602,
      "step": 6440
    },
    {
      "epoch": 0.5304974440041882,
      "grad_norm": 0.3975127339363098,
      "learning_rate": 0.0002,
      "loss": 0.2472,
      "step": 6460
    },
    {
      "epoch": 0.5321398509515696,
      "grad_norm": 0.3808406591415405,
      "learning_rate": 0.0002,
      "loss": 0.2623,
      "step": 6480
    },
    {
      "epoch": 0.533782257898951,
      "grad_norm": 0.340666264295578,
      "learning_rate": 0.0002,
      "loss": 0.2806,
      "step": 6500
    },
    {
      "epoch": 0.5354246648463323,
      "grad_norm": 0.41233885288238525,
      "learning_rate": 0.0002,
      "loss": 0.2458,
      "step": 6520
    },
    {
      "epoch": 0.5370670717937137,
      "grad_norm": 0.28576114773750305,
      "learning_rate": 0.0002,
      "loss": 0.2638,
      "step": 6540
    },
    {
      "epoch": 0.5387094787410951,
      "grad_norm": 0.4704492688179016,
      "learning_rate": 0.0002,
      "loss": 0.2735,
      "step": 6560
    },
    {
      "epoch": 0.5403518856884765,
      "grad_norm": 0.43339604139328003,
      "learning_rate": 0.0002,
      "loss": 0.2667,
      "step": 6580
    },
    {
      "epoch": 0.5419942926358579,
      "grad_norm": 0.332878440618515,
      "learning_rate": 0.0002,
      "loss": 0.2513,
      "step": 6600
    },
    {
      "epoch": 0.5436366995832392,
      "grad_norm": 0.34620800614356995,
      "learning_rate": 0.0002,
      "loss": 0.2768,
      "step": 6620
    },
    {
      "epoch": 0.5452791065306206,
      "grad_norm": 0.46673691272735596,
      "learning_rate": 0.0002,
      "loss": 0.2597,
      "step": 6640
    },
    {
      "epoch": 0.546921513478002,
      "grad_norm": 0.36888402700424194,
      "learning_rate": 0.0002,
      "loss": 0.2453,
      "step": 6660
    },
    {
      "epoch": 0.5485639204253834,
      "grad_norm": 0.363007515668869,
      "learning_rate": 0.0002,
      "loss": 0.2545,
      "step": 6680
    },
    {
      "epoch": 0.5502063273727648,
      "grad_norm": 0.3927077353000641,
      "learning_rate": 0.0002,
      "loss": 0.2597,
      "step": 6700
    },
    {
      "epoch": 0.5518487343201461,
      "grad_norm": 0.36897674202919006,
      "learning_rate": 0.0002,
      "loss": 0.2571,
      "step": 6720
    },
    {
      "epoch": 0.5534911412675275,
      "grad_norm": 0.3425733149051666,
      "learning_rate": 0.0002,
      "loss": 0.2624,
      "step": 6740
    },
    {
      "epoch": 0.5551335482149089,
      "grad_norm": 0.3315962553024292,
      "learning_rate": 0.0002,
      "loss": 0.2656,
      "step": 6760
    },
    {
      "epoch": 0.5567759551622903,
      "grad_norm": 0.4456098675727844,
      "learning_rate": 0.0002,
      "loss": 0.266,
      "step": 6780
    },
    {
      "epoch": 0.5584183621096718,
      "grad_norm": 0.4146248996257782,
      "learning_rate": 0.0002,
      "loss": 0.2631,
      "step": 6800
    },
    {
      "epoch": 0.5600607690570532,
      "grad_norm": 0.3591421842575073,
      "learning_rate": 0.0002,
      "loss": 0.2475,
      "step": 6820
    },
    {
      "epoch": 0.5617031760044345,
      "grad_norm": 0.4540598690509796,
      "learning_rate": 0.0002,
      "loss": 0.2667,
      "step": 6840
    },
    {
      "epoch": 0.5633455829518159,
      "grad_norm": 0.4394567906856537,
      "learning_rate": 0.0002,
      "loss": 0.2673,
      "step": 6860
    },
    {
      "epoch": 0.5649879898991973,
      "grad_norm": 0.3273297846317291,
      "learning_rate": 0.0002,
      "loss": 0.2631,
      "step": 6880
    },
    {
      "epoch": 0.5666303968465787,
      "grad_norm": 0.3828592896461487,
      "learning_rate": 0.0002,
      "loss": 0.2601,
      "step": 6900
    },
    {
      "epoch": 0.5682728037939601,
      "grad_norm": 0.24124163389205933,
      "learning_rate": 0.0002,
      "loss": 0.2507,
      "step": 6920
    },
    {
      "epoch": 0.5699152107413414,
      "grad_norm": 0.4403514564037323,
      "learning_rate": 0.0002,
      "loss": 0.2686,
      "step": 6940
    },
    {
      "epoch": 0.5715576176887228,
      "grad_norm": 0.39177918434143066,
      "learning_rate": 0.0002,
      "loss": 0.255,
      "step": 6960
    },
    {
      "epoch": 0.5732000246361042,
      "grad_norm": 0.41621333360671997,
      "learning_rate": 0.0002,
      "loss": 0.2472,
      "step": 6980
    },
    {
      "epoch": 0.5748424315834856,
      "grad_norm": 0.4051215648651123,
      "learning_rate": 0.0002,
      "loss": 0.2692,
      "step": 7000
    },
    {
      "epoch": 0.576484838530867,
      "grad_norm": 0.9351252317428589,
      "learning_rate": 0.0002,
      "loss": 0.2519,
      "step": 7020
    },
    {
      "epoch": 0.5781272454782483,
      "grad_norm": 0.38004037737846375,
      "learning_rate": 0.0002,
      "loss": 0.2683,
      "step": 7040
    },
    {
      "epoch": 0.5797696524256297,
      "grad_norm": 0.31271103024482727,
      "learning_rate": 0.0002,
      "loss": 0.2554,
      "step": 7060
    },
    {
      "epoch": 0.5814120593730111,
      "grad_norm": 0.3766959607601166,
      "learning_rate": 0.0002,
      "loss": 0.2555,
      "step": 7080
    },
    {
      "epoch": 0.5830544663203925,
      "grad_norm": 2.4575226306915283,
      "learning_rate": 0.0002,
      "loss": 0.2673,
      "step": 7100
    },
    {
      "epoch": 0.584696873267774,
      "grad_norm": 0.3419061005115509,
      "learning_rate": 0.0002,
      "loss": 0.2484,
      "step": 7120
    },
    {
      "epoch": 0.5863392802151554,
      "grad_norm": 0.3647725284099579,
      "learning_rate": 0.0002,
      "loss": 0.2614,
      "step": 7140
    },
    {
      "epoch": 0.5879816871625367,
      "grad_norm": 0.39643993973731995,
      "learning_rate": 0.0002,
      "loss": 0.2583,
      "step": 7160
    },
    {
      "epoch": 0.5896240941099181,
      "grad_norm": 0.37024736404418945,
      "learning_rate": 0.0002,
      "loss": 0.2605,
      "step": 7180
    },
    {
      "epoch": 0.5912665010572995,
      "grad_norm": 0.4551810324192047,
      "learning_rate": 0.0002,
      "loss": 0.2512,
      "step": 7200
    },
    {
      "epoch": 0.5929089080046809,
      "grad_norm": 0.2843814492225647,
      "learning_rate": 0.0002,
      "loss": 0.2504,
      "step": 7220
    },
    {
      "epoch": 0.5945513149520623,
      "grad_norm": 0.3765452206134796,
      "learning_rate": 0.0002,
      "loss": 0.2557,
      "step": 7240
    },
    {
      "epoch": 0.5961937218994436,
      "grad_norm": 0.4625066816806793,
      "learning_rate": 0.0002,
      "loss": 0.2433,
      "step": 7260
    },
    {
      "epoch": 0.597836128846825,
      "grad_norm": 0.4870743453502655,
      "learning_rate": 0.0002,
      "loss": 0.2494,
      "step": 7280
    },
    {
      "epoch": 0.5994785357942064,
      "grad_norm": 0.4229605197906494,
      "learning_rate": 0.0002,
      "loss": 0.2553,
      "step": 7300
    },
    {
      "epoch": 0.6011209427415878,
      "grad_norm": 0.37593892216682434,
      "learning_rate": 0.0002,
      "loss": 0.2523,
      "step": 7320
    },
    {
      "epoch": 0.6027633496889692,
      "grad_norm": 0.36149609088897705,
      "learning_rate": 0.0002,
      "loss": 0.2582,
      "step": 7340
    },
    {
      "epoch": 0.6044057566363505,
      "grad_norm": 0.3866046071052551,
      "learning_rate": 0.0002,
      "loss": 0.2534,
      "step": 7360
| }, | |
| { | |
| "epoch": 0.6060481635837319, | |
| "grad_norm": 0.4623259902000427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2542, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.6076905705311133, | |
| "grad_norm": 0.32349276542663574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2437, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.6093329774784947, | |
| "grad_norm": 0.386561781167984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2494, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.6109753844258761, | |
| "grad_norm": 0.36509180068969727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2559, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.6126177913732576, | |
| "grad_norm": 0.3628571331501007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.26, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.6142601983206389, | |
| "grad_norm": 0.3218732476234436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2487, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.6159026052680203, | |
| "grad_norm": 0.3551442623138428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.231, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.6175450122154017, | |
| "grad_norm": 0.40962496399879456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2486, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.6191874191627831, | |
| "grad_norm": 0.48531442880630493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2547, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.6208298261101645, | |
| "grad_norm": 0.387851357460022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2655, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 0.6224722330575458, | |
| "grad_norm": 0.3165546953678131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2499, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 0.6241146400049272, | |
| "grad_norm": 0.3393017649650574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2546, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.6257570469523086, | |
| "grad_norm": 0.3975006639957428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.255, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 0.62739945389969, | |
| "grad_norm": 0.4458036720752716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2671, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 0.6290418608470714, | |
| "grad_norm": 0.34977594017982483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2438, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 0.6306842677944527, | |
| "grad_norm": 0.4126521646976471, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2473, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.6323266747418341, | |
| "grad_norm": 0.35712817311286926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2568, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.6339690816892155, | |
| "grad_norm": 0.3464488983154297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.26, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 0.6356114886365969, | |
| "grad_norm": 0.40559422969818115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2531, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 0.6372538955839783, | |
| "grad_norm": 0.3709222972393036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.257, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 0.6388963025313598, | |
| "grad_norm": 0.3671443462371826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.243, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 0.6405387094787411, | |
| "grad_norm": 0.39361605048179626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2569, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.6421811164261225, | |
| "grad_norm": 0.41323602199554443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2465, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 0.6438235233735039, | |
| "grad_norm": 0.4266330301761627, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2495, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 0.6454659303208853, | |
| "grad_norm": 0.3892604112625122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2505, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 0.6471083372682667, | |
| "grad_norm": 0.43539443612098694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2643, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 0.648750744215648, | |
| "grad_norm": 0.3637757897377014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2557, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.6503931511630294, | |
| "grad_norm": 0.42761602997779846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2578, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 0.6520355581104108, | |
| "grad_norm": 0.38917163014411926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2593, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 0.6536779650577922, | |
| "grad_norm": 0.42814767360687256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2412, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 0.6553203720051736, | |
| "grad_norm": 0.3543958365917206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2485, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 0.656962778952555, | |
| "grad_norm": 0.3452099859714508, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2519, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.6586051858999363, | |
| "grad_norm": 0.38600897789001465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2443, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 0.6602475928473177, | |
| "grad_norm": 0.35474061965942383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2435, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 0.6618899997946991, | |
| "grad_norm": 0.48493891954421997, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2564, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 0.6635324067420805, | |
| "grad_norm": 0.40137720108032227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2592, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 0.6651748136894619, | |
| "grad_norm": 0.38460877537727356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2387, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.6668172206368433, | |
| "grad_norm": 0.3780753016471863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2517, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 0.6684596275842247, | |
| "grad_norm": 0.30384665727615356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2442, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 0.6701020345316061, | |
| "grad_norm": 0.34080567955970764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2443, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 0.6717444414789875, | |
| "grad_norm": 0.3789510130882263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2462, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 0.6733868484263689, | |
| "grad_norm": 0.3566538989543915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2418, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.6750292553737502, | |
| "grad_norm": 0.3436945676803589, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2353, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 0.6766716623211316, | |
| "grad_norm": 0.35046547651290894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2521, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 0.678314069268513, | |
| "grad_norm": 0.3671397566795349, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2505, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 0.6799564762158944, | |
| "grad_norm": 0.33368802070617676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2663, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 0.6815988831632758, | |
| "grad_norm": 0.35810762643814087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2467, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.6832412901106572, | |
| "grad_norm": 0.3913412094116211, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2544, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 0.6848836970580385, | |
| "grad_norm": 0.3313830494880676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2551, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 0.6865261040054199, | |
| "grad_norm": 0.3506488502025604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2416, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 0.6881685109528013, | |
| "grad_norm": 0.3841126561164856, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2531, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 0.6898109179001827, | |
| "grad_norm": 0.38030919432640076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2374, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.691453324847564, | |
| "grad_norm": 0.3643128573894501, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2616, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 0.6930957317949455, | |
| "grad_norm": 0.37401241064071655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2424, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 0.6947381387423269, | |
| "grad_norm": 0.42304474115371704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2491, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 0.6963805456897083, | |
| "grad_norm": 0.3441920280456543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2429, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 0.6980229526370897, | |
| "grad_norm": 0.33383867144584656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2361, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6996653595844711, | |
| "grad_norm": 0.42935657501220703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2598, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 0.7013077665318525, | |
| "grad_norm": 0.5143205523490906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2348, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 0.7029501734792338, | |
| "grad_norm": 0.37915435433387756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2277, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 0.7045925804266152, | |
| "grad_norm": 0.3202255666255951, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2474, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 0.7062349873739966, | |
| "grad_norm": 0.3681676387786865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2417, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.707877394321378, | |
| "grad_norm": 0.41214585304260254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2356, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 0.7095198012687594, | |
| "grad_norm": 0.35259029269218445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2394, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 0.7111622082161407, | |
| "grad_norm": 0.47768017649650574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.248, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 0.7128046151635221, | |
| "grad_norm": 0.3282839059829712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2336, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 0.7144470221109035, | |
| "grad_norm": 0.441099613904953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2631, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.7160894290582849, | |
| "grad_norm": 0.3486292362213135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2531, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 0.7177318360056663, | |
| "grad_norm": 0.33037880063056946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2405, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 0.7193742429530476, | |
| "grad_norm": 0.47114354372024536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2665, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 0.7210166499004291, | |
| "grad_norm": 0.34797531366348267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2481, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 0.7226590568478105, | |
| "grad_norm": 0.43183642625808716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.242, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.7243014637951919, | |
| "grad_norm": 0.4230342507362366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2363, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 0.7259438707425733, | |
| "grad_norm": 0.40553364157676697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2422, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 0.7275862776899547, | |
| "grad_norm": 0.34155145287513733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2422, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 0.729228684637336, | |
| "grad_norm": 0.4095294773578644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2605, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 0.7308710915847174, | |
| "grad_norm": 0.36541318893432617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2516, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.7325134985320988, | |
| "grad_norm": 0.40149998664855957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2515, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 0.7341559054794802, | |
| "grad_norm": 0.3220469653606415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2361, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 0.7357983124268616, | |
| "grad_norm": 0.3153376579284668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2325, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 0.7374407193742429, | |
| "grad_norm": 0.3046116530895233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2502, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 0.7390831263216243, | |
| "grad_norm": 0.502663791179657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2471, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.7407255332690057, | |
| "grad_norm": 0.35168886184692383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2309, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 0.7423679402163871, | |
| "grad_norm": 0.43629148602485657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2423, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 0.7440103471637685, | |
| "grad_norm": 0.35909175872802734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2453, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 0.7456527541111498, | |
| "grad_norm": 0.3052688539028168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2413, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 0.7472951610585313, | |
| "grad_norm": 0.2708439230918884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2237, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.7489375680059127, | |
| "grad_norm": 0.3965560495853424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2423, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 0.7505799749532941, | |
| "grad_norm": 0.3895662724971771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.249, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 0.7522223819006755, | |
| "grad_norm": 0.32124513387680054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2376, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 0.7538647888480569, | |
| "grad_norm": 0.716029167175293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2529, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 0.7555071957954382, | |
| "grad_norm": 0.3812948167324066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2269, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.7571496027428196, | |
| "grad_norm": 0.37073054909706116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.235, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 0.758792009690201, | |
| "grad_norm": 0.4043092727661133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2345, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 0.7604344166375824, | |
| "grad_norm": 0.3160434365272522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2412, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 0.7620768235849638, | |
| "grad_norm": 0.35415521264076233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2358, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 0.7637192305323451, | |
| "grad_norm": 0.41371211409568787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2317, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.7653616374797265, | |
| "grad_norm": 0.4175126850605011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2547, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 0.7670040444271079, | |
| "grad_norm": 0.39811649918556213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2462, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 0.7686464513744893, | |
| "grad_norm": 0.33596447110176086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2368, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 0.7702888583218707, | |
| "grad_norm": 0.36754104495048523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2484, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 0.771931265269252, | |
| "grad_norm": 0.38244250416755676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2364, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.7735736722166334, | |
| "grad_norm": 0.3366243839263916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2194, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 0.7752160791640149, | |
| "grad_norm": 0.39877885580062866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2469, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 0.7768584861113963, | |
| "grad_norm": 0.2690157890319824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2459, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 0.7785008930587777, | |
| "grad_norm": 0.3678382337093353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2192, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 0.7801433000061591, | |
| "grad_norm": 0.3121150732040405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2438, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.7817857069535404, | |
| "grad_norm": 0.3517535626888275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2495, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 0.7834281139009218, | |
| "grad_norm": 0.434817910194397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2532, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 0.7850705208483032, | |
| "grad_norm": 0.35570958256721497, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2467, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 0.7867129277956846, | |
| "grad_norm": 0.4270517826080322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2337, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 0.788355334743066, | |
| "grad_norm": 0.2827800214290619, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2309, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.7899977416904473, | |
| "grad_norm": 0.39158400893211365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2366, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 0.7916401486378287, | |
| "grad_norm": 0.32538673281669617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2389, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 0.7932825555852101, | |
| "grad_norm": 0.3370015323162079, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2377, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 0.7949249625325915, | |
| "grad_norm": 0.3779650032520294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2339, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 0.7965673694799729, | |
| "grad_norm": 0.36034300923347473, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2427, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.7982097764273542, | |
| "grad_norm": 0.3154286742210388, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2338, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 0.7998521833747356, | |
| "grad_norm": 0.3282501697540283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2408, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 0.8014945903221171, | |
| "grad_norm": 0.41291025280952454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2507, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 0.8031369972694985, | |
| "grad_norm": 0.3961363136768341, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2281, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 0.8047794042168799, | |
| "grad_norm": 0.47485384345054626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2349, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.8064218111642613, | |
| "grad_norm": 0.3284982740879059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2288, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 0.8080642181116426, | |
| "grad_norm": 0.38867270946502686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2328, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 0.809706625059024, | |
| "grad_norm": 0.44371268153190613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2416, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 0.8113490320064054, | |
| "grad_norm": 0.2462434470653534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2391, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 0.8129914389537868, | |
| "grad_norm": 0.31762421131134033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2467, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.8146338459011682, | |
| "grad_norm": 0.40011724829673767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2351, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 0.8162762528485495, | |
| "grad_norm": 0.2972090542316437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2469, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 0.8179186597959309, | |
| "grad_norm": 0.4047238230705261, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2257, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 0.8195610667433123, | |
| "grad_norm": 0.36663326621055603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2302, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 0.8212034736906937, | |
| "grad_norm": 0.49191904067993164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.242, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.8228458806380751, | |
| "grad_norm": 0.4621546268463135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2324, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 0.8244882875854564, | |
| "grad_norm": 0.4055505394935608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2373, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 0.8261306945328378, | |
| "grad_norm": 0.34892845153808594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.23, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 0.8277731014802192, | |
| "grad_norm": 0.33453091979026794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2348, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 0.8294155084276007, | |
| "grad_norm": 0.3283565640449524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2314, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.8310579153749821, | |
| "grad_norm": 0.35970717668533325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2336, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 0.8327003223223635, | |
| "grad_norm": 0.3093232810497284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2363, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 0.8343427292697448, | |
| "grad_norm": 0.4389066696166992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2422, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 0.8359851362171262, | |
| "grad_norm": 0.44654580950737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.232, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 0.8376275431645076, | |
| "grad_norm": 0.2830391526222229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2476, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.839269950111889, | |
| "grad_norm": 0.31547674536705017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.231, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 0.8409123570592704, | |
| "grad_norm": 0.45748040080070496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2372, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.8425547640066517, | |
| "grad_norm": 0.34882062673568726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2376, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 0.8441971709540331, | |
| "grad_norm": 0.3529532849788666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2323, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 0.8458395779014145, | |
| "grad_norm": 0.33054473996162415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2376, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.8474819848487959, | |
| "grad_norm": 0.3015061616897583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2243, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 0.8491243917961773, | |
| "grad_norm": 0.3048664629459381, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2318, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 0.8507667987435586, | |
| "grad_norm": 0.31459841132164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2307, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 0.85240920569094, | |
| "grad_norm": 0.39160168170928955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2407, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 0.8540516126383214, | |
| "grad_norm": 0.30392590165138245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2206, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.8556940195857029, | |
| "grad_norm": 0.3656589686870575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.229, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 0.8573364265330843, | |
| "grad_norm": 0.35856541991233826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2361, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 0.8589788334804657, | |
| "grad_norm": 0.3591729402542114, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2232, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 0.860621240427847, | |
| "grad_norm": 0.36023178696632385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2495, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 0.8622636473752284, | |
| "grad_norm": 0.38790059089660645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2288, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.8639060543226098, | |
| "grad_norm": 0.39627397060394287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.24, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 0.8655484612699912, | |
| "grad_norm": 0.32167407870292664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2365, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 0.8671908682173726, | |
| "grad_norm": 0.34265172481536865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2419, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 0.868833275164754, | |
| "grad_norm": 0.3236486613750458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2326, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 0.8704756821121353, | |
| "grad_norm": 0.3700607120990753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2361, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.8721180890595167, | |
| "grad_norm": 0.33969688415527344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2236, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 0.8737604960068981, | |
| "grad_norm": 0.2824096083641052, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2415, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 0.8754029029542795, | |
| "grad_norm": 0.3842727243900299, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2223, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 0.8770453099016609, | |
| "grad_norm": 0.36808887124061584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2253, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 0.8786877168490422, | |
| "grad_norm": 0.4065176844596863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2274, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.8803301237964236, | |
| "grad_norm": 0.3421749174594879, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2309, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 0.881972530743805, | |
| "grad_norm": 0.30610519647598267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2213, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 0.8836149376911865, | |
| "grad_norm": 0.40341177582740784, | |
| "learning_rate": 0.0002, | |
| "loss": 0.229, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 0.8852573446385679, | |
| "grad_norm": 0.43038755655288696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2312, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 0.8868997515859492, | |
| "grad_norm": 0.26736319065093994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2375, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.8885421585333306, | |
| "grad_norm": 0.34479281306266785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2342, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 0.890184565480712, | |
| "grad_norm": 0.32857152819633484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2352, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 0.8918269724280934, | |
| "grad_norm": 0.30919578671455383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2133, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 0.8934693793754748, | |
| "grad_norm": 0.3049899637699127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2374, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 0.8951117863228562, | |
| "grad_norm": 0.4088539779186249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2377, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.8967541932702375, | |
| "grad_norm": 0.3318689167499542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2459, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 0.8983966002176189, | |
| "grad_norm": 0.38051754236221313, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2305, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 0.9000390071650003, | |
| "grad_norm": 0.401080846786499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2297, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 0.9016814141123817, | |
| "grad_norm": 0.30713602900505066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2254, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 0.903323821059763, | |
| "grad_norm": 0.37888234853744507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2346, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.9049662280071444, | |
| "grad_norm": 0.3106231689453125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2206, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 0.9066086349545258, | |
| "grad_norm": 0.44297677278518677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2218, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 0.9082510419019072, | |
| "grad_norm": 0.3375784456729889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2273, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 0.9098934488492886, | |
| "grad_norm": 0.4860747158527374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2317, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 0.9115358557966701, | |
| "grad_norm": 0.2880633771419525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2398, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.9131782627440514, | |
| "grad_norm": 0.4085402190685272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.234, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 0.9148206696914328, | |
| "grad_norm": 0.38998520374298096, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2402, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 0.9164630766388142, | |
| "grad_norm": 0.40508535504341125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2136, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 0.9181054835861956, | |
| "grad_norm": 0.3789615035057068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2267, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 0.919747890533577, | |
| "grad_norm": 0.3882130980491638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2276, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.9213902974809584, | |
| "grad_norm": 0.3001303970813751, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2313, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 0.9230327044283397, | |
| "grad_norm": 0.4514042139053345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2204, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 0.9246751113757211, | |
| "grad_norm": 0.43372517824172974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2294, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 0.9263175183231025, | |
| "grad_norm": 0.2934057414531708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2308, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 0.9279599252704839, | |
| "grad_norm": 0.4067831337451935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2329, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.9296023322178653, | |
| "grad_norm": 0.3299509584903717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2214, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 0.9312447391652466, | |
| "grad_norm": 0.35204941034317017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.239, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 0.932887146112628, | |
| "grad_norm": 0.30878013372421265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2248, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 0.9345295530600094, | |
| "grad_norm": 0.392170786857605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2274, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 0.9361719600073908, | |
| "grad_norm": 0.4151529371738434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2186, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.9378143669547723, | |
| "grad_norm": 0.3535741865634918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2285, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 0.9394567739021537, | |
| "grad_norm": 0.3477960526943207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2313, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 0.941099180849535, | |
| "grad_norm": 0.3621846139431, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2317, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 0.9427415877969164, | |
| "grad_norm": 0.3844580352306366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2345, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 0.9443839947442978, | |
| "grad_norm": 0.3395872116088867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2233, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.9460264016916792, | |
| "grad_norm": 0.4554111063480377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2324, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 0.9476688086390606, | |
| "grad_norm": 0.34367838501930237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2157, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 0.9493112155864419, | |
| "grad_norm": 0.2760342061519623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2278, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 0.9509536225338233, | |
| "grad_norm": 0.4382875859737396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2361, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 0.9525960294812047, | |
| "grad_norm": 0.3573220670223236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2241, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.9542384364285861, | |
| "grad_norm": 0.3491596579551697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2258, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 0.9558808433759675, | |
| "grad_norm": 0.42366743087768555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2406, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 0.9575232503233488, | |
| "grad_norm": 0.3748779892921448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2305, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 0.9591656572707302, | |
| "grad_norm": 0.40864527225494385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.235, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 0.9608080642181116, | |
| "grad_norm": 0.41164445877075195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2195, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.962450471165493, | |
| "grad_norm": 0.46402692794799805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2266, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 0.9640928781128744, | |
| "grad_norm": 0.32727622985839844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2324, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 0.9657352850602559, | |
| "grad_norm": 0.4346349537372589, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2257, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 0.9673776920076372, | |
| "grad_norm": 0.3470235764980316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2333, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 0.9690200989550186, | |
| "grad_norm": 0.48941469192504883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2336, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.9706625059024, | |
| "grad_norm": 0.3959124982357025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2221, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 0.9723049128497814, | |
| "grad_norm": 0.40877676010131836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.232, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 0.9739473197971628, | |
| "grad_norm": 0.4087940454483032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2195, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 0.9755897267445441, | |
| "grad_norm": 0.3967040181159973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.234, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 0.9772321336919255, | |
| "grad_norm": 0.41639575362205505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.221, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.9788745406393069, | |
| "grad_norm": 0.304775595664978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2283, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 0.9805169475866883, | |
| "grad_norm": 0.41931501030921936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2263, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 0.9821593545340697, | |
| "grad_norm": 0.34010422229766846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.222, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 0.983801761481451, | |
| "grad_norm": 0.3099174499511719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2221, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 0.9854441684288324, | |
| "grad_norm": 0.3627716600894928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2419, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.9870865753762138, | |
| "grad_norm": 0.3797793388366699, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2289, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 0.9887289823235952, | |
| "grad_norm": 0.34914806485176086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2211, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 0.9903713892709766, | |
| "grad_norm": 0.35985666513442993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2271, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 0.9920137962183581, | |
| "grad_norm": 0.3159051835536957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2364, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 0.9936562031657394, | |
| "grad_norm": 0.29203563928604126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2429, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.9952986101131208, | |
| "grad_norm": 0.32187801599502563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2386, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 0.9969410170605022, | |
| "grad_norm": 0.35564154386520386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2349, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 0.9985834240078836, | |
| "grad_norm": 0.3589749336242676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.2275, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.28475141525268555, | |
| "eval_runtime": 907.1315, | |
| "eval_samples_per_second": 4.174, | |
| "eval_steps_per_second": 0.523, | |
| "step": 12178 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 16000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 77, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.843715322728153e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |