diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,49952 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 7130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014025245441795232, + "grad_norm": 13.661454734360627, + "learning_rate": 4.672897196261682e-08, + "loss": 1.4474, + "step": 1 + }, + { + "epoch": 0.00028050490883590464, + "grad_norm": 11.652364702042846, + "learning_rate": 9.345794392523364e-08, + "loss": 1.3264, + "step": 2 + }, + { + "epoch": 0.00042075736325385696, + "grad_norm": 14.080603581409317, + "learning_rate": 1.4018691588785048e-07, + "loss": 1.3986, + "step": 3 + }, + { + "epoch": 0.0005610098176718093, + "grad_norm": 13.216997612124556, + "learning_rate": 1.8691588785046729e-07, + "loss": 1.4604, + "step": 4 + }, + { + "epoch": 0.0007012622720897616, + "grad_norm": 13.473275754095528, + "learning_rate": 2.3364485981308412e-07, + "loss": 1.4155, + "step": 5 + }, + { + "epoch": 0.0008415147265077139, + "grad_norm": 14.316948211945018, + "learning_rate": 2.8037383177570096e-07, + "loss": 1.409, + "step": 6 + }, + { + "epoch": 0.0009817671809256663, + "grad_norm": 13.272260284759573, + "learning_rate": 3.2710280373831776e-07, + "loss": 1.4131, + "step": 7 + }, + { + "epoch": 0.0011220196353436186, + "grad_norm": 11.733722206376212, + "learning_rate": 3.7383177570093457e-07, + "loss": 1.3955, + "step": 8 + }, + { + "epoch": 0.0012622720897615708, + "grad_norm": 13.253857321048795, + "learning_rate": 4.2056074766355143e-07, + "loss": 1.4297, + "step": 9 + }, + { + "epoch": 0.001402524544179523, + "grad_norm": 15.15472769416775, + "learning_rate": 4.6728971962616824e-07, + "loss": 1.5023, + "step": 10 + }, + { + "epoch": 0.0015427769985974754, + "grad_norm": 14.32592783214941, + "learning_rate": 5.140186915887851e-07, + "loss": 1.3748, + "step": 11 + }, + { + "epoch": 0.0016830294530154279, + "grad_norm": 14.096768142745077, + "learning_rate": 5.607476635514019e-07, + "loss": 1.3932, + "step": 12 + }, + { + "epoch": 0.0018232819074333801, + "grad_norm": 12.327740062510571, + "learning_rate": 6.074766355140187e-07, + "loss": 1.294, + "step": 13 + }, + { + "epoch": 0.0019635343618513326, + "grad_norm": 11.411498150508658, + "learning_rate": 6.542056074766355e-07, + "loss": 1.3779, + "step": 14 + }, + { + "epoch": 0.0021037868162692847, + "grad_norm": 12.706843321120317, + "learning_rate": 7.009345794392523e-07, + "loss": 1.2252, + "step": 15 + }, + { + "epoch": 0.002244039270687237, + "grad_norm": 12.702905181446807, + "learning_rate": 7.476635514018691e-07, + "loss": 1.1854, + "step": 16 + }, + { + "epoch": 0.002384291725105189, + "grad_norm": 11.68784466403558, + "learning_rate": 7.94392523364486e-07, + "loss": 1.1701, + "step": 17 + }, + { + "epoch": 0.0025245441795231417, + "grad_norm": 10.974135769334598, + "learning_rate": 8.411214953271029e-07, + "loss": 1.2076, + "step": 18 + }, + { + "epoch": 0.002664796633941094, + "grad_norm": 10.77184860983223, + "learning_rate": 8.878504672897197e-07, + "loss": 1.1467, + "step": 19 + }, + { + "epoch": 0.002805049088359046, + "grad_norm": 8.746388589268841, + "learning_rate": 9.345794392523365e-07, + "loss": 1.0417, + "step": 20 + }, + { + "epoch": 0.0029453015427769987, + "grad_norm": 7.124787979986887, + "learning_rate": 9.813084112149534e-07, + "loss": 1.0061, + "step": 21 + }, + { + "epoch": 0.0030855539971949507, + "grad_norm": 6.335205035786844, + "learning_rate": 1.0280373831775702e-06, + "loss": 0.9409, + "step": 22 + }, + { + "epoch": 0.0032258064516129032, + "grad_norm": 5.601907418882994, + "learning_rate": 1.074766355140187e-06, + "loss": 0.9028, + "step": 23 + }, + { + "epoch": 0.0033660589060308557, + "grad_norm": 5.160024290397307, + "learning_rate": 1.1214953271028038e-06, + "loss": 0.91, + "step": 24 + }, + { + "epoch": 0.0035063113604488078, + "grad_norm": 4.43592914125005, + "learning_rate": 1.1682242990654206e-06, + "loss": 0.8257, + "step": 25 + }, + { + "epoch": 0.0036465638148667602, + "grad_norm": 5.155283519448136, + "learning_rate": 1.2149532710280374e-06, + "loss": 0.8183, + "step": 26 + }, + { + "epoch": 0.0037868162692847123, + "grad_norm": 4.5863816730169695, + "learning_rate": 1.2616822429906543e-06, + "loss": 0.7743, + "step": 27 + }, + { + "epoch": 0.003927068723702665, + "grad_norm": 5.7306784374979465, + "learning_rate": 1.308411214953271e-06, + "loss": 0.7569, + "step": 28 + }, + { + "epoch": 0.004067321178120617, + "grad_norm": 5.6621386819050095, + "learning_rate": 1.3551401869158879e-06, + "loss": 0.7621, + "step": 29 + }, + { + "epoch": 0.004207573632538569, + "grad_norm": 6.28696830427706, + "learning_rate": 1.4018691588785047e-06, + "loss": 0.8344, + "step": 30 + }, + { + "epoch": 0.004347826086956522, + "grad_norm": 4.527708577028351, + "learning_rate": 1.4485981308411215e-06, + "loss": 0.6995, + "step": 31 + }, + { + "epoch": 0.004488078541374474, + "grad_norm": 3.823721209414868, + "learning_rate": 1.4953271028037383e-06, + "loss": 0.719, + "step": 32 + }, + { + "epoch": 0.004628330995792427, + "grad_norm": 3.7322723981548185, + "learning_rate": 1.542056074766355e-06, + "loss": 0.7347, + "step": 33 + }, + { + "epoch": 0.004768583450210378, + "grad_norm": 3.596050024005017, + "learning_rate": 1.588785046728972e-06, + "loss": 0.7049, + "step": 34 + }, + { + "epoch": 0.004908835904628331, + "grad_norm": 2.8044936769291433, + "learning_rate": 1.6355140186915887e-06, + "loss": 0.7007, + "step": 35 + }, + { + "epoch": 0.005049088359046283, + "grad_norm": 3.320112873134955, + "learning_rate": 1.6822429906542057e-06, + "loss": 0.7024, + "step": 36 + }, + { + "epoch": 0.005189340813464236, + "grad_norm": 3.429517598386339, + "learning_rate": 1.7289719626168225e-06, + "loss": 0.6795, + "step": 37 + }, + { + "epoch": 0.005329593267882188, + "grad_norm": 3.2349477067402863, + "learning_rate": 1.7757009345794394e-06, + "loss": 0.5921, + "step": 38 + }, + { + "epoch": 0.00546984572230014, + "grad_norm": 3.6712892755077213, + "learning_rate": 1.8224299065420562e-06, + "loss": 0.7182, + "step": 39 + }, + { + "epoch": 0.005610098176718092, + "grad_norm": 3.607521539043542, + "learning_rate": 1.869158878504673e-06, + "loss": 0.6387, + "step": 40 + }, + { + "epoch": 0.005750350631136045, + "grad_norm": 3.1687167374710383, + "learning_rate": 1.9158878504672898e-06, + "loss": 0.6739, + "step": 41 + }, + { + "epoch": 0.005890603085553997, + "grad_norm": 3.917755638404281, + "learning_rate": 1.962616822429907e-06, + "loss": 0.6093, + "step": 42 + }, + { + "epoch": 0.00603085553997195, + "grad_norm": 3.1539899836195517, + "learning_rate": 2.0093457943925234e-06, + "loss": 0.6461, + "step": 43 + }, + { + "epoch": 0.0061711079943899015, + "grad_norm": 2.7970553718244044, + "learning_rate": 2.0560747663551404e-06, + "loss": 0.6713, + "step": 44 + }, + { + "epoch": 0.006311360448807854, + "grad_norm": 3.378191110421054, + "learning_rate": 2.102803738317757e-06, + "loss": 0.6094, + "step": 45 + }, + { + "epoch": 0.0064516129032258064, + "grad_norm": 3.113807011606497, + "learning_rate": 2.149532710280374e-06, + "loss": 0.686, + "step": 46 + }, + { + "epoch": 0.006591865357643759, + "grad_norm": 7.931595175396131, + "learning_rate": 2.1962616822429906e-06, + "loss": 0.6347, + "step": 47 + }, + { + "epoch": 0.006732117812061711, + "grad_norm": 2.9221977830774652, + "learning_rate": 2.2429906542056077e-06, + "loss": 0.5823, + "step": 48 + }, + { + "epoch": 0.006872370266479663, + "grad_norm": 2.756270888199664, + "learning_rate": 2.2897196261682247e-06, + "loss": 0.6081, + "step": 49 + }, + { + "epoch": 0.0070126227208976155, + "grad_norm": 2.5633470949449273, + "learning_rate": 2.3364485981308413e-06, + "loss": 0.6211, + "step": 50 + }, + { + "epoch": 0.007152875175315568, + "grad_norm": 2.431050301712823, + "learning_rate": 2.3831775700934583e-06, + "loss": 0.5952, + "step": 51 + }, + { + "epoch": 0.0072931276297335205, + "grad_norm": 2.502156281393821, + "learning_rate": 2.429906542056075e-06, + "loss": 0.5901, + "step": 52 + }, + { + "epoch": 0.007433380084151473, + "grad_norm": 2.8927597823402476, + "learning_rate": 2.476635514018692e-06, + "loss": 0.5322, + "step": 53 + }, + { + "epoch": 0.007573632538569425, + "grad_norm": 2.823575486071377, + "learning_rate": 2.5233644859813085e-06, + "loss": 0.6088, + "step": 54 + }, + { + "epoch": 0.007713884992987377, + "grad_norm": 2.3749801233363357, + "learning_rate": 2.570093457943925e-06, + "loss": 0.5664, + "step": 55 + }, + { + "epoch": 0.00785413744740533, + "grad_norm": 6.038239377321177, + "learning_rate": 2.616822429906542e-06, + "loss": 0.6008, + "step": 56 + }, + { + "epoch": 0.007994389901823282, + "grad_norm": 2.709237744859725, + "learning_rate": 2.6635514018691587e-06, + "loss": 0.546, + "step": 57 + }, + { + "epoch": 0.008134642356241234, + "grad_norm": 2.582624912546111, + "learning_rate": 2.7102803738317757e-06, + "loss": 0.5844, + "step": 58 + }, + { + "epoch": 0.008274894810659187, + "grad_norm": 2.9544317997233085, + "learning_rate": 2.7570093457943923e-06, + "loss": 0.5631, + "step": 59 + }, + { + "epoch": 0.008415147265077139, + "grad_norm": 2.6056831420355455, + "learning_rate": 2.8037383177570094e-06, + "loss": 0.5304, + "step": 60 + }, + { + "epoch": 0.008555399719495092, + "grad_norm": 2.4488024684859915, + "learning_rate": 2.8504672897196264e-06, + "loss": 0.5022, + "step": 61 + }, + { + "epoch": 0.008695652173913044, + "grad_norm": 2.541940882747964, + "learning_rate": 2.897196261682243e-06, + "loss": 0.6031, + "step": 62 + }, + { + "epoch": 0.008835904628330995, + "grad_norm": 2.2663267956034527, + "learning_rate": 2.94392523364486e-06, + "loss": 0.5485, + "step": 63 + }, + { + "epoch": 0.008976157082748949, + "grad_norm": 2.8042865774770886, + "learning_rate": 2.9906542056074766e-06, + "loss": 0.5517, + "step": 64 + }, + { + "epoch": 0.0091164095371669, + "grad_norm": 3.8375715516219837, + "learning_rate": 3.0373831775700936e-06, + "loss": 0.5794, + "step": 65 + }, + { + "epoch": 0.009256661991584854, + "grad_norm": 2.5126065929063692, + "learning_rate": 3.08411214953271e-06, + "loss": 0.5446, + "step": 66 + }, + { + "epoch": 0.009396914446002805, + "grad_norm": 2.98158797101787, + "learning_rate": 3.1308411214953272e-06, + "loss": 0.4908, + "step": 67 + }, + { + "epoch": 0.009537166900420757, + "grad_norm": 2.166071320927866, + "learning_rate": 3.177570093457944e-06, + "loss": 0.5105, + "step": 68 + }, + { + "epoch": 0.00967741935483871, + "grad_norm": 2.5281475504658086, + "learning_rate": 3.224299065420561e-06, + "loss": 0.551, + "step": 69 + }, + { + "epoch": 0.009817671809256662, + "grad_norm": 2.473513004309663, + "learning_rate": 3.2710280373831774e-06, + "loss": 0.5112, + "step": 70 + }, + { + "epoch": 0.009957924263674615, + "grad_norm": 2.359169856367669, + "learning_rate": 3.3177570093457945e-06, + "loss": 0.5559, + "step": 71 + }, + { + "epoch": 0.010098176718092567, + "grad_norm": 2.8093961755211234, + "learning_rate": 3.3644859813084115e-06, + "loss": 0.5144, + "step": 72 + }, + { + "epoch": 0.010238429172510518, + "grad_norm": 2.840281288830847, + "learning_rate": 3.411214953271028e-06, + "loss": 0.5191, + "step": 73 + }, + { + "epoch": 0.010378681626928472, + "grad_norm": 2.1312129132128756, + "learning_rate": 3.457943925233645e-06, + "loss": 0.5524, + "step": 74 + }, + { + "epoch": 0.010518934081346423, + "grad_norm": 3.2541440805847226, + "learning_rate": 3.5046728971962617e-06, + "loss": 0.4996, + "step": 75 + }, + { + "epoch": 0.010659186535764377, + "grad_norm": 2.4664219369712668, + "learning_rate": 3.5514018691588787e-06, + "loss": 0.4917, + "step": 76 + }, + { + "epoch": 0.010799438990182328, + "grad_norm": 1.8505733117148453, + "learning_rate": 3.5981308411214953e-06, + "loss": 0.4527, + "step": 77 + }, + { + "epoch": 0.01093969144460028, + "grad_norm": 3.1081107938600225, + "learning_rate": 3.6448598130841123e-06, + "loss": 0.49, + "step": 78 + }, + { + "epoch": 0.011079943899018233, + "grad_norm": 2.2399443461112014, + "learning_rate": 3.691588785046729e-06, + "loss": 0.4954, + "step": 79 + }, + { + "epoch": 0.011220196353436185, + "grad_norm": 2.426412552464205, + "learning_rate": 3.738317757009346e-06, + "loss": 0.5283, + "step": 80 + }, + { + "epoch": 0.011360448807854138, + "grad_norm": 2.1635754860108283, + "learning_rate": 3.785046728971963e-06, + "loss": 0.5162, + "step": 81 + }, + { + "epoch": 0.01150070126227209, + "grad_norm": 12.432751846692884, + "learning_rate": 3.8317757009345796e-06, + "loss": 0.5213, + "step": 82 + }, + { + "epoch": 0.011640953716690041, + "grad_norm": 2.2515305505586367, + "learning_rate": 3.878504672897196e-06, + "loss": 0.5692, + "step": 83 + }, + { + "epoch": 0.011781206171107995, + "grad_norm": 2.6768165089275433, + "learning_rate": 3.925233644859814e-06, + "loss": 0.4898, + "step": 84 + }, + { + "epoch": 0.011921458625525946, + "grad_norm": 2.375804564070908, + "learning_rate": 3.97196261682243e-06, + "loss": 0.5063, + "step": 85 + }, + { + "epoch": 0.0120617110799439, + "grad_norm": 2.4232912906146264, + "learning_rate": 4.018691588785047e-06, + "loss": 0.5118, + "step": 86 + }, + { + "epoch": 0.012201963534361851, + "grad_norm": 2.137906095064392, + "learning_rate": 4.065420560747663e-06, + "loss": 0.4843, + "step": 87 + }, + { + "epoch": 0.012342215988779803, + "grad_norm": 2.529576631488027, + "learning_rate": 4.112149532710281e-06, + "loss": 0.4882, + "step": 88 + }, + { + "epoch": 0.012482468443197756, + "grad_norm": 2.4081983728663574, + "learning_rate": 4.1588785046728974e-06, + "loss": 0.4941, + "step": 89 + }, + { + "epoch": 0.012622720897615708, + "grad_norm": 2.473316942025269, + "learning_rate": 4.205607476635514e-06, + "loss": 0.4752, + "step": 90 + }, + { + "epoch": 0.012762973352033661, + "grad_norm": 2.542254697806033, + "learning_rate": 4.2523364485981315e-06, + "loss": 0.5384, + "step": 91 + }, + { + "epoch": 0.012903225806451613, + "grad_norm": 2.930920150840522, + "learning_rate": 4.299065420560748e-06, + "loss": 0.4801, + "step": 92 + }, + { + "epoch": 0.013043478260869565, + "grad_norm": 2.4199862098300935, + "learning_rate": 4.345794392523365e-06, + "loss": 0.4703, + "step": 93 + }, + { + "epoch": 0.013183730715287518, + "grad_norm": 2.898503963397338, + "learning_rate": 4.392523364485981e-06, + "loss": 0.5149, + "step": 94 + }, + { + "epoch": 0.01332398316970547, + "grad_norm": 2.4811731014298526, + "learning_rate": 4.439252336448599e-06, + "loss": 0.4948, + "step": 95 + }, + { + "epoch": 0.013464235624123423, + "grad_norm": 2.4104896945380965, + "learning_rate": 4.485981308411215e-06, + "loss": 0.4735, + "step": 96 + }, + { + "epoch": 0.013604488078541374, + "grad_norm": 2.3108481616225283, + "learning_rate": 4.532710280373832e-06, + "loss": 0.4711, + "step": 97 + }, + { + "epoch": 0.013744740532959326, + "grad_norm": 2.4729119480498323, + "learning_rate": 4.579439252336449e-06, + "loss": 0.4596, + "step": 98 + }, + { + "epoch": 0.01388499298737728, + "grad_norm": 3.0604731801015026, + "learning_rate": 4.626168224299066e-06, + "loss": 0.5267, + "step": 99 + }, + { + "epoch": 0.014025245441795231, + "grad_norm": 2.2018374745801137, + "learning_rate": 4.6728971962616825e-06, + "loss": 0.4495, + "step": 100 + }, + { + "epoch": 0.014165497896213184, + "grad_norm": 2.6875785106457837, + "learning_rate": 4.719626168224299e-06, + "loss": 0.5106, + "step": 101 + }, + { + "epoch": 0.014305750350631136, + "grad_norm": 2.4805458357239525, + "learning_rate": 4.766355140186917e-06, + "loss": 0.4712, + "step": 102 + }, + { + "epoch": 0.014446002805049088, + "grad_norm": 2.6687734348241965, + "learning_rate": 4.813084112149533e-06, + "loss": 0.4764, + "step": 103 + }, + { + "epoch": 0.014586255259467041, + "grad_norm": 2.4712394041877164, + "learning_rate": 4.85981308411215e-06, + "loss": 0.522, + "step": 104 + }, + { + "epoch": 0.014726507713884993, + "grad_norm": 2.4421232345628563, + "learning_rate": 4.906542056074766e-06, + "loss": 0.4984, + "step": 105 + }, + { + "epoch": 0.014866760168302946, + "grad_norm": 2.34273476546865, + "learning_rate": 4.953271028037384e-06, + "loss": 0.4553, + "step": 106 + }, + { + "epoch": 0.015007012622720898, + "grad_norm": 2.0392392838236155, + "learning_rate": 5e-06, + "loss": 0.4636, + "step": 107 + }, + { + "epoch": 0.01514726507713885, + "grad_norm": 2.6694824035659046, + "learning_rate": 5.046728971962617e-06, + "loss": 0.4991, + "step": 108 + }, + { + "epoch": 0.015287517531556803, + "grad_norm": 2.283615349201864, + "learning_rate": 5.0934579439252344e-06, + "loss": 0.4207, + "step": 109 + }, + { + "epoch": 0.015427769985974754, + "grad_norm": 3.2743346958115493, + "learning_rate": 5.14018691588785e-06, + "loss": 0.4307, + "step": 110 + }, + { + "epoch": 0.015568022440392707, + "grad_norm": 2.4879573689086465, + "learning_rate": 5.186915887850468e-06, + "loss": 0.4511, + "step": 111 + }, + { + "epoch": 0.01570827489481066, + "grad_norm": 2.93407874028616, + "learning_rate": 5.233644859813084e-06, + "loss": 0.5184, + "step": 112 + }, + { + "epoch": 0.015848527349228612, + "grad_norm": 2.8606874549387165, + "learning_rate": 5.280373831775702e-06, + "loss": 0.5225, + "step": 113 + }, + { + "epoch": 0.015988779803646564, + "grad_norm": 2.1285504244996054, + "learning_rate": 5.3271028037383174e-06, + "loss": 0.47, + "step": 114 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 2.4979235608979047, + "learning_rate": 5.373831775700935e-06, + "loss": 0.4828, + "step": 115 + }, + { + "epoch": 0.016269284712482467, + "grad_norm": 2.6649907528142474, + "learning_rate": 5.4205607476635515e-06, + "loss": 0.4716, + "step": 116 + }, + { + "epoch": 0.016409537166900422, + "grad_norm": 3.420902479791717, + "learning_rate": 5.467289719626169e-06, + "loss": 0.5071, + "step": 117 + }, + { + "epoch": 0.016549789621318374, + "grad_norm": 2.219265837545501, + "learning_rate": 5.514018691588785e-06, + "loss": 0.4426, + "step": 118 + }, + { + "epoch": 0.016690042075736326, + "grad_norm": 2.3545280336731054, + "learning_rate": 5.560747663551402e-06, + "loss": 0.4866, + "step": 119 + }, + { + "epoch": 0.016830294530154277, + "grad_norm": 2.387420301455272, + "learning_rate": 5.607476635514019e-06, + "loss": 0.452, + "step": 120 + }, + { + "epoch": 0.01697054698457223, + "grad_norm": 2.6728180230586607, + "learning_rate": 5.654205607476636e-06, + "loss": 0.4309, + "step": 121 + }, + { + "epoch": 0.017110799438990184, + "grad_norm": 2.2308907184860374, + "learning_rate": 5.700934579439253e-06, + "loss": 0.5054, + "step": 122 + }, + { + "epoch": 0.017251051893408136, + "grad_norm": 2.810934012274622, + "learning_rate": 5.747663551401869e-06, + "loss": 0.5124, + "step": 123 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 2.7662963914320975, + "learning_rate": 5.794392523364486e-06, + "loss": 0.5333, + "step": 124 + }, + { + "epoch": 0.01753155680224404, + "grad_norm": 2.4800159912209088, + "learning_rate": 5.841121495327103e-06, + "loss": 0.4727, + "step": 125 + }, + { + "epoch": 0.01767180925666199, + "grad_norm": 2.2948587092370913, + "learning_rate": 5.88785046728972e-06, + "loss": 0.4578, + "step": 126 + }, + { + "epoch": 0.017812061711079945, + "grad_norm": 2.301623567494857, + "learning_rate": 5.9345794392523374e-06, + "loss": 0.4518, + "step": 127 + }, + { + "epoch": 0.017952314165497897, + "grad_norm": 2.3776599644899354, + "learning_rate": 5.981308411214953e-06, + "loss": 0.3798, + "step": 128 + }, + { + "epoch": 0.01809256661991585, + "grad_norm": 2.961128812462258, + "learning_rate": 6.028037383177571e-06, + "loss": 0.4212, + "step": 129 + }, + { + "epoch": 0.0182328190743338, + "grad_norm": 1.9936150914770603, + "learning_rate": 6.074766355140187e-06, + "loss": 0.4897, + "step": 130 + }, + { + "epoch": 0.018373071528751752, + "grad_norm": 2.3537941922148162, + "learning_rate": 6.121495327102805e-06, + "loss": 0.4071, + "step": 131 + }, + { + "epoch": 0.018513323983169707, + "grad_norm": 2.5750306951674746, + "learning_rate": 6.16822429906542e-06, + "loss": 0.4788, + "step": 132 + }, + { + "epoch": 0.01865357643758766, + "grad_norm": 2.4451015003565595, + "learning_rate": 6.214953271028038e-06, + "loss": 0.4962, + "step": 133 + }, + { + "epoch": 0.01879382889200561, + "grad_norm": 2.4963653846476466, + "learning_rate": 6.2616822429906544e-06, + "loss": 0.4788, + "step": 134 + }, + { + "epoch": 0.018934081346423562, + "grad_norm": 3.0003013459490875, + "learning_rate": 6.308411214953272e-06, + "loss": 0.4555, + "step": 135 + }, + { + "epoch": 0.019074333800841514, + "grad_norm": 3.182269423553265, + "learning_rate": 6.355140186915888e-06, + "loss": 0.5034, + "step": 136 + }, + { + "epoch": 0.01921458625525947, + "grad_norm": 2.2024493973945227, + "learning_rate": 6.401869158878505e-06, + "loss": 0.4764, + "step": 137 + }, + { + "epoch": 0.01935483870967742, + "grad_norm": 2.1137250339109706, + "learning_rate": 6.448598130841122e-06, + "loss": 0.486, + "step": 138 + }, + { + "epoch": 0.019495091164095372, + "grad_norm": 3.0070982245947406, + "learning_rate": 6.495327102803739e-06, + "loss": 0.4372, + "step": 139 + }, + { + "epoch": 0.019635343618513323, + "grad_norm": 3.397719033249549, + "learning_rate": 6.542056074766355e-06, + "loss": 0.4585, + "step": 140 + }, + { + "epoch": 0.019775596072931275, + "grad_norm": 2.1466440934520645, + "learning_rate": 6.588785046728972e-06, + "loss": 0.4137, + "step": 141 + }, + { + "epoch": 0.01991584852734923, + "grad_norm": 2.1297944901950254, + "learning_rate": 6.635514018691589e-06, + "loss": 0.447, + "step": 142 + }, + { + "epoch": 0.020056100981767182, + "grad_norm": 2.5290440495622977, + "learning_rate": 6.682242990654206e-06, + "loss": 0.4413, + "step": 143 + }, + { + "epoch": 0.020196353436185133, + "grad_norm": 2.9100510546213667, + "learning_rate": 6.728971962616823e-06, + "loss": 0.4258, + "step": 144 + }, + { + "epoch": 0.020336605890603085, + "grad_norm": 2.6962183218038125, + "learning_rate": 6.77570093457944e-06, + "loss": 0.5215, + "step": 145 + }, + { + "epoch": 0.020476858345021037, + "grad_norm": 2.9961710973352034, + "learning_rate": 6.822429906542056e-06, + "loss": 0.4584, + "step": 146 + }, + { + "epoch": 0.02061711079943899, + "grad_norm": 2.39908951337045, + "learning_rate": 6.869158878504674e-06, + "loss": 0.4675, + "step": 147 + }, + { + "epoch": 0.020757363253856943, + "grad_norm": 2.1663055363495816, + "learning_rate": 6.91588785046729e-06, + "loss": 0.4575, + "step": 148 + }, + { + "epoch": 0.020897615708274895, + "grad_norm": 2.5819320775087506, + "learning_rate": 6.962616822429908e-06, + "loss": 0.4347, + "step": 149 + }, + { + "epoch": 0.021037868162692847, + "grad_norm": 2.786407660383044, + "learning_rate": 7.009345794392523e-06, + "loss": 0.4351, + "step": 150 + }, + { + "epoch": 0.021178120617110798, + "grad_norm": 2.189282372355883, + "learning_rate": 7.056074766355141e-06, + "loss": 0.4187, + "step": 151 + }, + { + "epoch": 0.021318373071528753, + "grad_norm": 2.3930198292849205, + "learning_rate": 7.1028037383177574e-06, + "loss": 0.4296, + "step": 152 + }, + { + "epoch": 0.021458625525946705, + "grad_norm": 3.4065329057993052, + "learning_rate": 7.149532710280375e-06, + "loss": 0.437, + "step": 153 + }, + { + "epoch": 0.021598877980364656, + "grad_norm": 3.1904730301999273, + "learning_rate": 7.196261682242991e-06, + "loss": 0.4148, + "step": 154 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 2.7429837975324225, + "learning_rate": 7.242990654205608e-06, + "loss": 0.4268, + "step": 155 + }, + { + "epoch": 0.02187938288920056, + "grad_norm": 3.1382514752776776, + "learning_rate": 7.289719626168225e-06, + "loss": 0.4564, + "step": 156 + }, + { + "epoch": 0.022019635343618515, + "grad_norm": 2.5054584994223874, + "learning_rate": 7.336448598130842e-06, + "loss": 0.4777, + "step": 157 + }, + { + "epoch": 0.022159887798036466, + "grad_norm": 2.890473562861722, + "learning_rate": 7.383177570093458e-06, + "loss": 0.4147, + "step": 158 + }, + { + "epoch": 0.022300140252454418, + "grad_norm": 2.7279150813245576, + "learning_rate": 7.429906542056075e-06, + "loss": 0.4773, + "step": 159 + }, + { + "epoch": 0.02244039270687237, + "grad_norm": 2.67332784850334, + "learning_rate": 7.476635514018692e-06, + "loss": 0.454, + "step": 160 + }, + { + "epoch": 0.02258064516129032, + "grad_norm": 3.4481133042856777, + "learning_rate": 7.523364485981309e-06, + "loss": 0.4233, + "step": 161 + }, + { + "epoch": 0.022720897615708276, + "grad_norm": 6.936568463993134, + "learning_rate": 7.570093457943926e-06, + "loss": 0.4393, + "step": 162 + }, + { + "epoch": 0.022861150070126228, + "grad_norm": 3.033072012519631, + "learning_rate": 7.616822429906543e-06, + "loss": 0.4479, + "step": 163 + }, + { + "epoch": 0.02300140252454418, + "grad_norm": 2.4739299464688567, + "learning_rate": 7.663551401869159e-06, + "loss": 0.4697, + "step": 164 + }, + { + "epoch": 0.02314165497896213, + "grad_norm": 2.509415444016097, + "learning_rate": 7.710280373831777e-06, + "loss": 0.4766, + "step": 165 + }, + { + "epoch": 0.023281907433380083, + "grad_norm": 3.1794433500015598, + "learning_rate": 7.757009345794392e-06, + "loss": 0.4068, + "step": 166 + }, + { + "epoch": 0.023422159887798038, + "grad_norm": 2.8367639603685224, + "learning_rate": 7.80373831775701e-06, + "loss": 0.4944, + "step": 167 + }, + { + "epoch": 0.02356241234221599, + "grad_norm": 2.857527506004238, + "learning_rate": 7.850467289719627e-06, + "loss": 0.4659, + "step": 168 + }, + { + "epoch": 0.02370266479663394, + "grad_norm": 2.5049948386559833, + "learning_rate": 7.897196261682244e-06, + "loss": 0.4369, + "step": 169 + }, + { + "epoch": 0.023842917251051893, + "grad_norm": 3.3132809851928977, + "learning_rate": 7.94392523364486e-06, + "loss": 0.4741, + "step": 170 + }, + { + "epoch": 0.023983169705469844, + "grad_norm": 2.4062973612298255, + "learning_rate": 7.990654205607477e-06, + "loss": 0.4718, + "step": 171 + }, + { + "epoch": 0.0241234221598878, + "grad_norm": 3.3146345350785285, + "learning_rate": 8.037383177570094e-06, + "loss": 0.4426, + "step": 172 + }, + { + "epoch": 0.02426367461430575, + "grad_norm": 4.009771570943594, + "learning_rate": 8.084112149532712e-06, + "loss": 0.5129, + "step": 173 + }, + { + "epoch": 0.024403927068723703, + "grad_norm": 3.229490268038816, + "learning_rate": 8.130841121495327e-06, + "loss": 0.397, + "step": 174 + }, + { + "epoch": 0.024544179523141654, + "grad_norm": 2.7474466087145015, + "learning_rate": 8.177570093457945e-06, + "loss": 0.4625, + "step": 175 + }, + { + "epoch": 0.024684431977559606, + "grad_norm": 2.818404040706357, + "learning_rate": 8.224299065420562e-06, + "loss": 0.4419, + "step": 176 + }, + { + "epoch": 0.02482468443197756, + "grad_norm": 2.2949543922230857, + "learning_rate": 8.271028037383178e-06, + "loss": 0.4383, + "step": 177 + }, + { + "epoch": 0.024964936886395513, + "grad_norm": 3.3778996197436735, + "learning_rate": 8.317757009345795e-06, + "loss": 0.4454, + "step": 178 + }, + { + "epoch": 0.025105189340813464, + "grad_norm": 2.5405408106294285, + "learning_rate": 8.364485981308411e-06, + "loss": 0.4624, + "step": 179 + }, + { + "epoch": 0.025245441795231416, + "grad_norm": 2.5942044147310668, + "learning_rate": 8.411214953271028e-06, + "loss": 0.4706, + "step": 180 + }, + { + "epoch": 0.025385694249649367, + "grad_norm": 2.385842619822456, + "learning_rate": 8.457943925233646e-06, + "loss": 0.4364, + "step": 181 + }, + { + "epoch": 0.025525946704067323, + "grad_norm": 2.4913725761274312, + "learning_rate": 8.504672897196263e-06, + "loss": 0.475, + "step": 182 + }, + { + "epoch": 0.025666199158485274, + "grad_norm": 3.344250365280128, + "learning_rate": 8.55140186915888e-06, + "loss": 0.4733, + "step": 183 + }, + { + "epoch": 0.025806451612903226, + "grad_norm": 2.4657216873840873, + "learning_rate": 8.598130841121496e-06, + "loss": 0.4213, + "step": 184 + }, + { + "epoch": 0.025946704067321177, + "grad_norm": 2.723476291890163, + "learning_rate": 8.644859813084113e-06, + "loss": 0.4721, + "step": 185 + }, + { + "epoch": 0.02608695652173913, + "grad_norm": 2.3472765014010473, + "learning_rate": 8.69158878504673e-06, + "loss": 0.4296, + "step": 186 + }, + { + "epoch": 0.026227208976157084, + "grad_norm": 2.4548499973254776, + "learning_rate": 8.738317757009348e-06, + "loss": 0.4524, + "step": 187 + }, + { + "epoch": 0.026367461430575036, + "grad_norm": 3.093154079187258, + "learning_rate": 8.785046728971963e-06, + "loss": 0.4314, + "step": 188 + }, + { + "epoch": 0.026507713884992987, + "grad_norm": 2.4236905091939214, + "learning_rate": 8.83177570093458e-06, + "loss": 0.4645, + "step": 189 + }, + { + "epoch": 0.02664796633941094, + "grad_norm": 2.223185658886129, + "learning_rate": 8.878504672897197e-06, + "loss": 0.4534, + "step": 190 + }, + { + "epoch": 0.02678821879382889, + "grad_norm": 2.4879994375038126, + "learning_rate": 8.925233644859814e-06, + "loss": 0.4601, + "step": 191 + }, + { + "epoch": 0.026928471248246846, + "grad_norm": 2.4159928292237534, + "learning_rate": 8.97196261682243e-06, + "loss": 0.4215, + "step": 192 + }, + { + "epoch": 0.027068723702664797, + "grad_norm": 2.639071938930907, + "learning_rate": 9.018691588785047e-06, + "loss": 0.403, + "step": 193 + }, + { + "epoch": 0.02720897615708275, + "grad_norm": 2.3513794887970962, + "learning_rate": 9.065420560747664e-06, + "loss": 0.4547, + "step": 194 + }, + { + "epoch": 0.0273492286115007, + "grad_norm": 2.648785413547633, + "learning_rate": 9.112149532710282e-06, + "loss": 0.4573, + "step": 195 + }, + { + "epoch": 0.027489481065918652, + "grad_norm": 1.9824272268439151, + "learning_rate": 9.158878504672899e-06, + "loss": 0.4611, + "step": 196 + }, + { + "epoch": 0.027629733520336607, + "grad_norm": 2.106337783750016, + "learning_rate": 9.205607476635515e-06, + "loss": 0.453, + "step": 197 + }, + { + "epoch": 0.02776998597475456, + "grad_norm": 2.754139472338549, + "learning_rate": 9.252336448598132e-06, + "loss": 0.4613, + "step": 198 + }, + { + "epoch": 0.02791023842917251, + "grad_norm": 2.1091617052534097, + "learning_rate": 9.299065420560748e-06, + "loss": 0.4403, + "step": 199 + }, + { + "epoch": 0.028050490883590462, + "grad_norm": 2.6631317750226433, + "learning_rate": 9.345794392523365e-06, + "loss": 0.5109, + "step": 200 + }, + { + "epoch": 0.028190743338008414, + "grad_norm": 2.7530027814879467, + "learning_rate": 9.392523364485983e-06, + "loss": 0.4506, + "step": 201 + }, + { + "epoch": 0.02833099579242637, + "grad_norm": 2.977965016986435, + "learning_rate": 9.439252336448598e-06, + "loss": 0.4316, + "step": 202 + }, + { + "epoch": 0.02847124824684432, + "grad_norm": 2.581687177162465, + "learning_rate": 9.485981308411217e-06, + "loss": 0.4669, + "step": 203 + }, + { + "epoch": 0.028611500701262272, + "grad_norm": 3.023653559887397, + "learning_rate": 9.532710280373833e-06, + "loss": 0.4842, + "step": 204 + }, + { + "epoch": 0.028751753155680224, + "grad_norm": 2.251101647995522, + "learning_rate": 9.57943925233645e-06, + "loss": 0.4696, + "step": 205 + }, + { + "epoch": 0.028892005610098175, + "grad_norm": 4.548572551120584, + "learning_rate": 9.626168224299066e-06, + "loss": 0.4198, + "step": 206 + }, + { + "epoch": 0.02903225806451613, + "grad_norm": 2.4447480058990565, + "learning_rate": 9.672897196261683e-06, + "loss": 0.5004, + "step": 207 + }, + { + "epoch": 0.029172510518934082, + "grad_norm": 2.4963083667454917, + "learning_rate": 9.7196261682243e-06, + "loss": 0.4501, + "step": 208 + }, + { + "epoch": 0.029312762973352034, + "grad_norm": 3.389083441550459, + "learning_rate": 9.766355140186918e-06, + "loss": 0.4711, + "step": 209 + }, + { + "epoch": 0.029453015427769985, + "grad_norm": 2.44389999592741, + "learning_rate": 9.813084112149533e-06, + "loss": 0.4404, + "step": 210 + }, + { + "epoch": 0.029593267882187937, + "grad_norm": 2.7659040407263054, + "learning_rate": 9.859813084112151e-06, + "loss": 0.4667, + "step": 211 + }, + { + "epoch": 0.029733520336605892, + "grad_norm": 2.3237430374180734, + "learning_rate": 9.906542056074768e-06, + "loss": 0.4111, + "step": 212 + }, + { + "epoch": 0.029873772791023843, + "grad_norm": 3.530638110054946, + "learning_rate": 9.953271028037384e-06, + "loss": 0.4329, + "step": 213 + }, + { + "epoch": 0.030014025245441795, + "grad_norm": 4.789895413079215, + "learning_rate": 1e-05, + "loss": 0.4408, + "step": 214 + }, + { + "epoch": 0.030154277699859747, + "grad_norm": 4.241107838409851, + "learning_rate": 9.999999484142467e-06, + "loss": 0.4598, + "step": 215 + }, + { + "epoch": 0.0302945301542777, + "grad_norm": 2.7110071530587203, + "learning_rate": 9.999997936569974e-06, + "loss": 0.4757, + "step": 216 + }, + { + "epoch": 0.030434782608695653, + "grad_norm": 3.1539139364591593, + "learning_rate": 9.999995357282836e-06, + "loss": 0.4891, + "step": 217 + }, + { + "epoch": 0.030575035063113605, + "grad_norm": 3.171695046417335, + "learning_rate": 9.999991746281591e-06, + "loss": 0.4617, + "step": 218 + }, + { + "epoch": 0.030715287517531557, + "grad_norm": 7.457190086787282, + "learning_rate": 9.999987103566983e-06, + "loss": 0.4733, + "step": 219 + }, + { + "epoch": 0.030855539971949508, + "grad_norm": 3.8577441271643957, + "learning_rate": 9.999981429139967e-06, + "loss": 0.533, + "step": 220 + }, + { + "epoch": 0.03099579242636746, + "grad_norm": 4.5759186619685615, + "learning_rate": 9.999974723001716e-06, + "loss": 0.4259, + "step": 221 + }, + { + "epoch": 0.031136044880785415, + "grad_norm": 4.491693459586005, + "learning_rate": 9.999966985153615e-06, + "loss": 0.4972, + "step": 222 + }, + { + "epoch": 0.03127629733520337, + "grad_norm": 5.173661675373215, + "learning_rate": 9.999958215597257e-06, + "loss": 0.4109, + "step": 223 + }, + { + "epoch": 0.03141654978962132, + "grad_norm": 3.916440199669267, + "learning_rate": 9.999948414334455e-06, + "loss": 0.436, + "step": 224 + }, + { + "epoch": 0.03155680224403927, + "grad_norm": 6.064143950955498, + "learning_rate": 9.99993758136723e-06, + "loss": 0.4106, + "step": 225 + }, + { + "epoch": 0.031697054698457225, + "grad_norm": 6.448893418126817, + "learning_rate": 9.999925716697817e-06, + "loss": 0.4182, + "step": 226 + }, + { + "epoch": 0.03183730715287517, + "grad_norm": 4.138705287220603, + "learning_rate": 9.999912820328665e-06, + "loss": 0.4654, + "step": 227 + }, + { + "epoch": 0.03197755960729313, + "grad_norm": 4.1848981375016345, + "learning_rate": 9.999898892262433e-06, + "loss": 0.5169, + "step": 228 + }, + { + "epoch": 0.03211781206171108, + "grad_norm": 5.729900558168016, + "learning_rate": 9.999883932502e-06, + "loss": 0.4591, + "step": 229 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 5.312650431578789, + "learning_rate": 9.999867941050447e-06, + "loss": 0.4888, + "step": 230 + }, + { + "epoch": 0.032398316970546986, + "grad_norm": 3.2773287639798165, + "learning_rate": 9.999850917911077e-06, + "loss": 0.4727, + "step": 231 + }, + { + "epoch": 0.032538569424964935, + "grad_norm": 5.090892153872765, + "learning_rate": 9.999832863087403e-06, + "loss": 0.496, + "step": 232 + }, + { + "epoch": 0.03267882187938289, + "grad_norm": 2.6749013971712348, + "learning_rate": 9.999813776583148e-06, + "loss": 0.4557, + "step": 233 + }, + { + "epoch": 0.032819074333800845, + "grad_norm": 3.4653662596172223, + "learning_rate": 9.99979365840225e-06, + "loss": 0.4448, + "step": 234 + }, + { + "epoch": 0.03295932678821879, + "grad_norm": 3.0513598177977794, + "learning_rate": 9.999772508548863e-06, + "loss": 0.4645, + "step": 235 + }, + { + "epoch": 0.03309957924263675, + "grad_norm": 3.938342170872232, + "learning_rate": 9.999750327027351e-06, + "loss": 0.4441, + "step": 236 + }, + { + "epoch": 0.033239831697054696, + "grad_norm": 3.0984241356419577, + "learning_rate": 9.999727113842291e-06, + "loss": 0.4604, + "step": 237 + }, + { + "epoch": 0.03338008415147265, + "grad_norm": 3.1530727597645347, + "learning_rate": 9.999702868998469e-06, + "loss": 0.4454, + "step": 238 + }, + { + "epoch": 0.033520336605890606, + "grad_norm": 2.4716036757375632, + "learning_rate": 9.999677592500892e-06, + "loss": 0.4657, + "step": 239 + }, + { + "epoch": 0.033660589060308554, + "grad_norm": 3.000683070937668, + "learning_rate": 9.999651284354774e-06, + "loss": 0.459, + "step": 240 + }, + { + "epoch": 0.03380084151472651, + "grad_norm": 2.5906398464463103, + "learning_rate": 9.999623944565545e-06, + "loss": 0.4551, + "step": 241 + }, + { + "epoch": 0.03394109396914446, + "grad_norm": 3.4031914094132296, + "learning_rate": 9.999595573138845e-06, + "loss": 0.4755, + "step": 242 + }, + { + "epoch": 0.03408134642356241, + "grad_norm": 2.8565402962306092, + "learning_rate": 9.999566170080528e-06, + "loss": 0.4243, + "step": 243 + }, + { + "epoch": 0.03422159887798037, + "grad_norm": 3.2899238382817217, + "learning_rate": 9.999535735396662e-06, + "loss": 0.4628, + "step": 244 + }, + { + "epoch": 0.034361851332398316, + "grad_norm": 8.290443010629769, + "learning_rate": 9.999504269093525e-06, + "loss": 0.4826, + "step": 245 + }, + { + "epoch": 0.03450210378681627, + "grad_norm": 2.719656080774345, + "learning_rate": 9.999471771177612e-06, + "loss": 0.4288, + "step": 246 + }, + { + "epoch": 0.03464235624123422, + "grad_norm": 2.7756213549302475, + "learning_rate": 9.999438241655629e-06, + "loss": 0.4144, + "step": 247 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 2.4149045987764, + "learning_rate": 9.999403680534492e-06, + "loss": 0.4561, + "step": 248 + }, + { + "epoch": 0.03492286115007013, + "grad_norm": 2.7172461548891307, + "learning_rate": 9.999368087821337e-06, + "loss": 0.4623, + "step": 249 + }, + { + "epoch": 0.03506311360448808, + "grad_norm": 2.4256778516862316, + "learning_rate": 9.999331463523502e-06, + "loss": 0.4749, + "step": 250 + }, + { + "epoch": 0.03520336605890603, + "grad_norm": 3.724395569784377, + "learning_rate": 9.99929380764855e-06, + "loss": 0.4399, + "step": 251 + }, + { + "epoch": 0.03534361851332398, + "grad_norm": 3.568258617129064, + "learning_rate": 9.999255120204248e-06, + "loss": 0.4683, + "step": 252 + }, + { + "epoch": 0.035483870967741936, + "grad_norm": 2.6187052866320912, + "learning_rate": 9.999215401198579e-06, + "loss": 0.4298, + "step": 253 + }, + { + "epoch": 0.03562412342215989, + "grad_norm": 2.3684910373498034, + "learning_rate": 9.99917465063974e-06, + "loss": 0.44, + "step": 254 + }, + { + "epoch": 0.03576437587657784, + "grad_norm": 2.412211971186012, + "learning_rate": 9.999132868536139e-06, + "loss": 0.4445, + "step": 255 + }, + { + "epoch": 0.035904628330995794, + "grad_norm": 3.6754283837004547, + "learning_rate": 9.999090054896397e-06, + "loss": 0.4472, + "step": 256 + }, + { + "epoch": 0.03604488078541374, + "grad_norm": 2.1935283972817397, + "learning_rate": 9.999046209729347e-06, + "loss": 0.4419, + "step": 257 + }, + { + "epoch": 0.0361851332398317, + "grad_norm": 3.219258240508144, + "learning_rate": 9.999001333044039e-06, + "loss": 0.4845, + "step": 258 + }, + { + "epoch": 0.03632538569424965, + "grad_norm": 3.194816868030383, + "learning_rate": 9.998955424849733e-06, + "loss": 0.462, + "step": 259 + }, + { + "epoch": 0.0364656381486676, + "grad_norm": 3.0603513381318126, + "learning_rate": 9.998908485155898e-06, + "loss": 0.4309, + "step": 260 + }, + { + "epoch": 0.036605890603085556, + "grad_norm": 3.412580434668683, + "learning_rate": 9.998860513972224e-06, + "loss": 0.4182, + "step": 261 + }, + { + "epoch": 0.036746143057503504, + "grad_norm": 2.1959985495946435, + "learning_rate": 9.998811511308608e-06, + "loss": 0.4187, + "step": 262 + }, + { + "epoch": 0.03688639551192146, + "grad_norm": 3.1898764932574313, + "learning_rate": 9.99876147717516e-06, + "loss": 0.4299, + "step": 263 + }, + { + "epoch": 0.037026647966339414, + "grad_norm": 2.5861625458214, + "learning_rate": 9.998710411582205e-06, + "loss": 0.4583, + "step": 264 + }, + { + "epoch": 0.03716690042075736, + "grad_norm": 2.761361925957318, + "learning_rate": 9.998658314540282e-06, + "loss": 0.4413, + "step": 265 + }, + { + "epoch": 0.03730715287517532, + "grad_norm": 2.5023223580676617, + "learning_rate": 9.998605186060138e-06, + "loss": 0.4774, + "step": 266 + }, + { + "epoch": 0.037447405329593265, + "grad_norm": 2.594543427596996, + "learning_rate": 9.998551026152736e-06, + "loss": 0.5158, + "step": 267 + }, + { + "epoch": 0.03758765778401122, + "grad_norm": 2.1353457431335343, + "learning_rate": 9.998495834829255e-06, + "loss": 0.4678, + "step": 268 + }, + { + "epoch": 0.037727910238429176, + "grad_norm": 1.9760431504499556, + "learning_rate": 9.998439612101079e-06, + "loss": 0.4884, + "step": 269 + }, + { + "epoch": 0.037868162692847124, + "grad_norm": 2.299898609746157, + "learning_rate": 9.99838235797981e-06, + "loss": 0.4694, + "step": 270 + }, + { + "epoch": 0.03800841514726508, + "grad_norm": 2.877641210993647, + "learning_rate": 9.998324072477266e-06, + "loss": 0.466, + "step": 271 + }, + { + "epoch": 0.03814866760168303, + "grad_norm": 2.4696916160660214, + "learning_rate": 9.998264755605467e-06, + "loss": 0.4104, + "step": 272 + }, + { + "epoch": 0.03828892005610098, + "grad_norm": 3.0000419613764624, + "learning_rate": 9.99820440737666e-06, + "loss": 0.5101, + "step": 273 + }, + { + "epoch": 0.03842917251051894, + "grad_norm": 2.571668922075563, + "learning_rate": 9.998143027803292e-06, + "loss": 0.3905, + "step": 274 + }, + { + "epoch": 0.038569424964936885, + "grad_norm": 2.7421483746207205, + "learning_rate": 9.998080616898028e-06, + "loss": 0.48, + "step": 275 + }, + { + "epoch": 0.03870967741935484, + "grad_norm": 2.3289613181702276, + "learning_rate": 9.998017174673752e-06, + "loss": 0.4387, + "step": 276 + }, + { + "epoch": 0.03884992987377279, + "grad_norm": 2.272201883970651, + "learning_rate": 9.997952701143547e-06, + "loss": 0.4177, + "step": 277 + }, + { + "epoch": 0.038990182328190744, + "grad_norm": 2.290366676825513, + "learning_rate": 9.997887196320723e-06, + "loss": 0.4676, + "step": 278 + }, + { + "epoch": 0.0391304347826087, + "grad_norm": 2.7377494698340126, + "learning_rate": 9.997820660218793e-06, + "loss": 0.4464, + "step": 279 + }, + { + "epoch": 0.03927068723702665, + "grad_norm": 5.596344756243784, + "learning_rate": 9.997753092851488e-06, + "loss": 0.4646, + "step": 280 + }, + { + "epoch": 0.0394109396914446, + "grad_norm": 2.338321145810563, + "learning_rate": 9.99768449423275e-06, + "loss": 0.4701, + "step": 281 + }, + { + "epoch": 0.03955119214586255, + "grad_norm": 2.1076395313053173, + "learning_rate": 9.997614864376732e-06, + "loss": 0.4388, + "step": 282 + }, + { + "epoch": 0.039691444600280505, + "grad_norm": 2.999200078494238, + "learning_rate": 9.997544203297801e-06, + "loss": 0.4057, + "step": 283 + }, + { + "epoch": 0.03983169705469846, + "grad_norm": 2.8111029652044843, + "learning_rate": 9.997472511010543e-06, + "loss": 0.4448, + "step": 284 + }, + { + "epoch": 0.03997194950911641, + "grad_norm": 2.41178396400633, + "learning_rate": 9.997399787529744e-06, + "loss": 0.4118, + "step": 285 + }, + { + "epoch": 0.040112201963534364, + "grad_norm": 2.775796923605218, + "learning_rate": 9.997326032870417e-06, + "loss": 0.424, + "step": 286 + }, + { + "epoch": 0.04025245441795231, + "grad_norm": 2.27484328600029, + "learning_rate": 9.997251247047775e-06, + "loss": 0.4841, + "step": 287 + }, + { + "epoch": 0.04039270687237027, + "grad_norm": 3.7872507699478013, + "learning_rate": 9.997175430077253e-06, + "loss": 0.4039, + "step": 288 + }, + { + "epoch": 0.04053295932678822, + "grad_norm": 2.4579956878860183, + "learning_rate": 9.997098581974492e-06, + "loss": 0.4481, + "step": 289 + }, + { + "epoch": 0.04067321178120617, + "grad_norm": 7.967051844948522, + "learning_rate": 9.997020702755353e-06, + "loss": 0.4396, + "step": 290 + }, + { + "epoch": 0.040813464235624125, + "grad_norm": 2.7912498466604405, + "learning_rate": 9.996941792435903e-06, + "loss": 0.4852, + "step": 291 + }, + { + "epoch": 0.04095371669004207, + "grad_norm": 2.4139024996279286, + "learning_rate": 9.996861851032426e-06, + "loss": 0.4419, + "step": 292 + }, + { + "epoch": 0.04109396914446003, + "grad_norm": 3.2394699289439663, + "learning_rate": 9.996780878561417e-06, + "loss": 0.4403, + "step": 293 + }, + { + "epoch": 0.04123422159887798, + "grad_norm": 2.7592036815093386, + "learning_rate": 9.996698875039583e-06, + "loss": 0.4903, + "step": 294 + }, + { + "epoch": 0.04137447405329593, + "grad_norm": 2.3936474212898418, + "learning_rate": 9.996615840483847e-06, + "loss": 0.4097, + "step": 295 + }, + { + "epoch": 0.04151472650771389, + "grad_norm": 3.5436822578642895, + "learning_rate": 9.99653177491134e-06, + "loss": 0.4876, + "step": 296 + }, + { + "epoch": 0.041654978962131835, + "grad_norm": 3.959402535575374, + "learning_rate": 9.996446678339413e-06, + "loss": 0.425, + "step": 297 + }, + { + "epoch": 0.04179523141654979, + "grad_norm": 2.6463237418097703, + "learning_rate": 9.996360550785619e-06, + "loss": 0.4225, + "step": 298 + }, + { + "epoch": 0.041935483870967745, + "grad_norm": 4.9228909851687215, + "learning_rate": 9.996273392267733e-06, + "loss": 0.4407, + "step": 299 + }, + { + "epoch": 0.04207573632538569, + "grad_norm": 2.655592174978057, + "learning_rate": 9.99618520280374e-06, + "loss": 0.4416, + "step": 300 + }, + { + "epoch": 0.04221598877980365, + "grad_norm": 2.1464593832072727, + "learning_rate": 9.996095982411835e-06, + "loss": 0.4552, + "step": 301 + }, + { + "epoch": 0.042356241234221596, + "grad_norm": 2.299876840030588, + "learning_rate": 9.996005731110431e-06, + "loss": 0.4766, + "step": 302 + }, + { + "epoch": 0.04249649368863955, + "grad_norm": 2.182841326578513, + "learning_rate": 9.99591444891815e-06, + "loss": 0.427, + "step": 303 + }, + { + "epoch": 0.042636746143057506, + "grad_norm": 1.9763327001342343, + "learning_rate": 9.995822135853824e-06, + "loss": 0.437, + "step": 304 + }, + { + "epoch": 0.042776998597475455, + "grad_norm": 2.4374035547200656, + "learning_rate": 9.995728791936505e-06, + "loss": 0.4357, + "step": 305 + }, + { + "epoch": 0.04291725105189341, + "grad_norm": 3.335041577996492, + "learning_rate": 9.995634417185454e-06, + "loss": 0.4545, + "step": 306 + }, + { + "epoch": 0.04305750350631136, + "grad_norm": 2.791786893466748, + "learning_rate": 9.995539011620143e-06, + "loss": 0.4411, + "step": 307 + }, + { + "epoch": 0.04319775596072931, + "grad_norm": 2.2639917208584217, + "learning_rate": 9.995442575260257e-06, + "loss": 0.4121, + "step": 308 + }, + { + "epoch": 0.04333800841514727, + "grad_norm": 2.828405292594402, + "learning_rate": 9.995345108125698e-06, + "loss": 0.4585, + "step": 309 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 2.2326249958856543, + "learning_rate": 9.995246610236575e-06, + "loss": 0.4493, + "step": 310 + }, + { + "epoch": 0.04361851332398317, + "grad_norm": 2.260294464136413, + "learning_rate": 9.995147081613214e-06, + "loss": 0.4859, + "step": 311 + }, + { + "epoch": 0.04375876577840112, + "grad_norm": 2.1496741602412075, + "learning_rate": 9.995046522276152e-06, + "loss": 0.4799, + "step": 312 + }, + { + "epoch": 0.043899018232819075, + "grad_norm": 2.4892454569806644, + "learning_rate": 9.994944932246137e-06, + "loss": 0.4069, + "step": 313 + }, + { + "epoch": 0.04403927068723703, + "grad_norm": 3.165611509574923, + "learning_rate": 9.994842311544135e-06, + "loss": 0.471, + "step": 314 + }, + { + "epoch": 0.04417952314165498, + "grad_norm": 2.415118927643495, + "learning_rate": 9.994738660191316e-06, + "loss": 0.4696, + "step": 315 + }, + { + "epoch": 0.04431977559607293, + "grad_norm": 2.2355811858156267, + "learning_rate": 9.994633978209073e-06, + "loss": 0.4269, + "step": 316 + }, + { + "epoch": 0.04446002805049088, + "grad_norm": 2.7164345012135938, + "learning_rate": 9.994528265619004e-06, + "loss": 0.4284, + "step": 317 + }, + { + "epoch": 0.044600280504908836, + "grad_norm": 2.140912001923874, + "learning_rate": 9.99442152244292e-06, + "loss": 0.395, + "step": 318 + }, + { + "epoch": 0.04474053295932679, + "grad_norm": 2.5090098152135902, + "learning_rate": 9.994313748702848e-06, + "loss": 0.4667, + "step": 319 + }, + { + "epoch": 0.04488078541374474, + "grad_norm": 2.202827379401287, + "learning_rate": 9.994204944421029e-06, + "loss": 0.5083, + "step": 320 + }, + { + "epoch": 0.045021037868162694, + "grad_norm": 2.282237218163507, + "learning_rate": 9.994095109619912e-06, + "loss": 0.4951, + "step": 321 + }, + { + "epoch": 0.04516129032258064, + "grad_norm": 3.9044745721114813, + "learning_rate": 9.993984244322158e-06, + "loss": 0.4194, + "step": 322 + }, + { + "epoch": 0.0453015427769986, + "grad_norm": 2.0614550862440146, + "learning_rate": 9.993872348550649e-06, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.04544179523141655, + "grad_norm": 2.3684085932063708, + "learning_rate": 9.993759422328471e-06, + "loss": 0.3941, + "step": 324 + }, + { + "epoch": 0.0455820476858345, + "grad_norm": 2.243267902234387, + "learning_rate": 9.993645465678923e-06, + "loss": 0.459, + "step": 325 + }, + { + "epoch": 0.045722300140252456, + "grad_norm": 2.33580399453878, + "learning_rate": 9.993530478625524e-06, + "loss": 0.4344, + "step": 326 + }, + { + "epoch": 0.045862552594670404, + "grad_norm": 1.9560050786660381, + "learning_rate": 9.993414461191996e-06, + "loss": 0.3959, + "step": 327 + }, + { + "epoch": 0.04600280504908836, + "grad_norm": 2.192379767536861, + "learning_rate": 9.993297413402282e-06, + "loss": 0.4596, + "step": 328 + }, + { + "epoch": 0.046143057503506314, + "grad_norm": 2.3848361068170227, + "learning_rate": 9.993179335280532e-06, + "loss": 0.4794, + "step": 329 + }, + { + "epoch": 0.04628330995792426, + "grad_norm": 2.2372163432947922, + "learning_rate": 9.993060226851112e-06, + "loss": 0.4718, + "step": 330 + }, + { + "epoch": 0.04642356241234222, + "grad_norm": 2.0827652601764326, + "learning_rate": 9.992940088138598e-06, + "loss": 0.4325, + "step": 331 + }, + { + "epoch": 0.046563814866760166, + "grad_norm": 2.0082875443088875, + "learning_rate": 9.992818919167779e-06, + "loss": 0.4598, + "step": 332 + }, + { + "epoch": 0.04670406732117812, + "grad_norm": 2.675181434396497, + "learning_rate": 9.992696719963662e-06, + "loss": 0.4272, + "step": 333 + }, + { + "epoch": 0.046844319775596076, + "grad_norm": 2.6326097359386624, + "learning_rate": 9.992573490551457e-06, + "loss": 0.4285, + "step": 334 + }, + { + "epoch": 0.046984572230014024, + "grad_norm": 2.264642436622871, + "learning_rate": 9.992449230956591e-06, + "loss": 0.449, + "step": 335 + }, + { + "epoch": 0.04712482468443198, + "grad_norm": 2.739256158784037, + "learning_rate": 9.992323941204709e-06, + "loss": 0.444, + "step": 336 + }, + { + "epoch": 0.04726507713884993, + "grad_norm": 2.200321442789467, + "learning_rate": 9.99219762132166e-06, + "loss": 0.4697, + "step": 337 + }, + { + "epoch": 0.04740532959326788, + "grad_norm": 2.7014323320684666, + "learning_rate": 9.992070271333508e-06, + "loss": 0.4886, + "step": 338 + }, + { + "epoch": 0.04754558204768584, + "grad_norm": 1.9649803766333644, + "learning_rate": 9.991941891266535e-06, + "loss": 0.4282, + "step": 339 + }, + { + "epoch": 0.047685834502103785, + "grad_norm": 2.1970225559998844, + "learning_rate": 9.99181248114723e-06, + "loss": 0.4428, + "step": 340 + }, + { + "epoch": 0.04782608695652174, + "grad_norm": 2.280902679340636, + "learning_rate": 9.991682041002294e-06, + "loss": 0.4978, + "step": 341 + }, + { + "epoch": 0.04796633941093969, + "grad_norm": 1.8243627062569885, + "learning_rate": 9.991550570858642e-06, + "loss": 0.4087, + "step": 342 + }, + { + "epoch": 0.048106591865357644, + "grad_norm": 2.146800468294943, + "learning_rate": 9.991418070743406e-06, + "loss": 0.3955, + "step": 343 + }, + { + "epoch": 0.0482468443197756, + "grad_norm": 1.9664065021913326, + "learning_rate": 9.991284540683922e-06, + "loss": 0.4296, + "step": 344 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 2.204615528383741, + "learning_rate": 9.991149980707746e-06, + "loss": 0.4565, + "step": 345 + }, + { + "epoch": 0.0485273492286115, + "grad_norm": 1.9048901069162731, + "learning_rate": 9.991014390842642e-06, + "loss": 0.481, + "step": 346 + }, + { + "epoch": 0.04866760168302945, + "grad_norm": 2.147333700826464, + "learning_rate": 9.990877771116588e-06, + "loss": 0.4565, + "step": 347 + }, + { + "epoch": 0.048807854137447405, + "grad_norm": 3.100564099332543, + "learning_rate": 9.990740121557776e-06, + "loss": 0.401, + "step": 348 + }, + { + "epoch": 0.04894810659186536, + "grad_norm": 2.05657489378133, + "learning_rate": 9.990601442194607e-06, + "loss": 0.4256, + "step": 349 + }, + { + "epoch": 0.04908835904628331, + "grad_norm": 2.9890188466228027, + "learning_rate": 9.990461733055698e-06, + "loss": 0.4117, + "step": 350 + }, + { + "epoch": 0.049228611500701264, + "grad_norm": 2.9191849509046, + "learning_rate": 9.990320994169876e-06, + "loss": 0.4441, + "step": 351 + }, + { + "epoch": 0.04936886395511921, + "grad_norm": 2.4351511666688803, + "learning_rate": 9.990179225566184e-06, + "loss": 0.4408, + "step": 352 + }, + { + "epoch": 0.04950911640953717, + "grad_norm": 2.3557971832916627, + "learning_rate": 9.99003642727387e-06, + "loss": 0.4232, + "step": 353 + }, + { + "epoch": 0.04964936886395512, + "grad_norm": 2.081023639821482, + "learning_rate": 9.989892599322404e-06, + "loss": 0.4079, + "step": 354 + }, + { + "epoch": 0.04978962131837307, + "grad_norm": 2.04506142913407, + "learning_rate": 9.989747741741462e-06, + "loss": 0.4199, + "step": 355 + }, + { + "epoch": 0.049929873772791025, + "grad_norm": 2.338372839686613, + "learning_rate": 9.989601854560935e-06, + "loss": 0.4641, + "step": 356 + }, + { + "epoch": 0.05007012622720897, + "grad_norm": 3.097644243060746, + "learning_rate": 9.989454937810926e-06, + "loss": 0.4067, + "step": 357 + }, + { + "epoch": 0.05021037868162693, + "grad_norm": 2.418508047377956, + "learning_rate": 9.989306991521748e-06, + "loss": 0.4857, + "step": 358 + }, + { + "epoch": 0.050350631136044884, + "grad_norm": 2.4960379676376463, + "learning_rate": 9.989158015723933e-06, + "loss": 0.3945, + "step": 359 + }, + { + "epoch": 0.05049088359046283, + "grad_norm": 3.0467826633644743, + "learning_rate": 9.989008010448216e-06, + "loss": 0.4802, + "step": 360 + }, + { + "epoch": 0.05063113604488079, + "grad_norm": 2.9041844072351926, + "learning_rate": 9.988856975725551e-06, + "loss": 0.4073, + "step": 361 + }, + { + "epoch": 0.050771388499298735, + "grad_norm": 2.8008760784700564, + "learning_rate": 9.988704911587106e-06, + "loss": 0.4451, + "step": 362 + }, + { + "epoch": 0.05091164095371669, + "grad_norm": 2.1058253055364644, + "learning_rate": 9.988551818064257e-06, + "loss": 0.4193, + "step": 363 + }, + { + "epoch": 0.051051893408134645, + "grad_norm": 4.016680983099743, + "learning_rate": 9.988397695188592e-06, + "loss": 0.443, + "step": 364 + }, + { + "epoch": 0.05119214586255259, + "grad_norm": 2.240242037639467, + "learning_rate": 9.988242542991914e-06, + "loss": 0.4573, + "step": 365 + }, + { + "epoch": 0.05133239831697055, + "grad_norm": 3.900217143151508, + "learning_rate": 9.98808636150624e-06, + "loss": 0.4569, + "step": 366 + }, + { + "epoch": 0.051472650771388496, + "grad_norm": 4.681467676131186, + "learning_rate": 9.987929150763791e-06, + "loss": 0.4644, + "step": 367 + }, + { + "epoch": 0.05161290322580645, + "grad_norm": 2.497936159831502, + "learning_rate": 9.987770910797014e-06, + "loss": 0.4262, + "step": 368 + }, + { + "epoch": 0.05175315568022441, + "grad_norm": 2.5257348152798422, + "learning_rate": 9.987611641638555e-06, + "loss": 0.4294, + "step": 369 + }, + { + "epoch": 0.051893408134642355, + "grad_norm": 3.3062575918878454, + "learning_rate": 9.98745134332128e-06, + "loss": 0.4351, + "step": 370 + }, + { + "epoch": 0.05203366058906031, + "grad_norm": 2.6053801697367356, + "learning_rate": 9.987290015878266e-06, + "loss": 0.4043, + "step": 371 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 2.7186338027151398, + "learning_rate": 9.9871276593428e-06, + "loss": 0.4769, + "step": 372 + }, + { + "epoch": 0.05231416549789621, + "grad_norm": 3.154600259935923, + "learning_rate": 9.986964273748385e-06, + "loss": 0.4398, + "step": 373 + }, + { + "epoch": 0.05245441795231417, + "grad_norm": 2.75939818012874, + "learning_rate": 9.986799859128735e-06, + "loss": 0.3672, + "step": 374 + }, + { + "epoch": 0.052594670406732116, + "grad_norm": 3.224108337626685, + "learning_rate": 9.986634415517774e-06, + "loss": 0.4438, + "step": 375 + }, + { + "epoch": 0.05273492286115007, + "grad_norm": 2.780420586859395, + "learning_rate": 9.98646794294964e-06, + "loss": 0.4126, + "step": 376 + }, + { + "epoch": 0.05287517531556802, + "grad_norm": 2.452331997933473, + "learning_rate": 9.986300441458683e-06, + "loss": 0.4359, + "step": 377 + }, + { + "epoch": 0.053015427769985975, + "grad_norm": 3.4553352290486927, + "learning_rate": 9.986131911079469e-06, + "loss": 0.4799, + "step": 378 + }, + { + "epoch": 0.05315568022440393, + "grad_norm": 2.540674255569296, + "learning_rate": 9.98596235184677e-06, + "loss": 0.4535, + "step": 379 + }, + { + "epoch": 0.05329593267882188, + "grad_norm": 4.561415033802088, + "learning_rate": 9.985791763795576e-06, + "loss": 0.4403, + "step": 380 + }, + { + "epoch": 0.05343618513323983, + "grad_norm": 3.5456563729858837, + "learning_rate": 9.985620146961083e-06, + "loss": 0.4166, + "step": 381 + }, + { + "epoch": 0.05357643758765778, + "grad_norm": 3.056757623060372, + "learning_rate": 9.985447501378706e-06, + "loss": 0.4446, + "step": 382 + }, + { + "epoch": 0.053716690042075736, + "grad_norm": 2.65376111280259, + "learning_rate": 9.985273827084068e-06, + "loss": 0.4174, + "step": 383 + }, + { + "epoch": 0.05385694249649369, + "grad_norm": 2.2571335137882653, + "learning_rate": 9.985099124113006e-06, + "loss": 0.4569, + "step": 384 + }, + { + "epoch": 0.05399719495091164, + "grad_norm": 3.140464816587059, + "learning_rate": 9.984923392501567e-06, + "loss": 0.405, + "step": 385 + }, + { + "epoch": 0.054137447405329595, + "grad_norm": 2.978348163837495, + "learning_rate": 9.984746632286016e-06, + "loss": 0.4393, + "step": 386 + }, + { + "epoch": 0.05427769985974754, + "grad_norm": 2.911406833327405, + "learning_rate": 9.984568843502822e-06, + "loss": 0.4244, + "step": 387 + }, + { + "epoch": 0.0544179523141655, + "grad_norm": 3.448299574934883, + "learning_rate": 9.984390026188671e-06, + "loss": 0.4606, + "step": 388 + }, + { + "epoch": 0.05455820476858345, + "grad_norm": 3.858196866373252, + "learning_rate": 9.984210180380464e-06, + "loss": 0.4029, + "step": 389 + }, + { + "epoch": 0.0546984572230014, + "grad_norm": 2.456964197744901, + "learning_rate": 9.984029306115307e-06, + "loss": 0.4547, + "step": 390 + }, + { + "epoch": 0.054838709677419356, + "grad_norm": 2.7040302326442203, + "learning_rate": 9.983847403430525e-06, + "loss": 0.3927, + "step": 391 + }, + { + "epoch": 0.054978962131837304, + "grad_norm": 3.113182611949032, + "learning_rate": 9.98366447236365e-06, + "loss": 0.4277, + "step": 392 + }, + { + "epoch": 0.05511921458625526, + "grad_norm": 5.221665350439687, + "learning_rate": 9.983480512952432e-06, + "loss": 0.4287, + "step": 393 + }, + { + "epoch": 0.055259467040673214, + "grad_norm": 4.395863079434432, + "learning_rate": 9.983295525234827e-06, + "loss": 0.4268, + "step": 394 + }, + { + "epoch": 0.05539971949509116, + "grad_norm": 2.525984762802741, + "learning_rate": 9.983109509249004e-06, + "loss": 0.4171, + "step": 395 + }, + { + "epoch": 0.05553997194950912, + "grad_norm": 2.3485680458195644, + "learning_rate": 9.98292246503335e-06, + "loss": 0.4277, + "step": 396 + }, + { + "epoch": 0.055680224403927066, + "grad_norm": 2.6902959606908716, + "learning_rate": 9.98273439262646e-06, + "loss": 0.4234, + "step": 397 + }, + { + "epoch": 0.05582047685834502, + "grad_norm": 3.229392521660995, + "learning_rate": 9.982545292067138e-06, + "loss": 0.3998, + "step": 398 + }, + { + "epoch": 0.055960729312762976, + "grad_norm": 3.3481118816882818, + "learning_rate": 9.982355163394407e-06, + "loss": 0.4255, + "step": 399 + }, + { + "epoch": 0.056100981767180924, + "grad_norm": 4.178073122077656, + "learning_rate": 9.982164006647497e-06, + "loss": 0.4778, + "step": 400 + }, + { + "epoch": 0.05624123422159888, + "grad_norm": 2.7662857042327764, + "learning_rate": 9.981971821865853e-06, + "loss": 0.376, + "step": 401 + }, + { + "epoch": 0.05638148667601683, + "grad_norm": 3.1243894362035056, + "learning_rate": 9.98177860908913e-06, + "loss": 0.4587, + "step": 402 + }, + { + "epoch": 0.05652173913043478, + "grad_norm": 2.8668578579450243, + "learning_rate": 9.981584368357198e-06, + "loss": 0.4969, + "step": 403 + }, + { + "epoch": 0.05666199158485274, + "grad_norm": 2.9526239202142204, + "learning_rate": 9.981389099710132e-06, + "loss": 0.4319, + "step": 404 + }, + { + "epoch": 0.056802244039270686, + "grad_norm": 3.4498605617042895, + "learning_rate": 9.981192803188232e-06, + "loss": 0.4591, + "step": 405 + }, + { + "epoch": 0.05694249649368864, + "grad_norm": 2.4476655114294243, + "learning_rate": 9.980995478831997e-06, + "loss": 0.4806, + "step": 406 + }, + { + "epoch": 0.05708274894810659, + "grad_norm": 2.60632731005128, + "learning_rate": 9.980797126682145e-06, + "loss": 0.4234, + "step": 407 + }, + { + "epoch": 0.057223001402524544, + "grad_norm": 2.547888057004348, + "learning_rate": 9.980597746779604e-06, + "loss": 0.479, + "step": 408 + }, + { + "epoch": 0.0573632538569425, + "grad_norm": 2.9135041302765767, + "learning_rate": 9.980397339165514e-06, + "loss": 0.4603, + "step": 409 + }, + { + "epoch": 0.05750350631136045, + "grad_norm": 2.4318942630958817, + "learning_rate": 9.980195903881231e-06, + "loss": 0.3862, + "step": 410 + }, + { + "epoch": 0.0576437587657784, + "grad_norm": 4.859069225834828, + "learning_rate": 9.979993440968317e-06, + "loss": 0.4348, + "step": 411 + }, + { + "epoch": 0.05778401122019635, + "grad_norm": 3.1546327583358766, + "learning_rate": 9.979789950468549e-06, + "loss": 0.3903, + "step": 412 + }, + { + "epoch": 0.057924263674614306, + "grad_norm": 2.3941324471315197, + "learning_rate": 9.979585432423917e-06, + "loss": 0.4625, + "step": 413 + }, + { + "epoch": 0.05806451612903226, + "grad_norm": 2.1689398163060707, + "learning_rate": 9.97937988687662e-06, + "loss": 0.3648, + "step": 414 + }, + { + "epoch": 0.05820476858345021, + "grad_norm": 2.4445118918891358, + "learning_rate": 9.979173313869072e-06, + "loss": 0.4376, + "step": 415 + }, + { + "epoch": 0.058345021037868164, + "grad_norm": 2.431349207562791, + "learning_rate": 9.9789657134439e-06, + "loss": 0.4026, + "step": 416 + }, + { + "epoch": 0.05848527349228611, + "grad_norm": 2.2644796699256524, + "learning_rate": 9.978757085643937e-06, + "loss": 0.4361, + "step": 417 + }, + { + "epoch": 0.05862552594670407, + "grad_norm": 2.9941974791263095, + "learning_rate": 9.978547430512235e-06, + "loss": 0.4146, + "step": 418 + }, + { + "epoch": 0.05876577840112202, + "grad_norm": 3.2043005298155127, + "learning_rate": 9.978336748092053e-06, + "loss": 0.4846, + "step": 419 + }, + { + "epoch": 0.05890603085553997, + "grad_norm": 2.892740001695145, + "learning_rate": 9.978125038426865e-06, + "loss": 0.4259, + "step": 420 + }, + { + "epoch": 0.059046283309957925, + "grad_norm": 2.875252754047582, + "learning_rate": 9.977912301560354e-06, + "loss": 0.4322, + "step": 421 + }, + { + "epoch": 0.059186535764375874, + "grad_norm": 3.3902067286830766, + "learning_rate": 9.97769853753642e-06, + "loss": 0.4289, + "step": 422 + }, + { + "epoch": 0.05932678821879383, + "grad_norm": 2.283201558656702, + "learning_rate": 9.977483746399168e-06, + "loss": 0.4062, + "step": 423 + }, + { + "epoch": 0.059467040673211784, + "grad_norm": 2.9137736732179103, + "learning_rate": 9.97726792819292e-06, + "loss": 0.4466, + "step": 424 + }, + { + "epoch": 0.05960729312762973, + "grad_norm": 3.0026838695119156, + "learning_rate": 9.97705108296221e-06, + "loss": 0.4266, + "step": 425 + }, + { + "epoch": 0.05974754558204769, + "grad_norm": 2.271799342851588, + "learning_rate": 9.976833210751781e-06, + "loss": 0.4098, + "step": 426 + }, + { + "epoch": 0.059887798036465635, + "grad_norm": 2.2688546148723887, + "learning_rate": 9.97661431160659e-06, + "loss": 0.4412, + "step": 427 + }, + { + "epoch": 0.06002805049088359, + "grad_norm": 2.2506609051993753, + "learning_rate": 9.976394385571805e-06, + "loss": 0.4229, + "step": 428 + }, + { + "epoch": 0.060168302945301545, + "grad_norm": 3.4311202061186155, + "learning_rate": 9.976173432692806e-06, + "loss": 0.3871, + "step": 429 + }, + { + "epoch": 0.06030855539971949, + "grad_norm": 2.4693623036970833, + "learning_rate": 9.975951453015187e-06, + "loss": 0.4449, + "step": 430 + }, + { + "epoch": 0.06044880785413745, + "grad_norm": 2.119309941572607, + "learning_rate": 9.975728446584748e-06, + "loss": 0.4375, + "step": 431 + }, + { + "epoch": 0.0605890603085554, + "grad_norm": 2.781737467019255, + "learning_rate": 9.97550441344751e-06, + "loss": 0.4723, + "step": 432 + }, + { + "epoch": 0.06072931276297335, + "grad_norm": 3.916772205202428, + "learning_rate": 9.975279353649696e-06, + "loss": 0.3807, + "step": 433 + }, + { + "epoch": 0.06086956521739131, + "grad_norm": 2.697268807135161, + "learning_rate": 9.975053267237748e-06, + "loss": 0.4048, + "step": 434 + }, + { + "epoch": 0.061009817671809255, + "grad_norm": 2.2991946178625473, + "learning_rate": 9.974826154258318e-06, + "loss": 0.4444, + "step": 435 + }, + { + "epoch": 0.06115007012622721, + "grad_norm": 2.288303913539505, + "learning_rate": 9.974598014758267e-06, + "loss": 0.4321, + "step": 436 + }, + { + "epoch": 0.06129032258064516, + "grad_norm": 2.1306842623609916, + "learning_rate": 9.97436884878467e-06, + "loss": 0.4538, + "step": 437 + }, + { + "epoch": 0.06143057503506311, + "grad_norm": 2.8779805531325233, + "learning_rate": 9.974138656384815e-06, + "loss": 0.4544, + "step": 438 + }, + { + "epoch": 0.06157082748948107, + "grad_norm": 2.6811096154655076, + "learning_rate": 9.973907437606201e-06, + "loss": 0.4559, + "step": 439 + }, + { + "epoch": 0.061711079943899017, + "grad_norm": 2.527840755595699, + "learning_rate": 9.973675192496539e-06, + "loss": 0.3951, + "step": 440 + }, + { + "epoch": 0.06185133239831697, + "grad_norm": 2.61175465149756, + "learning_rate": 9.973441921103748e-06, + "loss": 0.4135, + "step": 441 + }, + { + "epoch": 0.06199158485273492, + "grad_norm": 3.0474108102591577, + "learning_rate": 9.973207623475964e-06, + "loss": 0.4642, + "step": 442 + }, + { + "epoch": 0.062131837307152875, + "grad_norm": 3.273171789633585, + "learning_rate": 9.972972299661533e-06, + "loss": 0.405, + "step": 443 + }, + { + "epoch": 0.06227208976157083, + "grad_norm": 2.4906402855455507, + "learning_rate": 9.972735949709012e-06, + "loss": 0.4952, + "step": 444 + }, + { + "epoch": 0.06241234221598878, + "grad_norm": 2.752149781997046, + "learning_rate": 9.97249857366717e-06, + "loss": 0.3933, + "step": 445 + }, + { + "epoch": 0.06255259467040673, + "grad_norm": 5.226205582684293, + "learning_rate": 9.972260171584987e-06, + "loss": 0.4855, + "step": 446 + }, + { + "epoch": 0.06269284712482469, + "grad_norm": 2.138832399797569, + "learning_rate": 9.972020743511657e-06, + "loss": 0.4183, + "step": 447 + }, + { + "epoch": 0.06283309957924264, + "grad_norm": 2.7575107657947813, + "learning_rate": 9.971780289496585e-06, + "loss": 0.4044, + "step": 448 + }, + { + "epoch": 0.06297335203366058, + "grad_norm": 2.551724830411371, + "learning_rate": 9.971538809589385e-06, + "loss": 0.4481, + "step": 449 + }, + { + "epoch": 0.06311360448807854, + "grad_norm": 2.798675383867582, + "learning_rate": 9.971296303839884e-06, + "loss": 0.4512, + "step": 450 + }, + { + "epoch": 0.0632538569424965, + "grad_norm": 3.428840550010312, + "learning_rate": 9.971052772298125e-06, + "loss": 0.4238, + "step": 451 + }, + { + "epoch": 0.06339410939691445, + "grad_norm": 3.709139418412272, + "learning_rate": 9.970808215014357e-06, + "loss": 0.3889, + "step": 452 + }, + { + "epoch": 0.0635343618513324, + "grad_norm": 3.030208133309783, + "learning_rate": 9.970562632039043e-06, + "loss": 0.3957, + "step": 453 + }, + { + "epoch": 0.06367461430575035, + "grad_norm": 6.885326883150995, + "learning_rate": 9.970316023422855e-06, + "loss": 0.4285, + "step": 454 + }, + { + "epoch": 0.0638148667601683, + "grad_norm": 2.5276179688162843, + "learning_rate": 9.970068389216681e-06, + "loss": 0.4407, + "step": 455 + }, + { + "epoch": 0.06395511921458626, + "grad_norm": 3.072987718353407, + "learning_rate": 9.969819729471621e-06, + "loss": 0.4541, + "step": 456 + }, + { + "epoch": 0.06409537166900421, + "grad_norm": 3.3686920840463737, + "learning_rate": 9.96957004423898e-06, + "loss": 0.3829, + "step": 457 + }, + { + "epoch": 0.06423562412342217, + "grad_norm": 4.102687485227266, + "learning_rate": 9.96931933357028e-06, + "loss": 0.3965, + "step": 458 + }, + { + "epoch": 0.06437587657784011, + "grad_norm": 4.748896822064187, + "learning_rate": 9.969067597517255e-06, + "loss": 0.3776, + "step": 459 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 7.97001488314475, + "learning_rate": 9.968814836131849e-06, + "loss": 0.4132, + "step": 460 + }, + { + "epoch": 0.06465638148667602, + "grad_norm": 2.586267456025277, + "learning_rate": 9.968561049466214e-06, + "loss": 0.3967, + "step": 461 + }, + { + "epoch": 0.06479663394109397, + "grad_norm": 3.3607247413390375, + "learning_rate": 9.96830623757272e-06, + "loss": 0.4624, + "step": 462 + }, + { + "epoch": 0.06493688639551193, + "grad_norm": 2.81145076253653, + "learning_rate": 9.968050400503946e-06, + "loss": 0.4169, + "step": 463 + }, + { + "epoch": 0.06507713884992987, + "grad_norm": 2.3608408031441948, + "learning_rate": 9.967793538312683e-06, + "loss": 0.3983, + "step": 464 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 3.6264460080234047, + "learning_rate": 9.967535651051927e-06, + "loss": 0.4062, + "step": 465 + }, + { + "epoch": 0.06535764375876578, + "grad_norm": 3.058153922269525, + "learning_rate": 9.967276738774897e-06, + "loss": 0.4581, + "step": 466 + }, + { + "epoch": 0.06549789621318373, + "grad_norm": 2.6000922920279175, + "learning_rate": 9.967016801535018e-06, + "loss": 0.4484, + "step": 467 + }, + { + "epoch": 0.06563814866760169, + "grad_norm": 4.530200073674399, + "learning_rate": 9.966755839385925e-06, + "loss": 0.4339, + "step": 468 + }, + { + "epoch": 0.06577840112201963, + "grad_norm": 3.1136100719837874, + "learning_rate": 9.966493852381463e-06, + "loss": 0.4572, + "step": 469 + }, + { + "epoch": 0.06591865357643759, + "grad_norm": 2.905310037847187, + "learning_rate": 9.966230840575693e-06, + "loss": 0.424, + "step": 470 + }, + { + "epoch": 0.06605890603085554, + "grad_norm": 2.9207955880839394, + "learning_rate": 9.965966804022887e-06, + "loss": 0.4279, + "step": 471 + }, + { + "epoch": 0.0661991584852735, + "grad_norm": 2.4950931077605185, + "learning_rate": 9.965701742777524e-06, + "loss": 0.3819, + "step": 472 + }, + { + "epoch": 0.06633941093969145, + "grad_norm": 2.968480629997674, + "learning_rate": 9.9654356568943e-06, + "loss": 0.4006, + "step": 473 + }, + { + "epoch": 0.06647966339410939, + "grad_norm": 2.6876135513802017, + "learning_rate": 9.965168546428122e-06, + "loss": 0.4062, + "step": 474 + }, + { + "epoch": 0.06661991584852735, + "grad_norm": 2.6265824407379954, + "learning_rate": 9.964900411434103e-06, + "loss": 0.4296, + "step": 475 + }, + { + "epoch": 0.0667601683029453, + "grad_norm": 3.589584191769265, + "learning_rate": 9.96463125196757e-06, + "loss": 0.4646, + "step": 476 + }, + { + "epoch": 0.06690042075736326, + "grad_norm": 3.0776954002541586, + "learning_rate": 9.964361068084063e-06, + "loss": 0.3919, + "step": 477 + }, + { + "epoch": 0.06704067321178121, + "grad_norm": 2.55216692314013, + "learning_rate": 9.964089859839335e-06, + "loss": 0.4514, + "step": 478 + }, + { + "epoch": 0.06718092566619915, + "grad_norm": 3.138857215864084, + "learning_rate": 9.963817627289347e-06, + "loss": 0.4173, + "step": 479 + }, + { + "epoch": 0.06732117812061711, + "grad_norm": 3.824038196388918, + "learning_rate": 9.96354437049027e-06, + "loss": 0.4645, + "step": 480 + }, + { + "epoch": 0.06746143057503506, + "grad_norm": 3.000051021142068, + "learning_rate": 9.963270089498492e-06, + "loss": 0.4441, + "step": 481 + }, + { + "epoch": 0.06760168302945302, + "grad_norm": 2.9219727525321875, + "learning_rate": 9.962994784370605e-06, + "loss": 0.4251, + "step": 482 + }, + { + "epoch": 0.06774193548387097, + "grad_norm": 2.1701257920611745, + "learning_rate": 9.96271845516342e-06, + "loss": 0.4549, + "step": 483 + }, + { + "epoch": 0.06788218793828892, + "grad_norm": 2.6204877855044986, + "learning_rate": 9.962441101933956e-06, + "loss": 0.4111, + "step": 484 + }, + { + "epoch": 0.06802244039270687, + "grad_norm": 2.9931973578388766, + "learning_rate": 9.962162724739437e-06, + "loss": 0.4482, + "step": 485 + }, + { + "epoch": 0.06816269284712483, + "grad_norm": 2.1543447950883508, + "learning_rate": 9.961883323637312e-06, + "loss": 0.397, + "step": 486 + }, + { + "epoch": 0.06830294530154278, + "grad_norm": 3.734622452330382, + "learning_rate": 9.961602898685225e-06, + "loss": 0.3931, + "step": 487 + }, + { + "epoch": 0.06844319775596074, + "grad_norm": 2.50450748360693, + "learning_rate": 9.961321449941049e-06, + "loss": 0.464, + "step": 488 + }, + { + "epoch": 0.06858345021037868, + "grad_norm": 2.241605463811463, + "learning_rate": 9.961038977462852e-06, + "loss": 0.4422, + "step": 489 + }, + { + "epoch": 0.06872370266479663, + "grad_norm": 2.500196735270521, + "learning_rate": 9.960755481308923e-06, + "loss": 0.4486, + "step": 490 + }, + { + "epoch": 0.06886395511921459, + "grad_norm": 2.105636335225222, + "learning_rate": 9.960470961537758e-06, + "loss": 0.4134, + "step": 491 + }, + { + "epoch": 0.06900420757363254, + "grad_norm": 2.1332761379363143, + "learning_rate": 9.960185418208068e-06, + "loss": 0.4311, + "step": 492 + }, + { + "epoch": 0.0691444600280505, + "grad_norm": 2.252543495257875, + "learning_rate": 9.95989885137877e-06, + "loss": 0.3975, + "step": 493 + }, + { + "epoch": 0.06928471248246844, + "grad_norm": 1.877928164000942, + "learning_rate": 9.959611261108999e-06, + "loss": 0.3964, + "step": 494 + }, + { + "epoch": 0.0694249649368864, + "grad_norm": 2.127128881053362, + "learning_rate": 9.959322647458093e-06, + "loss": 0.4041, + "step": 495 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 2.386453317220393, + "learning_rate": 9.959033010485608e-06, + "loss": 0.396, + "step": 496 + }, + { + "epoch": 0.0697054698457223, + "grad_norm": 2.2658162003440294, + "learning_rate": 9.958742350251307e-06, + "loss": 0.4343, + "step": 497 + }, + { + "epoch": 0.06984572230014026, + "grad_norm": 2.793246168750746, + "learning_rate": 9.958450666815168e-06, + "loss": 0.4587, + "step": 498 + }, + { + "epoch": 0.0699859747545582, + "grad_norm": 2.8381594525976057, + "learning_rate": 9.958157960237376e-06, + "loss": 0.4395, + "step": 499 + }, + { + "epoch": 0.07012622720897616, + "grad_norm": 2.3910692587553366, + "learning_rate": 9.95786423057833e-06, + "loss": 0.4423, + "step": 500 + }, + { + "epoch": 0.07026647966339411, + "grad_norm": 2.2315230942406568, + "learning_rate": 9.957569477898636e-06, + "loss": 0.4346, + "step": 501 + }, + { + "epoch": 0.07040673211781207, + "grad_norm": 2.716305828898804, + "learning_rate": 9.957273702259118e-06, + "loss": 0.404, + "step": 502 + }, + { + "epoch": 0.07054698457223002, + "grad_norm": 2.932740759238852, + "learning_rate": 9.956976903720806e-06, + "loss": 0.4647, + "step": 503 + }, + { + "epoch": 0.07068723702664796, + "grad_norm": 2.343874508411583, + "learning_rate": 9.956679082344941e-06, + "loss": 0.414, + "step": 504 + }, + { + "epoch": 0.07082748948106592, + "grad_norm": 2.279088080223534, + "learning_rate": 9.956380238192978e-06, + "loss": 0.3991, + "step": 505 + }, + { + "epoch": 0.07096774193548387, + "grad_norm": 1.969531070840866, + "learning_rate": 9.956080371326582e-06, + "loss": 0.4259, + "step": 506 + }, + { + "epoch": 0.07110799438990183, + "grad_norm": 3.2446055024668534, + "learning_rate": 9.955779481807626e-06, + "loss": 0.3774, + "step": 507 + }, + { + "epoch": 0.07124824684431978, + "grad_norm": 2.5281786711362937, + "learning_rate": 9.955477569698197e-06, + "loss": 0.4195, + "step": 508 + }, + { + "epoch": 0.07138849929873772, + "grad_norm": 2.253901172546274, + "learning_rate": 9.955174635060595e-06, + "loss": 0.4324, + "step": 509 + }, + { + "epoch": 0.07152875175315568, + "grad_norm": 2.2731718452163974, + "learning_rate": 9.954870677957327e-06, + "loss": 0.4136, + "step": 510 + }, + { + "epoch": 0.07166900420757363, + "grad_norm": 2.678187715203363, + "learning_rate": 9.95456569845111e-06, + "loss": 0.4948, + "step": 511 + }, + { + "epoch": 0.07180925666199159, + "grad_norm": 1.9039431512424003, + "learning_rate": 9.954259696604878e-06, + "loss": 0.4102, + "step": 512 + }, + { + "epoch": 0.07194950911640954, + "grad_norm": 3.16024633026936, + "learning_rate": 9.95395267248177e-06, + "loss": 0.4286, + "step": 513 + }, + { + "epoch": 0.07208976157082748, + "grad_norm": 2.9262212567585966, + "learning_rate": 9.95364462614514e-06, + "loss": 0.3897, + "step": 514 + }, + { + "epoch": 0.07223001402524544, + "grad_norm": 2.308117812615739, + "learning_rate": 9.95333555765855e-06, + "loss": 0.426, + "step": 515 + }, + { + "epoch": 0.0723702664796634, + "grad_norm": 2.3842910931823758, + "learning_rate": 9.953025467085774e-06, + "loss": 0.4173, + "step": 516 + }, + { + "epoch": 0.07251051893408135, + "grad_norm": 4.239549016540257, + "learning_rate": 9.952714354490799e-06, + "loss": 0.4429, + "step": 517 + }, + { + "epoch": 0.0726507713884993, + "grad_norm": 2.6658770961891856, + "learning_rate": 9.952402219937817e-06, + "loss": 0.4402, + "step": 518 + }, + { + "epoch": 0.07279102384291725, + "grad_norm": 3.6264705963520583, + "learning_rate": 9.952089063491237e-06, + "loss": 0.4296, + "step": 519 + }, + { + "epoch": 0.0729312762973352, + "grad_norm": 5.67574596459696, + "learning_rate": 9.951774885215679e-06, + "loss": 0.3891, + "step": 520 + }, + { + "epoch": 0.07307152875175316, + "grad_norm": 3.5136992881261095, + "learning_rate": 9.951459685175968e-06, + "loss": 0.4451, + "step": 521 + }, + { + "epoch": 0.07321178120617111, + "grad_norm": 6.318712859605858, + "learning_rate": 9.951143463437145e-06, + "loss": 0.4053, + "step": 522 + }, + { + "epoch": 0.07335203366058907, + "grad_norm": 2.8805837725691252, + "learning_rate": 9.95082622006446e-06, + "loss": 0.4392, + "step": 523 + }, + { + "epoch": 0.07349228611500701, + "grad_norm": 3.026507413458335, + "learning_rate": 9.950507955123372e-06, + "loss": 0.4052, + "step": 524 + }, + { + "epoch": 0.07363253856942496, + "grad_norm": 3.9677406289023214, + "learning_rate": 9.950188668679558e-06, + "loss": 0.4294, + "step": 525 + }, + { + "epoch": 0.07377279102384292, + "grad_norm": 2.6406734845828543, + "learning_rate": 9.949868360798893e-06, + "loss": 0.4434, + "step": 526 + }, + { + "epoch": 0.07391304347826087, + "grad_norm": 7.1962068314654815, + "learning_rate": 9.949547031547475e-06, + "loss": 0.444, + "step": 527 + }, + { + "epoch": 0.07405329593267883, + "grad_norm": 2.363341779958304, + "learning_rate": 9.94922468099161e-06, + "loss": 0.4043, + "step": 528 + }, + { + "epoch": 0.07419354838709677, + "grad_norm": 2.604781716512908, + "learning_rate": 9.948901309197807e-06, + "loss": 0.4414, + "step": 529 + }, + { + "epoch": 0.07433380084151472, + "grad_norm": 2.395452872764815, + "learning_rate": 9.948576916232796e-06, + "loss": 0.421, + "step": 530 + }, + { + "epoch": 0.07447405329593268, + "grad_norm": 1.869005794305938, + "learning_rate": 9.948251502163512e-06, + "loss": 0.4112, + "step": 531 + }, + { + "epoch": 0.07461430575035063, + "grad_norm": 2.058564795737436, + "learning_rate": 9.947925067057102e-06, + "loss": 0.4169, + "step": 532 + }, + { + "epoch": 0.07475455820476859, + "grad_norm": 2.178728884114789, + "learning_rate": 9.94759761098092e-06, + "loss": 0.3915, + "step": 533 + }, + { + "epoch": 0.07489481065918653, + "grad_norm": 2.4635334189493996, + "learning_rate": 9.947269134002542e-06, + "loss": 0.4013, + "step": 534 + }, + { + "epoch": 0.07503506311360449, + "grad_norm": 2.319099494526946, + "learning_rate": 9.946939636189741e-06, + "loss": 0.4422, + "step": 535 + }, + { + "epoch": 0.07517531556802244, + "grad_norm": 2.4325207446508528, + "learning_rate": 9.946609117610508e-06, + "loss": 0.4542, + "step": 536 + }, + { + "epoch": 0.0753155680224404, + "grad_norm": 2.3704946851527944, + "learning_rate": 9.946277578333045e-06, + "loss": 0.4286, + "step": 537 + }, + { + "epoch": 0.07545582047685835, + "grad_norm": 4.493791111531237, + "learning_rate": 9.945945018425759e-06, + "loss": 0.4195, + "step": 538 + }, + { + "epoch": 0.07559607293127629, + "grad_norm": 3.4720842986794302, + "learning_rate": 9.945611437957274e-06, + "loss": 0.4568, + "step": 539 + }, + { + "epoch": 0.07573632538569425, + "grad_norm": 3.338403438061192, + "learning_rate": 9.945276836996422e-06, + "loss": 0.4358, + "step": 540 + }, + { + "epoch": 0.0758765778401122, + "grad_norm": 2.053028205436276, + "learning_rate": 9.944941215612244e-06, + "loss": 0.4184, + "step": 541 + }, + { + "epoch": 0.07601683029453016, + "grad_norm": 2.228621858944948, + "learning_rate": 9.944604573873996e-06, + "loss": 0.4477, + "step": 542 + }, + { + "epoch": 0.07615708274894811, + "grad_norm": 2.261248762089278, + "learning_rate": 9.94426691185114e-06, + "loss": 0.4258, + "step": 543 + }, + { + "epoch": 0.07629733520336605, + "grad_norm": 2.819725672993946, + "learning_rate": 9.943928229613349e-06, + "loss": 0.4036, + "step": 544 + }, + { + "epoch": 0.07643758765778401, + "grad_norm": 7.578812337949622, + "learning_rate": 9.943588527230508e-06, + "loss": 0.432, + "step": 545 + }, + { + "epoch": 0.07657784011220196, + "grad_norm": 2.3253149701138542, + "learning_rate": 9.943247804772714e-06, + "loss": 0.3997, + "step": 546 + }, + { + "epoch": 0.07671809256661992, + "grad_norm": 2.6782872403476596, + "learning_rate": 9.942906062310272e-06, + "loss": 0.4297, + "step": 547 + }, + { + "epoch": 0.07685834502103787, + "grad_norm": 2.7059734529150243, + "learning_rate": 9.942563299913698e-06, + "loss": 0.4296, + "step": 548 + }, + { + "epoch": 0.07699859747545582, + "grad_norm": 3.064931410353403, + "learning_rate": 9.942219517653718e-06, + "loss": 0.4798, + "step": 549 + }, + { + "epoch": 0.07713884992987377, + "grad_norm": 2.9262448848792095, + "learning_rate": 9.94187471560127e-06, + "loss": 0.4008, + "step": 550 + }, + { + "epoch": 0.07727910238429173, + "grad_norm": 2.4556075248378026, + "learning_rate": 9.9415288938275e-06, + "loss": 0.4927, + "step": 551 + }, + { + "epoch": 0.07741935483870968, + "grad_norm": 2.536805919562837, + "learning_rate": 9.941182052403768e-06, + "loss": 0.4321, + "step": 552 + }, + { + "epoch": 0.07755960729312764, + "grad_norm": 2.5497361476748943, + "learning_rate": 9.940834191401642e-06, + "loss": 0.407, + "step": 553 + }, + { + "epoch": 0.07769985974754558, + "grad_norm": 2.6513769890760357, + "learning_rate": 9.940485310892901e-06, + "loss": 0.3805, + "step": 554 + }, + { + "epoch": 0.07784011220196353, + "grad_norm": 2.224983163834213, + "learning_rate": 9.94013541094953e-06, + "loss": 0.4337, + "step": 555 + }, + { + "epoch": 0.07798036465638149, + "grad_norm": 2.758949196818198, + "learning_rate": 9.939784491643734e-06, + "loss": 0.3878, + "step": 556 + }, + { + "epoch": 0.07812061711079944, + "grad_norm": 4.113823517125366, + "learning_rate": 9.939432553047919e-06, + "loss": 0.4511, + "step": 557 + }, + { + "epoch": 0.0782608695652174, + "grad_norm": 2.6464227535492, + "learning_rate": 9.939079595234706e-06, + "loss": 0.4624, + "step": 558 + }, + { + "epoch": 0.07840112201963534, + "grad_norm": 2.688141502288919, + "learning_rate": 9.938725618276926e-06, + "loss": 0.4662, + "step": 559 + }, + { + "epoch": 0.0785413744740533, + "grad_norm": 2.8573708102708153, + "learning_rate": 9.938370622247619e-06, + "loss": 0.4127, + "step": 560 + }, + { + "epoch": 0.07868162692847125, + "grad_norm": 2.8526057595014116, + "learning_rate": 9.938014607220036e-06, + "loss": 0.3666, + "step": 561 + }, + { + "epoch": 0.0788218793828892, + "grad_norm": 3.9592941456572706, + "learning_rate": 9.93765757326764e-06, + "loss": 0.3906, + "step": 562 + }, + { + "epoch": 0.07896213183730716, + "grad_norm": 2.932131955465213, + "learning_rate": 9.9372995204641e-06, + "loss": 0.471, + "step": 563 + }, + { + "epoch": 0.0791023842917251, + "grad_norm": 2.905563273357825, + "learning_rate": 9.936940448883299e-06, + "loss": 0.4304, + "step": 564 + }, + { + "epoch": 0.07924263674614306, + "grad_norm": 2.5895361624176068, + "learning_rate": 9.936580358599327e-06, + "loss": 0.4006, + "step": 565 + }, + { + "epoch": 0.07938288920056101, + "grad_norm": 3.4988994230395343, + "learning_rate": 9.93621924968649e-06, + "loss": 0.4132, + "step": 566 + }, + { + "epoch": 0.07952314165497897, + "grad_norm": 2.964015035696671, + "learning_rate": 9.935857122219297e-06, + "loss": 0.4112, + "step": 567 + }, + { + "epoch": 0.07966339410939692, + "grad_norm": 5.810852145342308, + "learning_rate": 9.935493976272473e-06, + "loss": 0.4046, + "step": 568 + }, + { + "epoch": 0.07980364656381486, + "grad_norm": 2.410573963340091, + "learning_rate": 9.935129811920947e-06, + "loss": 0.4047, + "step": 569 + }, + { + "epoch": 0.07994389901823282, + "grad_norm": 2.6910832492251617, + "learning_rate": 9.934764629239863e-06, + "loss": 0.4178, + "step": 570 + }, + { + "epoch": 0.08008415147265077, + "grad_norm": 3.1171618008809063, + "learning_rate": 9.934398428304577e-06, + "loss": 0.4337, + "step": 571 + }, + { + "epoch": 0.08022440392706873, + "grad_norm": 2.7934641578768775, + "learning_rate": 9.93403120919065e-06, + "loss": 0.4095, + "step": 572 + }, + { + "epoch": 0.08036465638148668, + "grad_norm": 2.688027529728785, + "learning_rate": 9.933662971973851e-06, + "loss": 0.3839, + "step": 573 + }, + { + "epoch": 0.08050490883590462, + "grad_norm": 2.9424136033901704, + "learning_rate": 9.933293716730172e-06, + "loss": 0.4243, + "step": 574 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 2.3308109432486637, + "learning_rate": 9.932923443535798e-06, + "loss": 0.4662, + "step": 575 + }, + { + "epoch": 0.08078541374474053, + "grad_norm": 2.749524248758854, + "learning_rate": 9.932552152467137e-06, + "loss": 0.3932, + "step": 576 + }, + { + "epoch": 0.08092566619915849, + "grad_norm": 2.444681982097326, + "learning_rate": 9.9321798436008e-06, + "loss": 0.4114, + "step": 577 + }, + { + "epoch": 0.08106591865357644, + "grad_norm": 3.46460717048328, + "learning_rate": 9.931806517013612e-06, + "loss": 0.411, + "step": 578 + }, + { + "epoch": 0.08120617110799438, + "grad_norm": 2.9782210407602308, + "learning_rate": 9.931432172782606e-06, + "loss": 0.4557, + "step": 579 + }, + { + "epoch": 0.08134642356241234, + "grad_norm": 2.627155145868136, + "learning_rate": 9.931056810985024e-06, + "loss": 0.3953, + "step": 580 + }, + { + "epoch": 0.0814866760168303, + "grad_norm": 6.5731086360346485, + "learning_rate": 9.93068043169832e-06, + "loss": 0.4036, + "step": 581 + }, + { + "epoch": 0.08162692847124825, + "grad_norm": 2.563433593389875, + "learning_rate": 9.930303035000159e-06, + "loss": 0.4301, + "step": 582 + }, + { + "epoch": 0.0817671809256662, + "grad_norm": 2.813436394682458, + "learning_rate": 9.929924620968409e-06, + "loss": 0.4587, + "step": 583 + }, + { + "epoch": 0.08190743338008415, + "grad_norm": 2.2688752110926553, + "learning_rate": 9.92954518968116e-06, + "loss": 0.4647, + "step": 584 + }, + { + "epoch": 0.0820476858345021, + "grad_norm": 2.4365217740162155, + "learning_rate": 9.929164741216702e-06, + "loss": 0.4226, + "step": 585 + }, + { + "epoch": 0.08218793828892006, + "grad_norm": 2.3070974390215184, + "learning_rate": 9.928783275653534e-06, + "loss": 0.3934, + "step": 586 + }, + { + "epoch": 0.08232819074333801, + "grad_norm": 1.9404043148937231, + "learning_rate": 9.928400793070375e-06, + "loss": 0.4123, + "step": 587 + }, + { + "epoch": 0.08246844319775597, + "grad_norm": 2.6048628118198662, + "learning_rate": 9.928017293546144e-06, + "loss": 0.4545, + "step": 588 + }, + { + "epoch": 0.08260869565217391, + "grad_norm": 2.237539864296864, + "learning_rate": 9.927632777159975e-06, + "loss": 0.4053, + "step": 589 + }, + { + "epoch": 0.08274894810659186, + "grad_norm": 2.106464440723593, + "learning_rate": 9.927247243991209e-06, + "loss": 0.4086, + "step": 590 + }, + { + "epoch": 0.08288920056100982, + "grad_norm": 4.108699844653068, + "learning_rate": 9.9268606941194e-06, + "loss": 0.4469, + "step": 591 + }, + { + "epoch": 0.08302945301542777, + "grad_norm": 2.2622056084572204, + "learning_rate": 9.926473127624306e-06, + "loss": 0.432, + "step": 592 + }, + { + "epoch": 0.08316970546984573, + "grad_norm": 3.1122191949742097, + "learning_rate": 9.926084544585904e-06, + "loss": 0.4296, + "step": 593 + }, + { + "epoch": 0.08330995792426367, + "grad_norm": 3.4069466534674056, + "learning_rate": 9.925694945084369e-06, + "loss": 0.4404, + "step": 594 + }, + { + "epoch": 0.08345021037868162, + "grad_norm": 2.3797838318050246, + "learning_rate": 9.925304329200098e-06, + "loss": 0.4615, + "step": 595 + }, + { + "epoch": 0.08359046283309958, + "grad_norm": 2.6397311712432208, + "learning_rate": 9.92491269701369e-06, + "loss": 0.4209, + "step": 596 + }, + { + "epoch": 0.08373071528751753, + "grad_norm": 2.2164889352816157, + "learning_rate": 9.924520048605955e-06, + "loss": 0.4023, + "step": 597 + }, + { + "epoch": 0.08387096774193549, + "grad_norm": 2.4390800441230716, + "learning_rate": 9.924126384057913e-06, + "loss": 0.4341, + "step": 598 + }, + { + "epoch": 0.08401122019635343, + "grad_norm": 2.0206406753833477, + "learning_rate": 9.923731703450794e-06, + "loss": 0.4465, + "step": 599 + }, + { + "epoch": 0.08415147265077139, + "grad_norm": 2.3044698321469497, + "learning_rate": 9.923336006866038e-06, + "loss": 0.3857, + "step": 600 + }, + { + "epoch": 0.08429172510518934, + "grad_norm": 2.4004909410613644, + "learning_rate": 9.922939294385294e-06, + "loss": 0.4197, + "step": 601 + }, + { + "epoch": 0.0844319775596073, + "grad_norm": 2.401645962239625, + "learning_rate": 9.922541566090422e-06, + "loss": 0.4212, + "step": 602 + }, + { + "epoch": 0.08457223001402525, + "grad_norm": 2.099050388889036, + "learning_rate": 9.922142822063488e-06, + "loss": 0.3966, + "step": 603 + }, + { + "epoch": 0.08471248246844319, + "grad_norm": 3.3282125398826805, + "learning_rate": 9.921743062386773e-06, + "loss": 0.3842, + "step": 604 + }, + { + "epoch": 0.08485273492286115, + "grad_norm": 2.508136668896997, + "learning_rate": 9.92134228714276e-06, + "loss": 0.4454, + "step": 605 + }, + { + "epoch": 0.0849929873772791, + "grad_norm": 2.700962502618605, + "learning_rate": 9.920940496414153e-06, + "loss": 0.4267, + "step": 606 + }, + { + "epoch": 0.08513323983169706, + "grad_norm": 2.771552506430004, + "learning_rate": 9.920537690283853e-06, + "loss": 0.4177, + "step": 607 + }, + { + "epoch": 0.08527349228611501, + "grad_norm": 3.061680275338762, + "learning_rate": 9.92013386883498e-06, + "loss": 0.3966, + "step": 608 + }, + { + "epoch": 0.08541374474053295, + "grad_norm": 3.2256444775001243, + "learning_rate": 9.919729032150855e-06, + "loss": 0.4103, + "step": 609 + }, + { + "epoch": 0.08555399719495091, + "grad_norm": 2.309504966537246, + "learning_rate": 9.91932318031502e-06, + "loss": 0.4252, + "step": 610 + }, + { + "epoch": 0.08569424964936886, + "grad_norm": 2.297905100576898, + "learning_rate": 9.918916313411213e-06, + "loss": 0.3915, + "step": 611 + }, + { + "epoch": 0.08583450210378682, + "grad_norm": 2.9628528046881666, + "learning_rate": 9.918508431523392e-06, + "loss": 0.3876, + "step": 612 + }, + { + "epoch": 0.08597475455820477, + "grad_norm": 2.581944451193296, + "learning_rate": 9.91809953473572e-06, + "loss": 0.4353, + "step": 613 + }, + { + "epoch": 0.08611500701262272, + "grad_norm": 6.792611584917445, + "learning_rate": 9.917689623132568e-06, + "loss": 0.43, + "step": 614 + }, + { + "epoch": 0.08625525946704067, + "grad_norm": 3.082437505405656, + "learning_rate": 9.91727869679852e-06, + "loss": 0.4128, + "step": 615 + }, + { + "epoch": 0.08639551192145863, + "grad_norm": 2.9533436204833015, + "learning_rate": 9.916866755818368e-06, + "loss": 0.4259, + "step": 616 + }, + { + "epoch": 0.08653576437587658, + "grad_norm": 2.707020681121143, + "learning_rate": 9.916453800277115e-06, + "loss": 0.4051, + "step": 617 + }, + { + "epoch": 0.08667601683029454, + "grad_norm": 4.012798813956297, + "learning_rate": 9.916039830259967e-06, + "loss": 0.4268, + "step": 618 + }, + { + "epoch": 0.08681626928471248, + "grad_norm": 2.510555965204729, + "learning_rate": 9.915624845852347e-06, + "loss": 0.4317, + "step": 619 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 3.1114674337633086, + "learning_rate": 9.915208847139883e-06, + "loss": 0.4148, + "step": 620 + }, + { + "epoch": 0.08709677419354839, + "grad_norm": 2.6402545195046807, + "learning_rate": 9.914791834208415e-06, + "loss": 0.4007, + "step": 621 + }, + { + "epoch": 0.08723702664796634, + "grad_norm": 3.57474888273078, + "learning_rate": 9.91437380714399e-06, + "loss": 0.3921, + "step": 622 + }, + { + "epoch": 0.0873772791023843, + "grad_norm": 2.6138733790288042, + "learning_rate": 9.913954766032861e-06, + "loss": 0.4088, + "step": 623 + }, + { + "epoch": 0.08751753155680224, + "grad_norm": 2.9242495103828965, + "learning_rate": 9.9135347109615e-06, + "loss": 0.4424, + "step": 624 + }, + { + "epoch": 0.0876577840112202, + "grad_norm": 2.130216814566768, + "learning_rate": 9.91311364201658e-06, + "loss": 0.43, + "step": 625 + }, + { + "epoch": 0.08779803646563815, + "grad_norm": 2.4060293319385533, + "learning_rate": 9.912691559284985e-06, + "loss": 0.4293, + "step": 626 + }, + { + "epoch": 0.0879382889200561, + "grad_norm": 3.259512258914764, + "learning_rate": 9.912268462853811e-06, + "loss": 0.4063, + "step": 627 + }, + { + "epoch": 0.08807854137447406, + "grad_norm": 2.4595403671410714, + "learning_rate": 9.911844352810359e-06, + "loss": 0.3681, + "step": 628 + }, + { + "epoch": 0.088218793828892, + "grad_norm": 2.8738075768940536, + "learning_rate": 9.91141922924214e-06, + "loss": 0.4042, + "step": 629 + }, + { + "epoch": 0.08835904628330996, + "grad_norm": 2.46226712592563, + "learning_rate": 9.910993092236878e-06, + "loss": 0.4459, + "step": 630 + }, + { + "epoch": 0.08849929873772791, + "grad_norm": 3.8799595012714176, + "learning_rate": 9.910565941882501e-06, + "loss": 0.4457, + "step": 631 + }, + { + "epoch": 0.08863955119214587, + "grad_norm": 2.2927665036378775, + "learning_rate": 9.910137778267153e-06, + "loss": 0.3553, + "step": 632 + }, + { + "epoch": 0.08877980364656382, + "grad_norm": 2.375620108511329, + "learning_rate": 9.909708601479178e-06, + "loss": 0.4236, + "step": 633 + }, + { + "epoch": 0.08892005610098176, + "grad_norm": 2.371846363965291, + "learning_rate": 9.909278411607134e-06, + "loss": 0.3367, + "step": 634 + }, + { + "epoch": 0.08906030855539972, + "grad_norm": 2.7951895467670806, + "learning_rate": 9.908847208739788e-06, + "loss": 0.4098, + "step": 635 + }, + { + "epoch": 0.08920056100981767, + "grad_norm": 2.639822753389235, + "learning_rate": 9.908414992966119e-06, + "loss": 0.4755, + "step": 636 + }, + { + "epoch": 0.08934081346423563, + "grad_norm": 2.785910056295134, + "learning_rate": 9.907981764375307e-06, + "loss": 0.4334, + "step": 637 + }, + { + "epoch": 0.08948106591865358, + "grad_norm": 2.852628660117186, + "learning_rate": 9.907547523056748e-06, + "loss": 0.4167, + "step": 638 + }, + { + "epoch": 0.08962131837307152, + "grad_norm": 2.7571054043827314, + "learning_rate": 9.907112269100045e-06, + "loss": 0.4233, + "step": 639 + }, + { + "epoch": 0.08976157082748948, + "grad_norm": 3.4838484977865765, + "learning_rate": 9.90667600259501e-06, + "loss": 0.4067, + "step": 640 + }, + { + "epoch": 0.08990182328190743, + "grad_norm": 2.853054043241597, + "learning_rate": 9.906238723631662e-06, + "loss": 0.4684, + "step": 641 + }, + { + "epoch": 0.09004207573632539, + "grad_norm": 2.3391280313509992, + "learning_rate": 9.905800432300232e-06, + "loss": 0.4375, + "step": 642 + }, + { + "epoch": 0.09018232819074334, + "grad_norm": 2.635695421355907, + "learning_rate": 9.905361128691156e-06, + "loss": 0.4424, + "step": 643 + }, + { + "epoch": 0.09032258064516129, + "grad_norm": 2.9267385874827374, + "learning_rate": 9.904920812895082e-06, + "loss": 0.3907, + "step": 644 + }, + { + "epoch": 0.09046283309957924, + "grad_norm": 3.1888003221441816, + "learning_rate": 9.904479485002869e-06, + "loss": 0.404, + "step": 645 + }, + { + "epoch": 0.0906030855539972, + "grad_norm": 2.4179096861791174, + "learning_rate": 9.904037145105577e-06, + "loss": 0.3998, + "step": 646 + }, + { + "epoch": 0.09074333800841515, + "grad_norm": 4.689159658989563, + "learning_rate": 9.903593793294484e-06, + "loss": 0.4289, + "step": 647 + }, + { + "epoch": 0.0908835904628331, + "grad_norm": 3.2351038014946774, + "learning_rate": 9.903149429661072e-06, + "loss": 0.4068, + "step": 648 + }, + { + "epoch": 0.09102384291725105, + "grad_norm": 2.789755342482678, + "learning_rate": 9.902704054297028e-06, + "loss": 0.4724, + "step": 649 + }, + { + "epoch": 0.091164095371669, + "grad_norm": 3.7744858195798634, + "learning_rate": 9.902257667294259e-06, + "loss": 0.4228, + "step": 650 + }, + { + "epoch": 0.09130434782608696, + "grad_norm": 2.68132800720131, + "learning_rate": 9.901810268744868e-06, + "loss": 0.3946, + "step": 651 + }, + { + "epoch": 0.09144460028050491, + "grad_norm": 3.2079914620968415, + "learning_rate": 9.901361858741177e-06, + "loss": 0.3925, + "step": 652 + }, + { + "epoch": 0.09158485273492287, + "grad_norm": 2.6380354242903548, + "learning_rate": 9.900912437375708e-06, + "loss": 0.4617, + "step": 653 + }, + { + "epoch": 0.09172510518934081, + "grad_norm": 2.6376430671542788, + "learning_rate": 9.900462004741198e-06, + "loss": 0.4913, + "step": 654 + }, + { + "epoch": 0.09186535764375876, + "grad_norm": 2.546610438173832, + "learning_rate": 9.90001056093059e-06, + "loss": 0.3981, + "step": 655 + }, + { + "epoch": 0.09200561009817672, + "grad_norm": 2.508642136669367, + "learning_rate": 9.899558106037039e-06, + "loss": 0.4517, + "step": 656 + }, + { + "epoch": 0.09214586255259467, + "grad_norm": 2.6862991666488, + "learning_rate": 9.899104640153904e-06, + "loss": 0.3706, + "step": 657 + }, + { + "epoch": 0.09228611500701263, + "grad_norm": 2.628741549679193, + "learning_rate": 9.898650163374751e-06, + "loss": 0.4695, + "step": 658 + }, + { + "epoch": 0.09242636746143057, + "grad_norm": 2.694523173314856, + "learning_rate": 9.898194675793365e-06, + "loss": 0.4437, + "step": 659 + }, + { + "epoch": 0.09256661991584852, + "grad_norm": 3.146987619061112, + "learning_rate": 9.897738177503729e-06, + "loss": 0.4315, + "step": 660 + }, + { + "epoch": 0.09270687237026648, + "grad_norm": 2.968430672587619, + "learning_rate": 9.897280668600037e-06, + "loss": 0.4503, + "step": 661 + }, + { + "epoch": 0.09284712482468443, + "grad_norm": 2.4055953683512343, + "learning_rate": 9.896822149176695e-06, + "loss": 0.4188, + "step": 662 + }, + { + "epoch": 0.09298737727910239, + "grad_norm": 2.9929472354682223, + "learning_rate": 9.896362619328314e-06, + "loss": 0.4037, + "step": 663 + }, + { + "epoch": 0.09312762973352033, + "grad_norm": 2.250263622522839, + "learning_rate": 9.895902079149715e-06, + "loss": 0.4621, + "step": 664 + }, + { + "epoch": 0.09326788218793829, + "grad_norm": 2.427144083946365, + "learning_rate": 9.895440528735927e-06, + "loss": 0.4444, + "step": 665 + }, + { + "epoch": 0.09340813464235624, + "grad_norm": 3.001947724356723, + "learning_rate": 9.894977968182189e-06, + "loss": 0.4368, + "step": 666 + }, + { + "epoch": 0.0935483870967742, + "grad_norm": 1.7161206333778094, + "learning_rate": 9.894514397583947e-06, + "loss": 0.3923, + "step": 667 + }, + { + "epoch": 0.09368863955119215, + "grad_norm": 2.050272211749686, + "learning_rate": 9.894049817036854e-06, + "loss": 0.4058, + "step": 668 + }, + { + "epoch": 0.09382889200561009, + "grad_norm": 2.6703072571856445, + "learning_rate": 9.893584226636773e-06, + "loss": 0.4187, + "step": 669 + }, + { + "epoch": 0.09396914446002805, + "grad_norm": 2.29934829103304, + "learning_rate": 9.893117626479778e-06, + "loss": 0.4224, + "step": 670 + }, + { + "epoch": 0.094109396914446, + "grad_norm": 2.117931952978256, + "learning_rate": 9.892650016662144e-06, + "loss": 0.4186, + "step": 671 + }, + { + "epoch": 0.09424964936886396, + "grad_norm": 2.928117254973665, + "learning_rate": 9.892181397280365e-06, + "loss": 0.3833, + "step": 672 + }, + { + "epoch": 0.09438990182328191, + "grad_norm": 1.9800764368696875, + "learning_rate": 9.891711768431131e-06, + "loss": 0.4262, + "step": 673 + }, + { + "epoch": 0.09453015427769985, + "grad_norm": 2.631203088772597, + "learning_rate": 9.891241130211353e-06, + "loss": 0.424, + "step": 674 + }, + { + "epoch": 0.09467040673211781, + "grad_norm": 2.573355036365654, + "learning_rate": 9.89076948271814e-06, + "loss": 0.4323, + "step": 675 + }, + { + "epoch": 0.09481065918653576, + "grad_norm": 2.5308140508800396, + "learning_rate": 9.89029682604881e-06, + "loss": 0.4255, + "step": 676 + }, + { + "epoch": 0.09495091164095372, + "grad_norm": 6.263074703077405, + "learning_rate": 9.8898231603009e-06, + "loss": 0.4706, + "step": 677 + }, + { + "epoch": 0.09509116409537167, + "grad_norm": 1.9784421163386703, + "learning_rate": 9.889348485572144e-06, + "loss": 0.3975, + "step": 678 + }, + { + "epoch": 0.09523141654978962, + "grad_norm": 1.7511227956748854, + "learning_rate": 9.888872801960486e-06, + "loss": 0.4006, + "step": 679 + }, + { + "epoch": 0.09537166900420757, + "grad_norm": 2.2266958096399727, + "learning_rate": 9.888396109564082e-06, + "loss": 0.4519, + "step": 680 + }, + { + "epoch": 0.09551192145862553, + "grad_norm": 3.908997324897981, + "learning_rate": 9.887918408481295e-06, + "loss": 0.4007, + "step": 681 + }, + { + "epoch": 0.09565217391304348, + "grad_norm": 2.439371808592663, + "learning_rate": 9.887439698810694e-06, + "loss": 0.4283, + "step": 682 + }, + { + "epoch": 0.09579242636746144, + "grad_norm": 3.543474895153141, + "learning_rate": 9.886959980651056e-06, + "loss": 0.4233, + "step": 683 + }, + { + "epoch": 0.09593267882187938, + "grad_norm": 2.3517648603493555, + "learning_rate": 9.886479254101372e-06, + "loss": 0.4512, + "step": 684 + }, + { + "epoch": 0.09607293127629733, + "grad_norm": 2.459871285180292, + "learning_rate": 9.885997519260831e-06, + "loss": 0.4385, + "step": 685 + }, + { + "epoch": 0.09621318373071529, + "grad_norm": 2.7796031919820336, + "learning_rate": 9.885514776228837e-06, + "loss": 0.425, + "step": 686 + }, + { + "epoch": 0.09635343618513324, + "grad_norm": 4.073655057481395, + "learning_rate": 9.885031025105005e-06, + "loss": 0.4417, + "step": 687 + }, + { + "epoch": 0.0964936886395512, + "grad_norm": 3.3321846572247087, + "learning_rate": 9.884546265989148e-06, + "loss": 0.392, + "step": 688 + }, + { + "epoch": 0.09663394109396914, + "grad_norm": 3.0825323318803, + "learning_rate": 9.884060498981297e-06, + "loss": 0.4687, + "step": 689 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 2.915756265152888, + "learning_rate": 9.883573724181683e-06, + "loss": 0.4377, + "step": 690 + }, + { + "epoch": 0.09691444600280505, + "grad_norm": 1.9791401576893655, + "learning_rate": 9.883085941690752e-06, + "loss": 0.4135, + "step": 691 + }, + { + "epoch": 0.097054698457223, + "grad_norm": 2.3294381144422176, + "learning_rate": 9.882597151609153e-06, + "loss": 0.463, + "step": 692 + }, + { + "epoch": 0.09719495091164096, + "grad_norm": 2.5857971706519183, + "learning_rate": 9.882107354037743e-06, + "loss": 0.4294, + "step": 693 + }, + { + "epoch": 0.0973352033660589, + "grad_norm": 3.377955237824373, + "learning_rate": 9.881616549077591e-06, + "loss": 0.4355, + "step": 694 + }, + { + "epoch": 0.09747545582047686, + "grad_norm": 2.0239998172337748, + "learning_rate": 9.881124736829968e-06, + "loss": 0.3903, + "step": 695 + }, + { + "epoch": 0.09761570827489481, + "grad_norm": 2.324796409306966, + "learning_rate": 9.880631917396358e-06, + "loss": 0.4011, + "step": 696 + }, + { + "epoch": 0.09775596072931277, + "grad_norm": 1.9300017931045346, + "learning_rate": 9.880138090878452e-06, + "loss": 0.4243, + "step": 697 + }, + { + "epoch": 0.09789621318373072, + "grad_norm": 2.0679366388483134, + "learning_rate": 9.879643257378146e-06, + "loss": 0.4019, + "step": 698 + }, + { + "epoch": 0.09803646563814866, + "grad_norm": 2.6681583722534667, + "learning_rate": 9.879147416997544e-06, + "loss": 0.4249, + "step": 699 + }, + { + "epoch": 0.09817671809256662, + "grad_norm": 2.508378484917336, + "learning_rate": 9.878650569838963e-06, + "loss": 0.4455, + "step": 700 + }, + { + "epoch": 0.09831697054698457, + "grad_norm": 2.8439817229379787, + "learning_rate": 9.878152716004921e-06, + "loss": 0.3742, + "step": 701 + }, + { + "epoch": 0.09845722300140253, + "grad_norm": 2.649351249312995, + "learning_rate": 9.877653855598148e-06, + "loss": 0.4235, + "step": 702 + }, + { + "epoch": 0.09859747545582048, + "grad_norm": 4.604803786348187, + "learning_rate": 9.87715398872158e-06, + "loss": 0.4095, + "step": 703 + }, + { + "epoch": 0.09873772791023842, + "grad_norm": 3.1705496889656524, + "learning_rate": 9.87665311547836e-06, + "loss": 0.3916, + "step": 704 + }, + { + "epoch": 0.09887798036465638, + "grad_norm": 2.3647080331150105, + "learning_rate": 9.87615123597184e-06, + "loss": 0.4688, + "step": 705 + }, + { + "epoch": 0.09901823281907433, + "grad_norm": 3.7469128935284113, + "learning_rate": 9.875648350305582e-06, + "loss": 0.384, + "step": 706 + }, + { + "epoch": 0.09915848527349229, + "grad_norm": 2.7851332226158423, + "learning_rate": 9.87514445858335e-06, + "loss": 0.3958, + "step": 707 + }, + { + "epoch": 0.09929873772791024, + "grad_norm": 4.455041947943315, + "learning_rate": 9.874639560909118e-06, + "loss": 0.4299, + "step": 708 + }, + { + "epoch": 0.09943899018232819, + "grad_norm": 2.4501353713024754, + "learning_rate": 9.87413365738707e-06, + "loss": 0.4572, + "step": 709 + }, + { + "epoch": 0.09957924263674614, + "grad_norm": 2.2838563009242687, + "learning_rate": 9.873626748121597e-06, + "loss": 0.3923, + "step": 710 + }, + { + "epoch": 0.0997194950911641, + "grad_norm": 2.3631663396973073, + "learning_rate": 9.873118833217294e-06, + "loss": 0.4312, + "step": 711 + }, + { + "epoch": 0.09985974754558205, + "grad_norm": 2.4806088866049043, + "learning_rate": 9.872609912778966e-06, + "loss": 0.4005, + "step": 712 + }, + { + "epoch": 0.1, + "grad_norm": 2.0023346625757017, + "learning_rate": 9.872099986911625e-06, + "loss": 0.4369, + "step": 713 + }, + { + "epoch": 0.10014025245441795, + "grad_norm": 3.2275309367806186, + "learning_rate": 9.871589055720489e-06, + "loss": 0.4363, + "step": 714 + }, + { + "epoch": 0.1002805049088359, + "grad_norm": 1.9443648761030785, + "learning_rate": 9.87107711931099e-06, + "loss": 0.3703, + "step": 715 + }, + { + "epoch": 0.10042075736325386, + "grad_norm": 2.094399536499077, + "learning_rate": 9.870564177788758e-06, + "loss": 0.433, + "step": 716 + }, + { + "epoch": 0.10056100981767181, + "grad_norm": 3.0015091279193054, + "learning_rate": 9.870050231259636e-06, + "loss": 0.3796, + "step": 717 + }, + { + "epoch": 0.10070126227208977, + "grad_norm": 2.31805381914514, + "learning_rate": 9.869535279829674e-06, + "loss": 0.4246, + "step": 718 + }, + { + "epoch": 0.10084151472650771, + "grad_norm": 2.3824709788233105, + "learning_rate": 9.86901932360513e-06, + "loss": 0.4044, + "step": 719 + }, + { + "epoch": 0.10098176718092566, + "grad_norm": 1.8945659201684293, + "learning_rate": 9.868502362692463e-06, + "loss": 0.4089, + "step": 720 + }, + { + "epoch": 0.10112201963534362, + "grad_norm": 2.5057419640142853, + "learning_rate": 9.867984397198349e-06, + "loss": 0.4598, + "step": 721 + }, + { + "epoch": 0.10126227208976157, + "grad_norm": 2.706606877078446, + "learning_rate": 9.867465427229665e-06, + "loss": 0.4719, + "step": 722 + }, + { + "epoch": 0.10140252454417953, + "grad_norm": 2.8465567351234817, + "learning_rate": 9.866945452893497e-06, + "loss": 0.4188, + "step": 723 + }, + { + "epoch": 0.10154277699859747, + "grad_norm": 2.111371261339523, + "learning_rate": 9.866424474297139e-06, + "loss": 0.4069, + "step": 724 + }, + { + "epoch": 0.10168302945301542, + "grad_norm": 3.4225936901345775, + "learning_rate": 9.86590249154809e-06, + "loss": 0.4465, + "step": 725 + }, + { + "epoch": 0.10182328190743338, + "grad_norm": 2.1327977441968957, + "learning_rate": 9.865379504754056e-06, + "loss": 0.4267, + "step": 726 + }, + { + "epoch": 0.10196353436185134, + "grad_norm": 3.295065288215254, + "learning_rate": 9.864855514022955e-06, + "loss": 0.3909, + "step": 727 + }, + { + "epoch": 0.10210378681626929, + "grad_norm": 2.563103488477448, + "learning_rate": 9.864330519462906e-06, + "loss": 0.4222, + "step": 728 + }, + { + "epoch": 0.10224403927068723, + "grad_norm": 2.233006304904973, + "learning_rate": 9.86380452118224e-06, + "loss": 0.4569, + "step": 729 + }, + { + "epoch": 0.10238429172510519, + "grad_norm": 2.0686464911897042, + "learning_rate": 9.863277519289493e-06, + "loss": 0.4134, + "step": 730 + }, + { + "epoch": 0.10252454417952314, + "grad_norm": 2.2164254705048148, + "learning_rate": 9.862749513893405e-06, + "loss": 0.4526, + "step": 731 + }, + { + "epoch": 0.1026647966339411, + "grad_norm": 2.1318399636356853, + "learning_rate": 9.862220505102933e-06, + "loss": 0.4812, + "step": 732 + }, + { + "epoch": 0.10280504908835905, + "grad_norm": 3.259832327807662, + "learning_rate": 9.861690493027226e-06, + "loss": 0.4289, + "step": 733 + }, + { + "epoch": 0.10294530154277699, + "grad_norm": 1.8344255430676608, + "learning_rate": 9.861159477775653e-06, + "loss": 0.4309, + "step": 734 + }, + { + "epoch": 0.10308555399719495, + "grad_norm": 2.3153925152211743, + "learning_rate": 9.860627459457785e-06, + "loss": 0.3976, + "step": 735 + }, + { + "epoch": 0.1032258064516129, + "grad_norm": 2.228805108621936, + "learning_rate": 9.8600944381834e-06, + "loss": 0.3945, + "step": 736 + }, + { + "epoch": 0.10336605890603086, + "grad_norm": 2.957773092887003, + "learning_rate": 9.859560414062483e-06, + "loss": 0.4616, + "step": 737 + }, + { + "epoch": 0.10350631136044881, + "grad_norm": 4.192957268506033, + "learning_rate": 9.859025387205225e-06, + "loss": 0.3786, + "step": 738 + }, + { + "epoch": 0.10364656381486675, + "grad_norm": 2.261538799815849, + "learning_rate": 9.858489357722028e-06, + "loss": 0.4094, + "step": 739 + }, + { + "epoch": 0.10378681626928471, + "grad_norm": 1.9781449809579394, + "learning_rate": 9.857952325723496e-06, + "loss": 0.4258, + "step": 740 + }, + { + "epoch": 0.10392706872370266, + "grad_norm": 3.0515006141618226, + "learning_rate": 9.857414291320441e-06, + "loss": 0.4287, + "step": 741 + }, + { + "epoch": 0.10406732117812062, + "grad_norm": 2.941254671403553, + "learning_rate": 9.856875254623883e-06, + "loss": 0.4309, + "step": 742 + }, + { + "epoch": 0.10420757363253857, + "grad_norm": 3.2492503255239478, + "learning_rate": 9.85633521574505e-06, + "loss": 0.3882, + "step": 743 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 2.8602378668936614, + "learning_rate": 9.855794174795374e-06, + "loss": 0.431, + "step": 744 + }, + { + "epoch": 0.10448807854137447, + "grad_norm": 2.165234300123747, + "learning_rate": 9.855252131886495e-06, + "loss": 0.4007, + "step": 745 + }, + { + "epoch": 0.10462833099579243, + "grad_norm": 2.9322890467431586, + "learning_rate": 9.854709087130261e-06, + "loss": 0.4117, + "step": 746 + }, + { + "epoch": 0.10476858345021038, + "grad_norm": 3.8150778638260667, + "learning_rate": 9.854165040638724e-06, + "loss": 0.3997, + "step": 747 + }, + { + "epoch": 0.10490883590462834, + "grad_norm": 2.3792130095538573, + "learning_rate": 9.853619992524144e-06, + "loss": 0.425, + "step": 748 + }, + { + "epoch": 0.10504908835904628, + "grad_norm": 2.2364872904944515, + "learning_rate": 9.85307394289899e-06, + "loss": 0.3893, + "step": 749 + }, + { + "epoch": 0.10518934081346423, + "grad_norm": 2.5435134140329834, + "learning_rate": 9.852526891875932e-06, + "loss": 0.4489, + "step": 750 + }, + { + "epoch": 0.10532959326788219, + "grad_norm": 3.293084446875812, + "learning_rate": 9.851978839567856e-06, + "loss": 0.4373, + "step": 751 + }, + { + "epoch": 0.10546984572230014, + "grad_norm": 2.177715325379126, + "learning_rate": 9.851429786087842e-06, + "loss": 0.457, + "step": 752 + }, + { + "epoch": 0.1056100981767181, + "grad_norm": 3.253567571519585, + "learning_rate": 9.850879731549188e-06, + "loss": 0.4378, + "step": 753 + }, + { + "epoch": 0.10575035063113604, + "grad_norm": 2.5268366450838724, + "learning_rate": 9.85032867606539e-06, + "loss": 0.4572, + "step": 754 + }, + { + "epoch": 0.105890603085554, + "grad_norm": 2.540245179785859, + "learning_rate": 9.84977661975016e-06, + "loss": 0.3745, + "step": 755 + }, + { + "epoch": 0.10603085553997195, + "grad_norm": 3.3641119627764624, + "learning_rate": 9.849223562717404e-06, + "loss": 0.4701, + "step": 756 + }, + { + "epoch": 0.1061711079943899, + "grad_norm": 3.351479411484646, + "learning_rate": 9.848669505081248e-06, + "loss": 0.4161, + "step": 757 + }, + { + "epoch": 0.10631136044880786, + "grad_norm": 3.0313470569291834, + "learning_rate": 9.848114446956015e-06, + "loss": 0.4443, + "step": 758 + }, + { + "epoch": 0.1064516129032258, + "grad_norm": 3.397566339492137, + "learning_rate": 9.847558388456237e-06, + "loss": 0.4039, + "step": 759 + }, + { + "epoch": 0.10659186535764376, + "grad_norm": 3.120050707184945, + "learning_rate": 9.847001329696653e-06, + "loss": 0.4159, + "step": 760 + }, + { + "epoch": 0.10673211781206171, + "grad_norm": 3.133738849317252, + "learning_rate": 9.846443270792209e-06, + "loss": 0.417, + "step": 761 + }, + { + "epoch": 0.10687237026647967, + "grad_norm": 2.726730210357395, + "learning_rate": 9.845884211858054e-06, + "loss": 0.3905, + "step": 762 + }, + { + "epoch": 0.10701262272089762, + "grad_norm": 2.4747384751989214, + "learning_rate": 9.84532415300955e-06, + "loss": 0.4183, + "step": 763 + }, + { + "epoch": 0.10715287517531556, + "grad_norm": 2.678313233783397, + "learning_rate": 9.84476309436226e-06, + "loss": 0.3922, + "step": 764 + }, + { + "epoch": 0.10729312762973352, + "grad_norm": 2.292223313668754, + "learning_rate": 9.844201036031952e-06, + "loss": 0.4401, + "step": 765 + }, + { + "epoch": 0.10743338008415147, + "grad_norm": 2.389456369829056, + "learning_rate": 9.843637978134604e-06, + "loss": 0.3891, + "step": 766 + }, + { + "epoch": 0.10757363253856943, + "grad_norm": 2.6836818808308855, + "learning_rate": 9.843073920786402e-06, + "loss": 0.4201, + "step": 767 + }, + { + "epoch": 0.10771388499298738, + "grad_norm": 2.0464798640554407, + "learning_rate": 9.84250886410373e-06, + "loss": 0.4135, + "step": 768 + }, + { + "epoch": 0.10785413744740532, + "grad_norm": 3.007364769841982, + "learning_rate": 9.841942808203188e-06, + "loss": 0.3846, + "step": 769 + }, + { + "epoch": 0.10799438990182328, + "grad_norm": 2.807392823915712, + "learning_rate": 9.841375753201575e-06, + "loss": 0.3458, + "step": 770 + }, + { + "epoch": 0.10813464235624123, + "grad_norm": 2.824721759833576, + "learning_rate": 9.8408076992159e-06, + "loss": 0.4155, + "step": 771 + }, + { + "epoch": 0.10827489481065919, + "grad_norm": 2.45043866259281, + "learning_rate": 9.840238646363378e-06, + "loss": 0.4109, + "step": 772 + }, + { + "epoch": 0.10841514726507714, + "grad_norm": 2.7188753997997464, + "learning_rate": 9.839668594761427e-06, + "loss": 0.4679, + "step": 773 + }, + { + "epoch": 0.10855539971949509, + "grad_norm": 2.3847993165681585, + "learning_rate": 9.839097544527674e-06, + "loss": 0.4439, + "step": 774 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 2.776012675770339, + "learning_rate": 9.838525495779952e-06, + "loss": 0.4231, + "step": 775 + }, + { + "epoch": 0.108835904628331, + "grad_norm": 7.936009304610304, + "learning_rate": 9.837952448636298e-06, + "loss": 0.4633, + "step": 776 + }, + { + "epoch": 0.10897615708274895, + "grad_norm": 3.7583725605092515, + "learning_rate": 9.837378403214957e-06, + "loss": 0.4045, + "step": 777 + }, + { + "epoch": 0.1091164095371669, + "grad_norm": 3.702018089485178, + "learning_rate": 9.836803359634379e-06, + "loss": 0.428, + "step": 778 + }, + { + "epoch": 0.10925666199158485, + "grad_norm": 2.394826948567772, + "learning_rate": 9.836227318013219e-06, + "loss": 0.4636, + "step": 779 + }, + { + "epoch": 0.1093969144460028, + "grad_norm": 2.836542238434387, + "learning_rate": 9.835650278470343e-06, + "loss": 0.4353, + "step": 780 + }, + { + "epoch": 0.10953716690042076, + "grad_norm": 2.2712740222624177, + "learning_rate": 9.835072241124815e-06, + "loss": 0.3667, + "step": 781 + }, + { + "epoch": 0.10967741935483871, + "grad_norm": 3.353774543413632, + "learning_rate": 9.834493206095911e-06, + "loss": 0.414, + "step": 782 + }, + { + "epoch": 0.10981767180925667, + "grad_norm": 2.513856633531474, + "learning_rate": 9.83391317350311e-06, + "loss": 0.4186, + "step": 783 + }, + { + "epoch": 0.10995792426367461, + "grad_norm": 2.148116986035295, + "learning_rate": 9.833332143466099e-06, + "loss": 0.4224, + "step": 784 + }, + { + "epoch": 0.11009817671809256, + "grad_norm": 2.120995057340813, + "learning_rate": 9.832750116104768e-06, + "loss": 0.4, + "step": 785 + }, + { + "epoch": 0.11023842917251052, + "grad_norm": 2.9049666657890763, + "learning_rate": 9.832167091539215e-06, + "loss": 0.4156, + "step": 786 + }, + { + "epoch": 0.11037868162692847, + "grad_norm": 2.0270828931859612, + "learning_rate": 9.831583069889742e-06, + "loss": 0.4413, + "step": 787 + }, + { + "epoch": 0.11051893408134643, + "grad_norm": 2.719837190579929, + "learning_rate": 9.830998051276858e-06, + "loss": 0.4132, + "step": 788 + }, + { + "epoch": 0.11065918653576437, + "grad_norm": 6.453595825458017, + "learning_rate": 9.83041203582128e-06, + "loss": 0.413, + "step": 789 + }, + { + "epoch": 0.11079943899018233, + "grad_norm": 2.571938600367921, + "learning_rate": 9.829825023643926e-06, + "loss": 0.4153, + "step": 790 + }, + { + "epoch": 0.11093969144460028, + "grad_norm": 4.072679881882209, + "learning_rate": 9.829237014865921e-06, + "loss": 0.4188, + "step": 791 + }, + { + "epoch": 0.11107994389901824, + "grad_norm": 2.567249269955474, + "learning_rate": 9.828648009608598e-06, + "loss": 0.4763, + "step": 792 + }, + { + "epoch": 0.11122019635343619, + "grad_norm": 3.0306669638024233, + "learning_rate": 9.828058007993496e-06, + "loss": 0.4551, + "step": 793 + }, + { + "epoch": 0.11136044880785413, + "grad_norm": 2.5668201780444253, + "learning_rate": 9.827467010142352e-06, + "loss": 0.4332, + "step": 794 + }, + { + "epoch": 0.11150070126227209, + "grad_norm": 2.196304694391903, + "learning_rate": 9.82687501617712e-06, + "loss": 0.429, + "step": 795 + }, + { + "epoch": 0.11164095371669004, + "grad_norm": 2.0864192106401043, + "learning_rate": 9.826282026219953e-06, + "loss": 0.4301, + "step": 796 + }, + { + "epoch": 0.111781206171108, + "grad_norm": 2.979370074675202, + "learning_rate": 9.825688040393206e-06, + "loss": 0.4356, + "step": 797 + }, + { + "epoch": 0.11192145862552595, + "grad_norm": 2.701817455362829, + "learning_rate": 9.825093058819448e-06, + "loss": 0.4379, + "step": 798 + }, + { + "epoch": 0.1120617110799439, + "grad_norm": 2.8518501032190433, + "learning_rate": 9.824497081621449e-06, + "loss": 0.3844, + "step": 799 + }, + { + "epoch": 0.11220196353436185, + "grad_norm": 3.0058642927083716, + "learning_rate": 9.823900108922183e-06, + "loss": 0.4006, + "step": 800 + }, + { + "epoch": 0.1123422159887798, + "grad_norm": 2.612046752744941, + "learning_rate": 9.823302140844833e-06, + "loss": 0.4017, + "step": 801 + }, + { + "epoch": 0.11248246844319776, + "grad_norm": 11.518853696376025, + "learning_rate": 9.822703177512783e-06, + "loss": 0.4233, + "step": 802 + }, + { + "epoch": 0.11262272089761571, + "grad_norm": 2.718768152636732, + "learning_rate": 9.822103219049625e-06, + "loss": 0.4791, + "step": 803 + }, + { + "epoch": 0.11276297335203365, + "grad_norm": 3.4314865225341475, + "learning_rate": 9.82150226557916e-06, + "loss": 0.4689, + "step": 804 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 3.1112021440478483, + "learning_rate": 9.820900317225388e-06, + "loss": 0.4518, + "step": 805 + }, + { + "epoch": 0.11304347826086956, + "grad_norm": 2.399677465541807, + "learning_rate": 9.820297374112518e-06, + "loss": 0.3647, + "step": 806 + }, + { + "epoch": 0.11318373071528752, + "grad_norm": 2.4453337549620375, + "learning_rate": 9.81969343636496e-06, + "loss": 0.439, + "step": 807 + }, + { + "epoch": 0.11332398316970548, + "grad_norm": 2.6232748266317882, + "learning_rate": 9.819088504107335e-06, + "loss": 0.3992, + "step": 808 + }, + { + "epoch": 0.11346423562412342, + "grad_norm": 4.417556649035361, + "learning_rate": 9.818482577464466e-06, + "loss": 0.4111, + "step": 809 + }, + { + "epoch": 0.11360448807854137, + "grad_norm": 2.2717629259425403, + "learning_rate": 9.817875656561382e-06, + "loss": 0.4294, + "step": 810 + }, + { + "epoch": 0.11374474053295933, + "grad_norm": 2.3737422222634765, + "learning_rate": 9.817267741523318e-06, + "loss": 0.4107, + "step": 811 + }, + { + "epoch": 0.11388499298737728, + "grad_norm": 3.380901968878859, + "learning_rate": 9.816658832475709e-06, + "loss": 0.4314, + "step": 812 + }, + { + "epoch": 0.11402524544179524, + "grad_norm": 3.1898527251301148, + "learning_rate": 9.816048929544202e-06, + "loss": 0.3904, + "step": 813 + }, + { + "epoch": 0.11416549789621318, + "grad_norm": 2.8553862553276845, + "learning_rate": 9.815438032854648e-06, + "loss": 0.3901, + "step": 814 + }, + { + "epoch": 0.11430575035063113, + "grad_norm": 2.5111782032191092, + "learning_rate": 9.814826142533098e-06, + "loss": 0.3837, + "step": 815 + }, + { + "epoch": 0.11444600280504909, + "grad_norm": 3.4285194302203448, + "learning_rate": 9.814213258705813e-06, + "loss": 0.4319, + "step": 816 + }, + { + "epoch": 0.11458625525946704, + "grad_norm": 3.1309580653503524, + "learning_rate": 9.813599381499256e-06, + "loss": 0.4072, + "step": 817 + }, + { + "epoch": 0.114726507713885, + "grad_norm": 2.482951014059725, + "learning_rate": 9.812984511040099e-06, + "loss": 0.4587, + "step": 818 + }, + { + "epoch": 0.11486676016830294, + "grad_norm": 2.1088340913459738, + "learning_rate": 9.812368647455212e-06, + "loss": 0.408, + "step": 819 + }, + { + "epoch": 0.1150070126227209, + "grad_norm": 4.06988909783507, + "learning_rate": 9.811751790871677e-06, + "loss": 0.4454, + "step": 820 + }, + { + "epoch": 0.11514726507713885, + "grad_norm": 1.8865566144506738, + "learning_rate": 9.811133941416778e-06, + "loss": 0.392, + "step": 821 + }, + { + "epoch": 0.1152875175315568, + "grad_norm": 12.298435911891652, + "learning_rate": 9.810515099218004e-06, + "loss": 0.4354, + "step": 822 + }, + { + "epoch": 0.11542776998597476, + "grad_norm": 3.182664232354045, + "learning_rate": 9.809895264403046e-06, + "loss": 0.4247, + "step": 823 + }, + { + "epoch": 0.1155680224403927, + "grad_norm": 2.258350875711285, + "learning_rate": 9.809274437099807e-06, + "loss": 0.4286, + "step": 824 + }, + { + "epoch": 0.11570827489481066, + "grad_norm": 2.4596109357066225, + "learning_rate": 9.808652617436386e-06, + "loss": 0.4023, + "step": 825 + }, + { + "epoch": 0.11584852734922861, + "grad_norm": 2.6044013769554795, + "learning_rate": 9.808029805541097e-06, + "loss": 0.3784, + "step": 826 + }, + { + "epoch": 0.11598877980364657, + "grad_norm": 2.3819645608016073, + "learning_rate": 9.807406001542447e-06, + "loss": 0.4447, + "step": 827 + }, + { + "epoch": 0.11612903225806452, + "grad_norm": 2.540778416579888, + "learning_rate": 9.806781205569155e-06, + "loss": 0.4323, + "step": 828 + }, + { + "epoch": 0.11626928471248246, + "grad_norm": 2.5052212551574122, + "learning_rate": 9.806155417750146e-06, + "loss": 0.4013, + "step": 829 + }, + { + "epoch": 0.11640953716690042, + "grad_norm": 3.5601511121674716, + "learning_rate": 9.805528638214543e-06, + "loss": 0.4157, + "step": 830 + }, + { + "epoch": 0.11654978962131837, + "grad_norm": 3.4347167798684883, + "learning_rate": 9.80490086709168e-06, + "loss": 0.426, + "step": 831 + }, + { + "epoch": 0.11669004207573633, + "grad_norm": 2.1668793011521705, + "learning_rate": 9.804272104511093e-06, + "loss": 0.4199, + "step": 832 + }, + { + "epoch": 0.11683029453015428, + "grad_norm": 2.790386323878258, + "learning_rate": 9.803642350602524e-06, + "loss": 0.4641, + "step": 833 + }, + { + "epoch": 0.11697054698457222, + "grad_norm": 2.229598267981134, + "learning_rate": 9.803011605495916e-06, + "loss": 0.4156, + "step": 834 + }, + { + "epoch": 0.11711079943899018, + "grad_norm": 2.372155242567933, + "learning_rate": 9.802379869321419e-06, + "loss": 0.3742, + "step": 835 + }, + { + "epoch": 0.11725105189340813, + "grad_norm": 2.5267910714167305, + "learning_rate": 9.801747142209388e-06, + "loss": 0.41, + "step": 836 + }, + { + "epoch": 0.11739130434782609, + "grad_norm": 2.798671635514141, + "learning_rate": 9.801113424290381e-06, + "loss": 0.4195, + "step": 837 + }, + { + "epoch": 0.11753155680224404, + "grad_norm": 2.201043951993503, + "learning_rate": 9.800478715695165e-06, + "loss": 0.3917, + "step": 838 + }, + { + "epoch": 0.11767180925666199, + "grad_norm": 2.8375879954671084, + "learning_rate": 9.799843016554701e-06, + "loss": 0.4322, + "step": 839 + }, + { + "epoch": 0.11781206171107994, + "grad_norm": 3.0192007103507255, + "learning_rate": 9.799206327000168e-06, + "loss": 0.4355, + "step": 840 + }, + { + "epoch": 0.1179523141654979, + "grad_norm": 2.4579407868853793, + "learning_rate": 9.798568647162939e-06, + "loss": 0.3999, + "step": 841 + }, + { + "epoch": 0.11809256661991585, + "grad_norm": 2.868648312829615, + "learning_rate": 9.797929977174593e-06, + "loss": 0.4202, + "step": 842 + }, + { + "epoch": 0.1182328190743338, + "grad_norm": 2.444392704656942, + "learning_rate": 9.79729031716692e-06, + "loss": 0.4535, + "step": 843 + }, + { + "epoch": 0.11837307152875175, + "grad_norm": 2.4466970990477193, + "learning_rate": 9.796649667271905e-06, + "loss": 0.4754, + "step": 844 + }, + { + "epoch": 0.1185133239831697, + "grad_norm": 2.1970605373037944, + "learning_rate": 9.796008027621744e-06, + "loss": 0.4266, + "step": 845 + }, + { + "epoch": 0.11865357643758766, + "grad_norm": 3.0419740042594596, + "learning_rate": 9.795365398348833e-06, + "loss": 0.418, + "step": 846 + }, + { + "epoch": 0.11879382889200561, + "grad_norm": 3.108357632075865, + "learning_rate": 9.794721779585776e-06, + "loss": 0.4455, + "step": 847 + }, + { + "epoch": 0.11893408134642357, + "grad_norm": 3.702574354992489, + "learning_rate": 9.794077171465376e-06, + "loss": 0.4027, + "step": 848 + }, + { + "epoch": 0.11907433380084151, + "grad_norm": 2.5720740314288455, + "learning_rate": 9.79343157412065e-06, + "loss": 0.4496, + "step": 849 + }, + { + "epoch": 0.11921458625525946, + "grad_norm": 2.6534680327852422, + "learning_rate": 9.792784987684804e-06, + "loss": 0.4453, + "step": 850 + }, + { + "epoch": 0.11935483870967742, + "grad_norm": 3.4562435653654218, + "learning_rate": 9.792137412291265e-06, + "loss": 0.4067, + "step": 851 + }, + { + "epoch": 0.11949509116409537, + "grad_norm": 3.6623015574572646, + "learning_rate": 9.791488848073649e-06, + "loss": 0.4412, + "step": 852 + }, + { + "epoch": 0.11963534361851333, + "grad_norm": 2.262821962831728, + "learning_rate": 9.790839295165786e-06, + "loss": 0.3982, + "step": 853 + }, + { + "epoch": 0.11977559607293127, + "grad_norm": 2.088117053722709, + "learning_rate": 9.790188753701704e-06, + "loss": 0.4554, + "step": 854 + }, + { + "epoch": 0.11991584852734923, + "grad_norm": 2.795286356087783, + "learning_rate": 9.789537223815642e-06, + "loss": 0.4554, + "step": 855 + }, + { + "epoch": 0.12005610098176718, + "grad_norm": 2.168853414307077, + "learning_rate": 9.788884705642035e-06, + "loss": 0.4453, + "step": 856 + }, + { + "epoch": 0.12019635343618514, + "grad_norm": 2.8575380188289246, + "learning_rate": 9.788231199315528e-06, + "loss": 0.4203, + "step": 857 + }, + { + "epoch": 0.12033660589060309, + "grad_norm": 2.2694481524226155, + "learning_rate": 9.787576704970965e-06, + "loss": 0.4025, + "step": 858 + }, + { + "epoch": 0.12047685834502103, + "grad_norm": 1.9459592670909753, + "learning_rate": 9.786921222743397e-06, + "loss": 0.3947, + "step": 859 + }, + { + "epoch": 0.12061711079943899, + "grad_norm": 4.6113394996021295, + "learning_rate": 9.78626475276808e-06, + "loss": 0.4228, + "step": 860 + }, + { + "epoch": 0.12075736325385694, + "grad_norm": 2.7197508449600467, + "learning_rate": 9.78560729518047e-06, + "loss": 0.3511, + "step": 861 + }, + { + "epoch": 0.1208976157082749, + "grad_norm": 3.3173452964190853, + "learning_rate": 9.78494885011623e-06, + "loss": 0.4611, + "step": 862 + }, + { + "epoch": 0.12103786816269285, + "grad_norm": 2.579088620518456, + "learning_rate": 9.784289417711225e-06, + "loss": 0.4245, + "step": 863 + }, + { + "epoch": 0.1211781206171108, + "grad_norm": 2.41681768230522, + "learning_rate": 9.783628998101525e-06, + "loss": 0.3745, + "step": 864 + }, + { + "epoch": 0.12131837307152875, + "grad_norm": 2.52596154364446, + "learning_rate": 9.7829675914234e-06, + "loss": 0.4442, + "step": 865 + }, + { + "epoch": 0.1214586255259467, + "grad_norm": 2.217709256935435, + "learning_rate": 9.782305197813332e-06, + "loss": 0.3895, + "step": 866 + }, + { + "epoch": 0.12159887798036466, + "grad_norm": 3.0233614810977767, + "learning_rate": 9.781641817407997e-06, + "loss": 0.4022, + "step": 867 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 3.2395492666401555, + "learning_rate": 9.78097745034428e-06, + "loss": 0.4439, + "step": 868 + }, + { + "epoch": 0.12187938288920055, + "grad_norm": 2.616566507483794, + "learning_rate": 9.780312096759269e-06, + "loss": 0.4003, + "step": 869 + }, + { + "epoch": 0.12201963534361851, + "grad_norm": 2.7211906615180435, + "learning_rate": 9.779645756790255e-06, + "loss": 0.3798, + "step": 870 + }, + { + "epoch": 0.12215988779803647, + "grad_norm": 2.6812172129656013, + "learning_rate": 9.77897843057473e-06, + "loss": 0.4149, + "step": 871 + }, + { + "epoch": 0.12230014025245442, + "grad_norm": 2.76963326033593, + "learning_rate": 9.778310118250397e-06, + "loss": 0.3767, + "step": 872 + }, + { + "epoch": 0.12244039270687238, + "grad_norm": 2.943419316016232, + "learning_rate": 9.777640819955154e-06, + "loss": 0.3773, + "step": 873 + }, + { + "epoch": 0.12258064516129032, + "grad_norm": 2.0576256460804427, + "learning_rate": 9.776970535827109e-06, + "loss": 0.4245, + "step": 874 + }, + { + "epoch": 0.12272089761570827, + "grad_norm": 2.1816033588119184, + "learning_rate": 9.776299266004565e-06, + "loss": 0.3999, + "step": 875 + }, + { + "epoch": 0.12286115007012623, + "grad_norm": 2.684569957257161, + "learning_rate": 9.775627010626039e-06, + "loss": 0.3935, + "step": 876 + }, + { + "epoch": 0.12300140252454418, + "grad_norm": 2.4828538376263944, + "learning_rate": 9.774953769830245e-06, + "loss": 0.4415, + "step": 877 + }, + { + "epoch": 0.12314165497896214, + "grad_norm": 2.1320748083935577, + "learning_rate": 9.7742795437561e-06, + "loss": 0.4531, + "step": 878 + }, + { + "epoch": 0.12328190743338008, + "grad_norm": 2.6471226672034236, + "learning_rate": 9.77360433254273e-06, + "loss": 0.4024, + "step": 879 + }, + { + "epoch": 0.12342215988779803, + "grad_norm": 2.0813399378500494, + "learning_rate": 9.772928136329454e-06, + "loss": 0.4201, + "step": 880 + }, + { + "epoch": 0.12356241234221599, + "grad_norm": 2.34991904585982, + "learning_rate": 9.772250955255804e-06, + "loss": 0.4167, + "step": 881 + }, + { + "epoch": 0.12370266479663394, + "grad_norm": 2.025609680513328, + "learning_rate": 9.77157278946151e-06, + "loss": 0.4023, + "step": 882 + }, + { + "epoch": 0.1238429172510519, + "grad_norm": 3.895887622692978, + "learning_rate": 9.77089363908651e-06, + "loss": 0.4525, + "step": 883 + }, + { + "epoch": 0.12398316970546984, + "grad_norm": 2.4118179701255458, + "learning_rate": 9.770213504270939e-06, + "loss": 0.4303, + "step": 884 + }, + { + "epoch": 0.1241234221598878, + "grad_norm": 2.529015484920472, + "learning_rate": 9.769532385155137e-06, + "loss": 0.4452, + "step": 885 + }, + { + "epoch": 0.12426367461430575, + "grad_norm": 2.3987646221532235, + "learning_rate": 9.768850281879651e-06, + "loss": 0.3993, + "step": 886 + }, + { + "epoch": 0.1244039270687237, + "grad_norm": 3.0088480170100538, + "learning_rate": 9.768167194585227e-06, + "loss": 0.4171, + "step": 887 + }, + { + "epoch": 0.12454417952314166, + "grad_norm": 3.5148509890144113, + "learning_rate": 9.767483123412817e-06, + "loss": 0.4016, + "step": 888 + }, + { + "epoch": 0.1246844319775596, + "grad_norm": 2.9459247306950833, + "learning_rate": 9.766798068503572e-06, + "loss": 0.4302, + "step": 889 + }, + { + "epoch": 0.12482468443197756, + "grad_norm": 2.6089431496710183, + "learning_rate": 9.766112029998847e-06, + "loss": 0.4113, + "step": 890 + }, + { + "epoch": 0.12496493688639551, + "grad_norm": 2.138543557972791, + "learning_rate": 9.765425008040206e-06, + "loss": 0.414, + "step": 891 + }, + { + "epoch": 0.12510518934081347, + "grad_norm": 3.734299279885769, + "learning_rate": 9.764737002769406e-06, + "loss": 0.4288, + "step": 892 + }, + { + "epoch": 0.12524544179523142, + "grad_norm": 3.4320503732768075, + "learning_rate": 9.764048014328417e-06, + "loss": 0.4266, + "step": 893 + }, + { + "epoch": 0.12538569424964938, + "grad_norm": 2.7518215215660256, + "learning_rate": 9.763358042859403e-06, + "loss": 0.4388, + "step": 894 + }, + { + "epoch": 0.12552594670406733, + "grad_norm": 2.5858301791598963, + "learning_rate": 9.762667088504737e-06, + "loss": 0.4239, + "step": 895 + }, + { + "epoch": 0.1256661991584853, + "grad_norm": 2.5484777918658708, + "learning_rate": 9.761975151406991e-06, + "loss": 0.3979, + "step": 896 + }, + { + "epoch": 0.12580645161290321, + "grad_norm": 2.997854260150099, + "learning_rate": 9.761282231708942e-06, + "loss": 0.4462, + "step": 897 + }, + { + "epoch": 0.12594670406732117, + "grad_norm": 2.349823267358076, + "learning_rate": 9.76058832955357e-06, + "loss": 0.4192, + "step": 898 + }, + { + "epoch": 0.12608695652173912, + "grad_norm": 4.1198200879119335, + "learning_rate": 9.759893445084059e-06, + "loss": 0.3794, + "step": 899 + }, + { + "epoch": 0.12622720897615708, + "grad_norm": 3.0641468835986925, + "learning_rate": 9.759197578443787e-06, + "loss": 0.4222, + "step": 900 + }, + { + "epoch": 0.12636746143057503, + "grad_norm": 2.389251214527134, + "learning_rate": 9.758500729776348e-06, + "loss": 0.4267, + "step": 901 + }, + { + "epoch": 0.126507713884993, + "grad_norm": 3.261539806094398, + "learning_rate": 9.757802899225527e-06, + "loss": 0.3882, + "step": 902 + }, + { + "epoch": 0.12664796633941094, + "grad_norm": 8.668433913977546, + "learning_rate": 9.757104086935319e-06, + "loss": 0.3938, + "step": 903 + }, + { + "epoch": 0.1267882187938289, + "grad_norm": 2.1680835590377336, + "learning_rate": 9.756404293049918e-06, + "loss": 0.4654, + "step": 904 + }, + { + "epoch": 0.12692847124824685, + "grad_norm": 3.875022642007943, + "learning_rate": 9.755703517713722e-06, + "loss": 0.4069, + "step": 905 + }, + { + "epoch": 0.1270687237026648, + "grad_norm": 2.9620203136852528, + "learning_rate": 9.755001761071333e-06, + "loss": 0.4111, + "step": 906 + }, + { + "epoch": 0.12720897615708274, + "grad_norm": 3.3351562442810816, + "learning_rate": 9.754299023267548e-06, + "loss": 0.392, + "step": 907 + }, + { + "epoch": 0.1273492286115007, + "grad_norm": 2.243303931479571, + "learning_rate": 9.753595304447379e-06, + "loss": 0.374, + "step": 908 + }, + { + "epoch": 0.12748948106591865, + "grad_norm": 3.672374934799436, + "learning_rate": 9.752890604756029e-06, + "loss": 0.4477, + "step": 909 + }, + { + "epoch": 0.1276297335203366, + "grad_norm": 2.5034939192844154, + "learning_rate": 9.75218492433891e-06, + "loss": 0.4434, + "step": 910 + }, + { + "epoch": 0.12776998597475456, + "grad_norm": 2.384482477048135, + "learning_rate": 9.751478263341631e-06, + "loss": 0.4102, + "step": 911 + }, + { + "epoch": 0.1279102384291725, + "grad_norm": 2.4232628568172254, + "learning_rate": 9.75077062191001e-06, + "loss": 0.381, + "step": 912 + }, + { + "epoch": 0.12805049088359047, + "grad_norm": 4.9417764035027725, + "learning_rate": 9.750062000190063e-06, + "loss": 0.4107, + "step": 913 + }, + { + "epoch": 0.12819074333800842, + "grad_norm": 4.520883211250388, + "learning_rate": 9.74935239832801e-06, + "loss": 0.4231, + "step": 914 + }, + { + "epoch": 0.12833099579242638, + "grad_norm": 3.435301615259594, + "learning_rate": 9.748641816470268e-06, + "loss": 0.4046, + "step": 915 + }, + { + "epoch": 0.12847124824684433, + "grad_norm": 3.6134869821075655, + "learning_rate": 9.747930254763467e-06, + "loss": 0.4328, + "step": 916 + }, + { + "epoch": 0.12861150070126226, + "grad_norm": 4.823240309114587, + "learning_rate": 9.747217713354428e-06, + "loss": 0.4142, + "step": 917 + }, + { + "epoch": 0.12875175315568022, + "grad_norm": 3.1103437438579125, + "learning_rate": 9.746504192390181e-06, + "loss": 0.4307, + "step": 918 + }, + { + "epoch": 0.12889200561009817, + "grad_norm": 2.9851943883428844, + "learning_rate": 9.745789692017955e-06, + "loss": 0.4469, + "step": 919 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 2.440124468208358, + "learning_rate": 9.745074212385183e-06, + "loss": 0.4925, + "step": 920 + }, + { + "epoch": 0.12917251051893408, + "grad_norm": 3.015941385858175, + "learning_rate": 9.7443577536395e-06, + "loss": 0.3947, + "step": 921 + }, + { + "epoch": 0.12931276297335204, + "grad_norm": 3.8219685777762846, + "learning_rate": 9.74364031592874e-06, + "loss": 0.4495, + "step": 922 + }, + { + "epoch": 0.12945301542777, + "grad_norm": 3.3775169029914087, + "learning_rate": 9.742921899400942e-06, + "loss": 0.4206, + "step": 923 + }, + { + "epoch": 0.12959326788218795, + "grad_norm": 2.3453750063237404, + "learning_rate": 9.742202504204348e-06, + "loss": 0.4499, + "step": 924 + }, + { + "epoch": 0.1297335203366059, + "grad_norm": 2.4678149806678364, + "learning_rate": 9.741482130487398e-06, + "loss": 0.4204, + "step": 925 + }, + { + "epoch": 0.12987377279102386, + "grad_norm": 1.8421419534399446, + "learning_rate": 9.740760778398737e-06, + "loss": 0.4204, + "step": 926 + }, + { + "epoch": 0.13001402524544178, + "grad_norm": 3.2702555702966323, + "learning_rate": 9.740038448087213e-06, + "loss": 0.4287, + "step": 927 + }, + { + "epoch": 0.13015427769985974, + "grad_norm": 2.8921565766744832, + "learning_rate": 9.739315139701868e-06, + "loss": 0.3727, + "step": 928 + }, + { + "epoch": 0.1302945301542777, + "grad_norm": 3.3246820398187786, + "learning_rate": 9.738590853391959e-06, + "loss": 0.3994, + "step": 929 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 3.3943434335178178, + "learning_rate": 9.737865589306932e-06, + "loss": 0.3994, + "step": 930 + }, + { + "epoch": 0.1305750350631136, + "grad_norm": 2.353189912789768, + "learning_rate": 9.737139347596443e-06, + "loss": 0.4097, + "step": 931 + }, + { + "epoch": 0.13071528751753156, + "grad_norm": 2.677553355792189, + "learning_rate": 9.736412128410346e-06, + "loss": 0.4509, + "step": 932 + }, + { + "epoch": 0.1308555399719495, + "grad_norm": 3.2895532092470074, + "learning_rate": 9.735683931898697e-06, + "loss": 0.3784, + "step": 933 + }, + { + "epoch": 0.13099579242636747, + "grad_norm": 2.628733477647113, + "learning_rate": 9.734954758211754e-06, + "loss": 0.3452, + "step": 934 + }, + { + "epoch": 0.13113604488078542, + "grad_norm": 2.404896838697879, + "learning_rate": 9.734224607499978e-06, + "loss": 0.3717, + "step": 935 + }, + { + "epoch": 0.13127629733520338, + "grad_norm": 2.388326699495475, + "learning_rate": 9.733493479914031e-06, + "loss": 0.379, + "step": 936 + }, + { + "epoch": 0.1314165497896213, + "grad_norm": 3.113983572855933, + "learning_rate": 9.732761375604773e-06, + "loss": 0.4343, + "step": 937 + }, + { + "epoch": 0.13155680224403926, + "grad_norm": 2.8419655032121263, + "learning_rate": 9.732028294723273e-06, + "loss": 0.4055, + "step": 938 + }, + { + "epoch": 0.13169705469845722, + "grad_norm": 2.582295783684513, + "learning_rate": 9.731294237420795e-06, + "loss": 0.3656, + "step": 939 + }, + { + "epoch": 0.13183730715287517, + "grad_norm": 2.5227941868161188, + "learning_rate": 9.730559203848807e-06, + "loss": 0.4454, + "step": 940 + }, + { + "epoch": 0.13197755960729313, + "grad_norm": 2.8135475750865475, + "learning_rate": 9.729823194158977e-06, + "loss": 0.4111, + "step": 941 + }, + { + "epoch": 0.13211781206171108, + "grad_norm": 2.9114703801997153, + "learning_rate": 9.729086208503174e-06, + "loss": 0.4128, + "step": 942 + }, + { + "epoch": 0.13225806451612904, + "grad_norm": 2.6629800247377187, + "learning_rate": 9.728348247033474e-06, + "loss": 0.3627, + "step": 943 + }, + { + "epoch": 0.132398316970547, + "grad_norm": 2.1609781367631555, + "learning_rate": 9.727609309902148e-06, + "loss": 0.4349, + "step": 944 + }, + { + "epoch": 0.13253856942496495, + "grad_norm": 2.8621154326932787, + "learning_rate": 9.72686939726167e-06, + "loss": 0.4068, + "step": 945 + }, + { + "epoch": 0.1326788218793829, + "grad_norm": 3.307619061622956, + "learning_rate": 9.726128509264715e-06, + "loss": 0.4172, + "step": 946 + }, + { + "epoch": 0.13281907433380083, + "grad_norm": 4.036258159309049, + "learning_rate": 9.725386646064164e-06, + "loss": 0.387, + "step": 947 + }, + { + "epoch": 0.13295932678821878, + "grad_norm": 2.617637663885529, + "learning_rate": 9.724643807813092e-06, + "loss": 0.3825, + "step": 948 + }, + { + "epoch": 0.13309957924263674, + "grad_norm": 8.049112249671543, + "learning_rate": 9.723899994664779e-06, + "loss": 0.3981, + "step": 949 + }, + { + "epoch": 0.1332398316970547, + "grad_norm": 2.627640531431317, + "learning_rate": 9.723155206772705e-06, + "loss": 0.3793, + "step": 950 + }, + { + "epoch": 0.13338008415147265, + "grad_norm": 2.887861425726099, + "learning_rate": 9.722409444290555e-06, + "loss": 0.3941, + "step": 951 + }, + { + "epoch": 0.1335203366058906, + "grad_norm": 2.5768409074280902, + "learning_rate": 9.721662707372208e-06, + "loss": 0.3862, + "step": 952 + }, + { + "epoch": 0.13366058906030856, + "grad_norm": 3.131483752268668, + "learning_rate": 9.720914996171748e-06, + "loss": 0.4969, + "step": 953 + }, + { + "epoch": 0.13380084151472652, + "grad_norm": 3.2768043556143454, + "learning_rate": 9.720166310843464e-06, + "loss": 0.4152, + "step": 954 + }, + { + "epoch": 0.13394109396914447, + "grad_norm": 3.9301772068441476, + "learning_rate": 9.719416651541839e-06, + "loss": 0.4067, + "step": 955 + }, + { + "epoch": 0.13408134642356243, + "grad_norm": 3.0164926496302686, + "learning_rate": 9.71866601842156e-06, + "loss": 0.4036, + "step": 956 + }, + { + "epoch": 0.13422159887798035, + "grad_norm": 3.7718778115604166, + "learning_rate": 9.717914411637515e-06, + "loss": 0.4159, + "step": 957 + }, + { + "epoch": 0.1343618513323983, + "grad_norm": 3.5147383864653383, + "learning_rate": 9.717161831344792e-06, + "loss": 0.3842, + "step": 958 + }, + { + "epoch": 0.13450210378681626, + "grad_norm": 2.3479731848841063, + "learning_rate": 9.716408277698684e-06, + "loss": 0.4142, + "step": 959 + }, + { + "epoch": 0.13464235624123422, + "grad_norm": 3.2119418241902236, + "learning_rate": 9.71565375085468e-06, + "loss": 0.4163, + "step": 960 + }, + { + "epoch": 0.13478260869565217, + "grad_norm": 3.0045776410583005, + "learning_rate": 9.714898250968468e-06, + "loss": 0.411, + "step": 961 + }, + { + "epoch": 0.13492286115007013, + "grad_norm": 2.7489467684690556, + "learning_rate": 9.714141778195945e-06, + "loss": 0.4149, + "step": 962 + }, + { + "epoch": 0.13506311360448808, + "grad_norm": 2.616605688303831, + "learning_rate": 9.713384332693199e-06, + "loss": 0.3891, + "step": 963 + }, + { + "epoch": 0.13520336605890604, + "grad_norm": 3.734042867151906, + "learning_rate": 9.712625914616528e-06, + "loss": 0.4248, + "step": 964 + }, + { + "epoch": 0.135343618513324, + "grad_norm": 14.348713865695746, + "learning_rate": 9.711866524122424e-06, + "loss": 0.4384, + "step": 965 + }, + { + "epoch": 0.13548387096774195, + "grad_norm": 2.625704534534177, + "learning_rate": 9.711106161367583e-06, + "loss": 0.342, + "step": 966 + }, + { + "epoch": 0.13562412342215988, + "grad_norm": 3.9837470064609795, + "learning_rate": 9.710344826508901e-06, + "loss": 0.4036, + "step": 967 + }, + { + "epoch": 0.13576437587657783, + "grad_norm": 2.5930050618938405, + "learning_rate": 9.70958251970347e-06, + "loss": 0.4178, + "step": 968 + }, + { + "epoch": 0.13590462833099579, + "grad_norm": 2.8534213761385976, + "learning_rate": 9.708819241108594e-06, + "loss": 0.3618, + "step": 969 + }, + { + "epoch": 0.13604488078541374, + "grad_norm": 3.294126838683279, + "learning_rate": 9.708054990881763e-06, + "loss": 0.38, + "step": 970 + }, + { + "epoch": 0.1361851332398317, + "grad_norm": 2.622642548417069, + "learning_rate": 9.70728976918068e-06, + "loss": 0.3757, + "step": 971 + }, + { + "epoch": 0.13632538569424965, + "grad_norm": 3.0116166271900346, + "learning_rate": 9.706523576163238e-06, + "loss": 0.4254, + "step": 972 + }, + { + "epoch": 0.1364656381486676, + "grad_norm": 2.645765278659375, + "learning_rate": 9.70575641198754e-06, + "loss": 0.411, + "step": 973 + }, + { + "epoch": 0.13660589060308556, + "grad_norm": 4.805513140442834, + "learning_rate": 9.704988276811883e-06, + "loss": 0.4039, + "step": 974 + }, + { + "epoch": 0.13674614305750352, + "grad_norm": 3.4626146648714484, + "learning_rate": 9.704219170794766e-06, + "loss": 0.4016, + "step": 975 + }, + { + "epoch": 0.13688639551192147, + "grad_norm": 2.688336973934779, + "learning_rate": 9.703449094094891e-06, + "loss": 0.3756, + "step": 976 + }, + { + "epoch": 0.1370266479663394, + "grad_norm": 4.203176958975426, + "learning_rate": 9.702678046871157e-06, + "loss": 0.3975, + "step": 977 + }, + { + "epoch": 0.13716690042075735, + "grad_norm": 1.929001288469437, + "learning_rate": 9.701906029282662e-06, + "loss": 0.4516, + "step": 978 + }, + { + "epoch": 0.1373071528751753, + "grad_norm": 3.0748518373103986, + "learning_rate": 9.701133041488707e-06, + "loss": 0.3766, + "step": 979 + }, + { + "epoch": 0.13744740532959326, + "grad_norm": 2.981742776574924, + "learning_rate": 9.700359083648795e-06, + "loss": 0.451, + "step": 980 + }, + { + "epoch": 0.13758765778401122, + "grad_norm": 3.729218821809074, + "learning_rate": 9.699584155922625e-06, + "loss": 0.4326, + "step": 981 + }, + { + "epoch": 0.13772791023842917, + "grad_norm": 2.8034787774504792, + "learning_rate": 9.698808258470098e-06, + "loss": 0.3542, + "step": 982 + }, + { + "epoch": 0.13786816269284713, + "grad_norm": 2.4394320142701442, + "learning_rate": 9.698031391451317e-06, + "loss": 0.4563, + "step": 983 + }, + { + "epoch": 0.13800841514726508, + "grad_norm": 2.885935345428439, + "learning_rate": 9.69725355502658e-06, + "loss": 0.4053, + "step": 984 + }, + { + "epoch": 0.13814866760168304, + "grad_norm": 3.043594393972688, + "learning_rate": 9.69647474935639e-06, + "loss": 0.4153, + "step": 985 + }, + { + "epoch": 0.138288920056101, + "grad_norm": 2.764984487728043, + "learning_rate": 9.695694974601447e-06, + "loss": 0.408, + "step": 986 + }, + { + "epoch": 0.13842917251051892, + "grad_norm": 2.4223383306484383, + "learning_rate": 9.694914230922655e-06, + "loss": 0.4151, + "step": 987 + }, + { + "epoch": 0.13856942496493688, + "grad_norm": 3.6252609685157493, + "learning_rate": 9.69413251848111e-06, + "loss": 0.3885, + "step": 988 + }, + { + "epoch": 0.13870967741935483, + "grad_norm": 2.2759624499423015, + "learning_rate": 9.693349837438115e-06, + "loss": 0.3923, + "step": 989 + }, + { + "epoch": 0.1388499298737728, + "grad_norm": 2.9365669554266276, + "learning_rate": 9.692566187955174e-06, + "loss": 0.3845, + "step": 990 + }, + { + "epoch": 0.13899018232819074, + "grad_norm": 3.3294872665883593, + "learning_rate": 9.691781570193983e-06, + "loss": 0.481, + "step": 991 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 3.469740167833233, + "learning_rate": 9.690995984316446e-06, + "loss": 0.3969, + "step": 992 + }, + { + "epoch": 0.13927068723702665, + "grad_norm": 2.1178940160744526, + "learning_rate": 9.69020943048466e-06, + "loss": 0.4025, + "step": 993 + }, + { + "epoch": 0.1394109396914446, + "grad_norm": 2.4580767761667897, + "learning_rate": 9.689421908860928e-06, + "loss": 0.4416, + "step": 994 + }, + { + "epoch": 0.13955119214586256, + "grad_norm": 2.97540597149203, + "learning_rate": 9.688633419607746e-06, + "loss": 0.4181, + "step": 995 + }, + { + "epoch": 0.13969144460028052, + "grad_norm": 2.33622826645214, + "learning_rate": 9.687843962887817e-06, + "loss": 0.4092, + "step": 996 + }, + { + "epoch": 0.13983169705469845, + "grad_norm": 2.6173824408509363, + "learning_rate": 9.687053538864037e-06, + "loss": 0.3904, + "step": 997 + }, + { + "epoch": 0.1399719495091164, + "grad_norm": 4.421298471346341, + "learning_rate": 9.686262147699507e-06, + "loss": 0.4033, + "step": 998 + }, + { + "epoch": 0.14011220196353436, + "grad_norm": 2.6223914491426754, + "learning_rate": 9.685469789557522e-06, + "loss": 0.369, + "step": 999 + }, + { + "epoch": 0.1402524544179523, + "grad_norm": 20.335259184807093, + "learning_rate": 9.684676464601583e-06, + "loss": 0.3867, + "step": 1000 + }, + { + "epoch": 0.14039270687237027, + "grad_norm": 2.5383140047589405, + "learning_rate": 9.683882172995385e-06, + "loss": 0.4046, + "step": 1001 + }, + { + "epoch": 0.14053295932678822, + "grad_norm": 3.27502432078875, + "learning_rate": 9.683086914902825e-06, + "loss": 0.4237, + "step": 1002 + }, + { + "epoch": 0.14067321178120618, + "grad_norm": 3.0193039980076954, + "learning_rate": 9.682290690487997e-06, + "loss": 0.4615, + "step": 1003 + }, + { + "epoch": 0.14081346423562413, + "grad_norm": 2.327672619999904, + "learning_rate": 9.681493499915198e-06, + "loss": 0.4194, + "step": 1004 + }, + { + "epoch": 0.14095371669004209, + "grad_norm": 2.625620450762989, + "learning_rate": 9.680695343348923e-06, + "loss": 0.444, + "step": 1005 + }, + { + "epoch": 0.14109396914446004, + "grad_norm": 3.7116859082901015, + "learning_rate": 9.679896220953866e-06, + "loss": 0.401, + "step": 1006 + }, + { + "epoch": 0.14123422159887797, + "grad_norm": 2.67865005741062, + "learning_rate": 9.679096132894922e-06, + "loss": 0.3901, + "step": 1007 + }, + { + "epoch": 0.14137447405329592, + "grad_norm": 2.8502717741833923, + "learning_rate": 9.678295079337182e-06, + "loss": 0.3979, + "step": 1008 + }, + { + "epoch": 0.14151472650771388, + "grad_norm": 2.3008797953150473, + "learning_rate": 9.677493060445936e-06, + "loss": 0.4327, + "step": 1009 + }, + { + "epoch": 0.14165497896213183, + "grad_norm": 2.4525737750577106, + "learning_rate": 9.676690076386674e-06, + "loss": 0.4726, + "step": 1010 + }, + { + "epoch": 0.1417952314165498, + "grad_norm": 3.1129769017348754, + "learning_rate": 9.675886127325091e-06, + "loss": 0.3784, + "step": 1011 + }, + { + "epoch": 0.14193548387096774, + "grad_norm": 2.9323719797377805, + "learning_rate": 9.675081213427076e-06, + "loss": 0.417, + "step": 1012 + }, + { + "epoch": 0.1420757363253857, + "grad_norm": 3.341162277423039, + "learning_rate": 9.674275334858712e-06, + "loss": 0.3979, + "step": 1013 + }, + { + "epoch": 0.14221598877980365, + "grad_norm": 2.6588434461376877, + "learning_rate": 9.673468491786291e-06, + "loss": 0.4255, + "step": 1014 + }, + { + "epoch": 0.1423562412342216, + "grad_norm": 2.739230134385568, + "learning_rate": 9.672660684376298e-06, + "loss": 0.424, + "step": 1015 + }, + { + "epoch": 0.14249649368863956, + "grad_norm": 4.256385942212301, + "learning_rate": 9.67185191279542e-06, + "loss": 0.4094, + "step": 1016 + }, + { + "epoch": 0.1426367461430575, + "grad_norm": 2.5727243905031387, + "learning_rate": 9.671042177210539e-06, + "loss": 0.4388, + "step": 1017 + }, + { + "epoch": 0.14277699859747545, + "grad_norm": 2.187796436387607, + "learning_rate": 9.670231477788738e-06, + "loss": 0.4022, + "step": 1018 + }, + { + "epoch": 0.1429172510518934, + "grad_norm": 2.4363492547960393, + "learning_rate": 9.669419814697303e-06, + "loss": 0.3693, + "step": 1019 + }, + { + "epoch": 0.14305750350631136, + "grad_norm": 2.7023554241174192, + "learning_rate": 9.668607188103708e-06, + "loss": 0.4447, + "step": 1020 + }, + { + "epoch": 0.1431977559607293, + "grad_norm": 2.8140796887488646, + "learning_rate": 9.667793598175641e-06, + "loss": 0.4771, + "step": 1021 + }, + { + "epoch": 0.14333800841514727, + "grad_norm": 2.7237793454554082, + "learning_rate": 9.666979045080977e-06, + "loss": 0.3448, + "step": 1022 + }, + { + "epoch": 0.14347826086956522, + "grad_norm": 2.3018341504961604, + "learning_rate": 9.666163528987793e-06, + "loss": 0.3679, + "step": 1023 + }, + { + "epoch": 0.14361851332398318, + "grad_norm": 2.617725373979371, + "learning_rate": 9.665347050064362e-06, + "loss": 0.3945, + "step": 1024 + }, + { + "epoch": 0.14375876577840113, + "grad_norm": 2.428341508047377, + "learning_rate": 9.664529608479165e-06, + "loss": 0.38, + "step": 1025 + }, + { + "epoch": 0.1438990182328191, + "grad_norm": 3.367756594119279, + "learning_rate": 9.663711204400872e-06, + "loss": 0.4291, + "step": 1026 + }, + { + "epoch": 0.14403927068723701, + "grad_norm": 2.8160973325239986, + "learning_rate": 9.662891837998354e-06, + "loss": 0.382, + "step": 1027 + }, + { + "epoch": 0.14417952314165497, + "grad_norm": 2.455965960437588, + "learning_rate": 9.662071509440683e-06, + "loss": 0.3799, + "step": 1028 + }, + { + "epoch": 0.14431977559607292, + "grad_norm": 2.5339836147654746, + "learning_rate": 9.661250218897129e-06, + "loss": 0.4704, + "step": 1029 + }, + { + "epoch": 0.14446002805049088, + "grad_norm": 4.8754043411417705, + "learning_rate": 9.660427966537157e-06, + "loss": 0.4086, + "step": 1030 + }, + { + "epoch": 0.14460028050490883, + "grad_norm": 2.761186216557979, + "learning_rate": 9.659604752530434e-06, + "loss": 0.3941, + "step": 1031 + }, + { + "epoch": 0.1447405329593268, + "grad_norm": 1.9542255200251777, + "learning_rate": 9.658780577046826e-06, + "loss": 0.3949, + "step": 1032 + }, + { + "epoch": 0.14488078541374474, + "grad_norm": 2.252739615952699, + "learning_rate": 9.657955440256396e-06, + "loss": 0.4001, + "step": 1033 + }, + { + "epoch": 0.1450210378681627, + "grad_norm": 2.7491462422614648, + "learning_rate": 9.657129342329403e-06, + "loss": 0.4347, + "step": 1034 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 1.9876620484513357, + "learning_rate": 9.656302283436306e-06, + "loss": 0.4149, + "step": 1035 + }, + { + "epoch": 0.1453015427769986, + "grad_norm": 2.0437005739914085, + "learning_rate": 9.655474263747765e-06, + "loss": 0.3812, + "step": 1036 + }, + { + "epoch": 0.14544179523141654, + "grad_norm": 2.3959405184003217, + "learning_rate": 9.654645283434636e-06, + "loss": 0.4127, + "step": 1037 + }, + { + "epoch": 0.1455820476858345, + "grad_norm": 1.950102030701282, + "learning_rate": 9.653815342667973e-06, + "loss": 0.3966, + "step": 1038 + }, + { + "epoch": 0.14572230014025245, + "grad_norm": 2.8776803169128526, + "learning_rate": 9.652984441619028e-06, + "loss": 0.391, + "step": 1039 + }, + { + "epoch": 0.1458625525946704, + "grad_norm": 2.58316619906647, + "learning_rate": 9.65215258045925e-06, + "loss": 0.3968, + "step": 1040 + }, + { + "epoch": 0.14600280504908836, + "grad_norm": 2.1810511763496523, + "learning_rate": 9.651319759360293e-06, + "loss": 0.4204, + "step": 1041 + }, + { + "epoch": 0.1461430575035063, + "grad_norm": 2.0801372680852457, + "learning_rate": 9.650485978493998e-06, + "loss": 0.4226, + "step": 1042 + }, + { + "epoch": 0.14628330995792427, + "grad_norm": 2.943572241079221, + "learning_rate": 9.649651238032412e-06, + "loss": 0.4113, + "step": 1043 + }, + { + "epoch": 0.14642356241234222, + "grad_norm": 2.073304515446183, + "learning_rate": 9.64881553814778e-06, + "loss": 0.4012, + "step": 1044 + }, + { + "epoch": 0.14656381486676018, + "grad_norm": 2.465948325885005, + "learning_rate": 9.647978879012539e-06, + "loss": 0.4245, + "step": 1045 + }, + { + "epoch": 0.14670406732117813, + "grad_norm": 2.030857576463728, + "learning_rate": 9.64714126079933e-06, + "loss": 0.3889, + "step": 1046 + }, + { + "epoch": 0.14684431977559606, + "grad_norm": 2.376926945437948, + "learning_rate": 9.64630268368099e-06, + "loss": 0.4391, + "step": 1047 + }, + { + "epoch": 0.14698457223001402, + "grad_norm": 2.701034354834928, + "learning_rate": 9.645463147830551e-06, + "loss": 0.4072, + "step": 1048 + }, + { + "epoch": 0.14712482468443197, + "grad_norm": 3.014489152689817, + "learning_rate": 9.644622653421249e-06, + "loss": 0.4145, + "step": 1049 + }, + { + "epoch": 0.14726507713884993, + "grad_norm": 3.4944008591168783, + "learning_rate": 9.643781200626512e-06, + "loss": 0.4052, + "step": 1050 + }, + { + "epoch": 0.14740532959326788, + "grad_norm": 2.886312701377284, + "learning_rate": 9.64293878961997e-06, + "loss": 0.4213, + "step": 1051 + }, + { + "epoch": 0.14754558204768584, + "grad_norm": 3.3666889857191293, + "learning_rate": 9.642095420575443e-06, + "loss": 0.4341, + "step": 1052 + }, + { + "epoch": 0.1476858345021038, + "grad_norm": 2.549141092196029, + "learning_rate": 9.641251093666961e-06, + "loss": 0.4339, + "step": 1053 + }, + { + "epoch": 0.14782608695652175, + "grad_norm": 2.1635618215940524, + "learning_rate": 9.640405809068743e-06, + "loss": 0.3898, + "step": 1054 + }, + { + "epoch": 0.1479663394109397, + "grad_norm": 1.9952349522994326, + "learning_rate": 9.639559566955204e-06, + "loss": 0.4109, + "step": 1055 + }, + { + "epoch": 0.14810659186535766, + "grad_norm": 2.473388467038814, + "learning_rate": 9.638712367500964e-06, + "loss": 0.4313, + "step": 1056 + }, + { + "epoch": 0.14824684431977558, + "grad_norm": 2.72466319041275, + "learning_rate": 9.637864210880836e-06, + "loss": 0.4333, + "step": 1057 + }, + { + "epoch": 0.14838709677419354, + "grad_norm": 2.7842817605587578, + "learning_rate": 9.63701509726983e-06, + "loss": 0.4199, + "step": 1058 + }, + { + "epoch": 0.1485273492286115, + "grad_norm": 3.424883908719828, + "learning_rate": 9.636165026843155e-06, + "loss": 0.4466, + "step": 1059 + }, + { + "epoch": 0.14866760168302945, + "grad_norm": 2.0427576251374937, + "learning_rate": 9.63531399977622e-06, + "loss": 0.4165, + "step": 1060 + }, + { + "epoch": 0.1488078541374474, + "grad_norm": 4.31985907218794, + "learning_rate": 9.634462016244625e-06, + "loss": 0.3888, + "step": 1061 + }, + { + "epoch": 0.14894810659186536, + "grad_norm": 4.148297955883368, + "learning_rate": 9.633609076424171e-06, + "loss": 0.4666, + "step": 1062 + }, + { + "epoch": 0.14908835904628331, + "grad_norm": 2.4568796763039598, + "learning_rate": 9.632755180490858e-06, + "loss": 0.4003, + "step": 1063 + }, + { + "epoch": 0.14922861150070127, + "grad_norm": 2.5828160003804546, + "learning_rate": 9.63190032862088e-06, + "loss": 0.3755, + "step": 1064 + }, + { + "epoch": 0.14936886395511922, + "grad_norm": 2.8218962642168433, + "learning_rate": 9.631044520990628e-06, + "loss": 0.3939, + "step": 1065 + }, + { + "epoch": 0.14950911640953718, + "grad_norm": 2.357733755648507, + "learning_rate": 9.630187757776697e-06, + "loss": 0.4194, + "step": 1066 + }, + { + "epoch": 0.1496493688639551, + "grad_norm": 3.1452347815472677, + "learning_rate": 9.629330039155872e-06, + "loss": 0.3781, + "step": 1067 + }, + { + "epoch": 0.14978962131837306, + "grad_norm": 2.7153605236855913, + "learning_rate": 9.628471365305134e-06, + "loss": 0.4333, + "step": 1068 + }, + { + "epoch": 0.14992987377279102, + "grad_norm": 2.4797130484848298, + "learning_rate": 9.627611736401668e-06, + "loss": 0.4175, + "step": 1069 + }, + { + "epoch": 0.15007012622720897, + "grad_norm": 2.3284972232526, + "learning_rate": 9.62675115262285e-06, + "loss": 0.4352, + "step": 1070 + }, + { + "epoch": 0.15021037868162693, + "grad_norm": 2.3642635605374673, + "learning_rate": 9.625889614146258e-06, + "loss": 0.439, + "step": 1071 + }, + { + "epoch": 0.15035063113604488, + "grad_norm": 3.048220137018182, + "learning_rate": 9.625027121149665e-06, + "loss": 0.4081, + "step": 1072 + }, + { + "epoch": 0.15049088359046284, + "grad_norm": 2.7320320772993902, + "learning_rate": 9.624163673811036e-06, + "loss": 0.4282, + "step": 1073 + }, + { + "epoch": 0.1506311360448808, + "grad_norm": 2.0980072012051325, + "learning_rate": 9.62329927230854e-06, + "loss": 0.405, + "step": 1074 + }, + { + "epoch": 0.15077138849929875, + "grad_norm": 2.2985592849384924, + "learning_rate": 9.622433916820539e-06, + "loss": 0.4004, + "step": 1075 + }, + { + "epoch": 0.1509116409537167, + "grad_norm": 2.4278306151730766, + "learning_rate": 9.621567607525597e-06, + "loss": 0.4195, + "step": 1076 + }, + { + "epoch": 0.15105189340813463, + "grad_norm": 2.180788589249815, + "learning_rate": 9.620700344602465e-06, + "loss": 0.444, + "step": 1077 + }, + { + "epoch": 0.15119214586255258, + "grad_norm": 3.0347592205834886, + "learning_rate": 9.619832128230102e-06, + "loss": 0.4117, + "step": 1078 + }, + { + "epoch": 0.15133239831697054, + "grad_norm": 1.9319990267227911, + "learning_rate": 9.618962958587656e-06, + "loss": 0.3891, + "step": 1079 + }, + { + "epoch": 0.1514726507713885, + "grad_norm": 2.2633574615488334, + "learning_rate": 9.618092835854474e-06, + "loss": 0.3929, + "step": 1080 + }, + { + "epoch": 0.15161290322580645, + "grad_norm": 2.3929805744114385, + "learning_rate": 9.617221760210097e-06, + "loss": 0.4334, + "step": 1081 + }, + { + "epoch": 0.1517531556802244, + "grad_norm": 2.5438583388443448, + "learning_rate": 9.616349731834271e-06, + "loss": 0.3803, + "step": 1082 + }, + { + "epoch": 0.15189340813464236, + "grad_norm": 2.8384846999492006, + "learning_rate": 9.61547675090693e-06, + "loss": 0.4036, + "step": 1083 + }, + { + "epoch": 0.15203366058906032, + "grad_norm": 3.2382551866317697, + "learning_rate": 9.614602817608207e-06, + "loss": 0.4007, + "step": 1084 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 2.4687658381859436, + "learning_rate": 9.613727932118435e-06, + "loss": 0.3984, + "step": 1085 + }, + { + "epoch": 0.15231416549789623, + "grad_norm": 2.6900388568780325, + "learning_rate": 9.612852094618135e-06, + "loss": 0.439, + "step": 1086 + }, + { + "epoch": 0.15245441795231415, + "grad_norm": 2.5914035892559166, + "learning_rate": 9.611975305288035e-06, + "loss": 0.3796, + "step": 1087 + }, + { + "epoch": 0.1525946704067321, + "grad_norm": 2.7013164489848562, + "learning_rate": 9.611097564309054e-06, + "loss": 0.4026, + "step": 1088 + }, + { + "epoch": 0.15273492286115006, + "grad_norm": 2.3553114501659254, + "learning_rate": 9.610218871862303e-06, + "loss": 0.4008, + "step": 1089 + }, + { + "epoch": 0.15287517531556802, + "grad_norm": 2.2941843479120396, + "learning_rate": 9.609339228129098e-06, + "loss": 0.4124, + "step": 1090 + }, + { + "epoch": 0.15301542776998597, + "grad_norm": 3.0215582317653946, + "learning_rate": 9.608458633290949e-06, + "loss": 0.4124, + "step": 1091 + }, + { + "epoch": 0.15315568022440393, + "grad_norm": 2.297298055081629, + "learning_rate": 9.607577087529555e-06, + "loss": 0.3776, + "step": 1092 + }, + { + "epoch": 0.15329593267882188, + "grad_norm": 2.5290971242612517, + "learning_rate": 9.606694591026823e-06, + "loss": 0.4053, + "step": 1093 + }, + { + "epoch": 0.15343618513323984, + "grad_norm": 2.707537450980487, + "learning_rate": 9.605811143964846e-06, + "loss": 0.4241, + "step": 1094 + }, + { + "epoch": 0.1535764375876578, + "grad_norm": 2.0348920595752285, + "learning_rate": 9.604926746525918e-06, + "loss": 0.4216, + "step": 1095 + }, + { + "epoch": 0.15371669004207575, + "grad_norm": 3.602553886947677, + "learning_rate": 9.604041398892528e-06, + "loss": 0.4163, + "step": 1096 + }, + { + "epoch": 0.15385694249649368, + "grad_norm": 2.5830213809603455, + "learning_rate": 9.603155101247363e-06, + "loss": 0.4138, + "step": 1097 + }, + { + "epoch": 0.15399719495091163, + "grad_norm": 5.650148420156749, + "learning_rate": 9.602267853773301e-06, + "loss": 0.4264, + "step": 1098 + }, + { + "epoch": 0.1541374474053296, + "grad_norm": 2.5522059716062664, + "learning_rate": 9.601379656653424e-06, + "loss": 0.4532, + "step": 1099 + }, + { + "epoch": 0.15427769985974754, + "grad_norm": 2.1496229195934924, + "learning_rate": 9.600490510071001e-06, + "loss": 0.421, + "step": 1100 + }, + { + "epoch": 0.1544179523141655, + "grad_norm": 1.9552626241694175, + "learning_rate": 9.599600414209503e-06, + "loss": 0.443, + "step": 1101 + }, + { + "epoch": 0.15455820476858345, + "grad_norm": 2.4666002788717067, + "learning_rate": 9.598709369252595e-06, + "loss": 0.4381, + "step": 1102 + }, + { + "epoch": 0.1546984572230014, + "grad_norm": 2.385906274737349, + "learning_rate": 9.597817375384138e-06, + "loss": 0.384, + "step": 1103 + }, + { + "epoch": 0.15483870967741936, + "grad_norm": 2.7324615759211506, + "learning_rate": 9.596924432788188e-06, + "loss": 0.409, + "step": 1104 + }, + { + "epoch": 0.15497896213183732, + "grad_norm": 2.740828565912536, + "learning_rate": 9.596030541648999e-06, + "loss": 0.3933, + "step": 1105 + }, + { + "epoch": 0.15511921458625527, + "grad_norm": 1.8453785444387232, + "learning_rate": 9.595135702151017e-06, + "loss": 0.4253, + "step": 1106 + }, + { + "epoch": 0.1552594670406732, + "grad_norm": 2.492595675056487, + "learning_rate": 9.594239914478886e-06, + "loss": 0.3938, + "step": 1107 + }, + { + "epoch": 0.15539971949509115, + "grad_norm": 2.3764125237428178, + "learning_rate": 9.593343178817448e-06, + "loss": 0.4503, + "step": 1108 + }, + { + "epoch": 0.1555399719495091, + "grad_norm": 2.2884922814600546, + "learning_rate": 9.592445495351738e-06, + "loss": 0.3992, + "step": 1109 + }, + { + "epoch": 0.15568022440392706, + "grad_norm": 4.806067787952794, + "learning_rate": 9.591546864266983e-06, + "loss": 0.4033, + "step": 1110 + }, + { + "epoch": 0.15582047685834502, + "grad_norm": 2.567340465964004, + "learning_rate": 9.590647285748614e-06, + "loss": 0.4139, + "step": 1111 + }, + { + "epoch": 0.15596072931276297, + "grad_norm": 3.9090763700670523, + "learning_rate": 9.589746759982248e-06, + "loss": 0.3944, + "step": 1112 + }, + { + "epoch": 0.15610098176718093, + "grad_norm": 2.72169116304878, + "learning_rate": 9.588845287153705e-06, + "loss": 0.4031, + "step": 1113 + }, + { + "epoch": 0.15624123422159888, + "grad_norm": 2.0091026109662993, + "learning_rate": 9.587942867448998e-06, + "loss": 0.3521, + "step": 1114 + }, + { + "epoch": 0.15638148667601684, + "grad_norm": 2.230409851899958, + "learning_rate": 9.587039501054335e-06, + "loss": 0.4059, + "step": 1115 + }, + { + "epoch": 0.1565217391304348, + "grad_norm": 2.8783031230402103, + "learning_rate": 9.586135188156116e-06, + "loss": 0.453, + "step": 1116 + }, + { + "epoch": 0.15666199158485272, + "grad_norm": 2.1051851835366935, + "learning_rate": 9.585229928940944e-06, + "loss": 0.4069, + "step": 1117 + }, + { + "epoch": 0.15680224403927068, + "grad_norm": 2.1059716920897347, + "learning_rate": 9.584323723595612e-06, + "loss": 0.3927, + "step": 1118 + }, + { + "epoch": 0.15694249649368863, + "grad_norm": 2.0551007801598855, + "learning_rate": 9.583416572307107e-06, + "loss": 0.41, + "step": 1119 + }, + { + "epoch": 0.1570827489481066, + "grad_norm": 2.5229899951496044, + "learning_rate": 9.582508475262615e-06, + "loss": 0.4201, + "step": 1120 + }, + { + "epoch": 0.15722300140252454, + "grad_norm": 2.3618850997704826, + "learning_rate": 9.581599432649515e-06, + "loss": 0.3849, + "step": 1121 + }, + { + "epoch": 0.1573632538569425, + "grad_norm": 2.4202884522653885, + "learning_rate": 9.580689444655381e-06, + "loss": 0.4441, + "step": 1122 + }, + { + "epoch": 0.15750350631136045, + "grad_norm": 2.818625920645712, + "learning_rate": 9.579778511467985e-06, + "loss": 0.4273, + "step": 1123 + }, + { + "epoch": 0.1576437587657784, + "grad_norm": 2.712378830554588, + "learning_rate": 9.578866633275289e-06, + "loss": 0.3783, + "step": 1124 + }, + { + "epoch": 0.15778401122019636, + "grad_norm": 2.326053901454548, + "learning_rate": 9.577953810265453e-06, + "loss": 0.4356, + "step": 1125 + }, + { + "epoch": 0.15792426367461432, + "grad_norm": 2.3586008178564604, + "learning_rate": 9.577040042626832e-06, + "loss": 0.432, + "step": 1126 + }, + { + "epoch": 0.15806451612903225, + "grad_norm": 2.124613221816073, + "learning_rate": 9.576125330547977e-06, + "loss": 0.4262, + "step": 1127 + }, + { + "epoch": 0.1582047685834502, + "grad_norm": 3.2891273039099116, + "learning_rate": 9.575209674217632e-06, + "loss": 0.4362, + "step": 1128 + }, + { + "epoch": 0.15834502103786816, + "grad_norm": 2.9829424070961768, + "learning_rate": 9.574293073824734e-06, + "loss": 0.3821, + "step": 1129 + }, + { + "epoch": 0.1584852734922861, + "grad_norm": 2.1867071622300935, + "learning_rate": 9.57337552955842e-06, + "loss": 0.4005, + "step": 1130 + }, + { + "epoch": 0.15862552594670407, + "grad_norm": 3.400795865876443, + "learning_rate": 9.572457041608018e-06, + "loss": 0.4295, + "step": 1131 + }, + { + "epoch": 0.15876577840112202, + "grad_norm": 2.818576606103275, + "learning_rate": 9.57153761016305e-06, + "loss": 0.466, + "step": 1132 + }, + { + "epoch": 0.15890603085553998, + "grad_norm": 2.3378325975398395, + "learning_rate": 9.570617235413235e-06, + "loss": 0.4219, + "step": 1133 + }, + { + "epoch": 0.15904628330995793, + "grad_norm": 3.5590919120819935, + "learning_rate": 9.569695917548488e-06, + "loss": 0.4149, + "step": 1134 + }, + { + "epoch": 0.15918653576437589, + "grad_norm": 2.6004553836221596, + "learning_rate": 9.568773656758913e-06, + "loss": 0.4164, + "step": 1135 + }, + { + "epoch": 0.15932678821879384, + "grad_norm": 1.9978827952622087, + "learning_rate": 9.567850453234816e-06, + "loss": 0.3767, + "step": 1136 + }, + { + "epoch": 0.15946704067321177, + "grad_norm": 2.1536955145410523, + "learning_rate": 9.56692630716669e-06, + "loss": 0.3865, + "step": 1137 + }, + { + "epoch": 0.15960729312762972, + "grad_norm": 2.4451260724162602, + "learning_rate": 9.56600121874523e-06, + "loss": 0.4038, + "step": 1138 + }, + { + "epoch": 0.15974754558204768, + "grad_norm": 2.6160236560731867, + "learning_rate": 9.565075188161316e-06, + "loss": 0.4032, + "step": 1139 + }, + { + "epoch": 0.15988779803646563, + "grad_norm": 2.326345288325165, + "learning_rate": 9.564148215606033e-06, + "loss": 0.4153, + "step": 1140 + }, + { + "epoch": 0.1600280504908836, + "grad_norm": 2.2909924179944072, + "learning_rate": 9.563220301270652e-06, + "loss": 0.4402, + "step": 1141 + }, + { + "epoch": 0.16016830294530154, + "grad_norm": 3.021430559424996, + "learning_rate": 9.562291445346642e-06, + "loss": 0.3899, + "step": 1142 + }, + { + "epoch": 0.1603085553997195, + "grad_norm": 2.446424432538204, + "learning_rate": 9.561361648025671e-06, + "loss": 0.4103, + "step": 1143 + }, + { + "epoch": 0.16044880785413745, + "grad_norm": 3.003415230055348, + "learning_rate": 9.560430909499589e-06, + "loss": 0.3539, + "step": 1144 + }, + { + "epoch": 0.1605890603085554, + "grad_norm": 3.0235065777598096, + "learning_rate": 9.55949922996045e-06, + "loss": 0.4177, + "step": 1145 + }, + { + "epoch": 0.16072931276297336, + "grad_norm": 2.534913602481532, + "learning_rate": 9.558566609600502e-06, + "loss": 0.4516, + "step": 1146 + }, + { + "epoch": 0.1608695652173913, + "grad_norm": 2.100353815154539, + "learning_rate": 9.557633048612183e-06, + "loss": 0.3994, + "step": 1147 + }, + { + "epoch": 0.16100981767180925, + "grad_norm": 1.9203085990102577, + "learning_rate": 9.556698547188126e-06, + "loss": 0.398, + "step": 1148 + }, + { + "epoch": 0.1611500701262272, + "grad_norm": 1.89492950972649, + "learning_rate": 9.555763105521159e-06, + "loss": 0.4365, + "step": 1149 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 2.7139931643012707, + "learning_rate": 9.554826723804304e-06, + "loss": 0.4431, + "step": 1150 + }, + { + "epoch": 0.1614305750350631, + "grad_norm": 2.560696036666536, + "learning_rate": 9.553889402230776e-06, + "loss": 0.3947, + "step": 1151 + }, + { + "epoch": 0.16157082748948107, + "grad_norm": 1.7362460791788672, + "learning_rate": 9.55295114099399e-06, + "loss": 0.3914, + "step": 1152 + }, + { + "epoch": 0.16171107994389902, + "grad_norm": 3.732927624478831, + "learning_rate": 9.55201194028754e-06, + "loss": 0.3865, + "step": 1153 + }, + { + "epoch": 0.16185133239831698, + "grad_norm": 2.6886451410607464, + "learning_rate": 9.551071800305233e-06, + "loss": 0.3904, + "step": 1154 + }, + { + "epoch": 0.16199158485273493, + "grad_norm": 2.379096864476931, + "learning_rate": 9.550130721241056e-06, + "loss": 0.4546, + "step": 1155 + }, + { + "epoch": 0.1621318373071529, + "grad_norm": 2.461502736891504, + "learning_rate": 9.549188703289192e-06, + "loss": 0.3657, + "step": 1156 + }, + { + "epoch": 0.16227208976157081, + "grad_norm": 2.2762197138255154, + "learning_rate": 9.548245746644025e-06, + "loss": 0.4343, + "step": 1157 + }, + { + "epoch": 0.16241234221598877, + "grad_norm": 2.546500242780415, + "learning_rate": 9.547301851500123e-06, + "loss": 0.428, + "step": 1158 + }, + { + "epoch": 0.16255259467040672, + "grad_norm": 2.5021262663610164, + "learning_rate": 9.546357018052254e-06, + "loss": 0.3946, + "step": 1159 + }, + { + "epoch": 0.16269284712482468, + "grad_norm": 1.9582843842036564, + "learning_rate": 9.545411246495377e-06, + "loss": 0.4227, + "step": 1160 + }, + { + "epoch": 0.16283309957924264, + "grad_norm": 2.359148128288013, + "learning_rate": 9.544464537024648e-06, + "loss": 0.4191, + "step": 1161 + }, + { + "epoch": 0.1629733520336606, + "grad_norm": 2.4439744304811586, + "learning_rate": 9.54351688983541e-06, + "loss": 0.3898, + "step": 1162 + }, + { + "epoch": 0.16311360448807855, + "grad_norm": 2.6416373936167408, + "learning_rate": 9.542568305123207e-06, + "loss": 0.4311, + "step": 1163 + }, + { + "epoch": 0.1632538569424965, + "grad_norm": 2.2498189005554714, + "learning_rate": 9.54161878308377e-06, + "loss": 0.3881, + "step": 1164 + }, + { + "epoch": 0.16339410939691446, + "grad_norm": 1.9983997499481057, + "learning_rate": 9.54066832391303e-06, + "loss": 0.3777, + "step": 1165 + }, + { + "epoch": 0.1635343618513324, + "grad_norm": 2.5831920421067824, + "learning_rate": 9.539716927807102e-06, + "loss": 0.3733, + "step": 1166 + }, + { + "epoch": 0.16367461430575034, + "grad_norm": 1.7356110713126478, + "learning_rate": 9.538764594962302e-06, + "loss": 0.4383, + "step": 1167 + }, + { + "epoch": 0.1638148667601683, + "grad_norm": 2.2847990881741733, + "learning_rate": 9.537811325575142e-06, + "loss": 0.4128, + "step": 1168 + }, + { + "epoch": 0.16395511921458625, + "grad_norm": 2.7139922858222056, + "learning_rate": 9.536857119842315e-06, + "loss": 0.3244, + "step": 1169 + }, + { + "epoch": 0.1640953716690042, + "grad_norm": 2.658353443926694, + "learning_rate": 9.53590197796072e-06, + "loss": 0.4127, + "step": 1170 + }, + { + "epoch": 0.16423562412342216, + "grad_norm": 2.43866222745694, + "learning_rate": 9.534945900127441e-06, + "loss": 0.3763, + "step": 1171 + }, + { + "epoch": 0.1643758765778401, + "grad_norm": 2.5244907507693934, + "learning_rate": 9.533988886539761e-06, + "loss": 0.4239, + "step": 1172 + }, + { + "epoch": 0.16451612903225807, + "grad_norm": 2.520608550419327, + "learning_rate": 9.533030937395151e-06, + "loss": 0.3735, + "step": 1173 + }, + { + "epoch": 0.16465638148667602, + "grad_norm": 2.3409335698247653, + "learning_rate": 9.532072052891276e-06, + "loss": 0.3895, + "step": 1174 + }, + { + "epoch": 0.16479663394109398, + "grad_norm": 2.674189763938731, + "learning_rate": 9.531112233225998e-06, + "loss": 0.3949, + "step": 1175 + }, + { + "epoch": 0.16493688639551193, + "grad_norm": 1.9253873905921681, + "learning_rate": 9.530151478597366e-06, + "loss": 0.4235, + "step": 1176 + }, + { + "epoch": 0.16507713884992986, + "grad_norm": 3.5982233484824095, + "learning_rate": 9.529189789203628e-06, + "loss": 0.4053, + "step": 1177 + }, + { + "epoch": 0.16521739130434782, + "grad_norm": 4.013193307083466, + "learning_rate": 9.52822716524322e-06, + "loss": 0.4376, + "step": 1178 + }, + { + "epoch": 0.16535764375876577, + "grad_norm": 2.991055824873512, + "learning_rate": 9.527263606914772e-06, + "loss": 0.3655, + "step": 1179 + }, + { + "epoch": 0.16549789621318373, + "grad_norm": 3.578869471499451, + "learning_rate": 9.526299114417108e-06, + "loss": 0.3989, + "step": 1180 + }, + { + "epoch": 0.16563814866760168, + "grad_norm": 2.96286889849689, + "learning_rate": 9.525333687949247e-06, + "loss": 0.435, + "step": 1181 + }, + { + "epoch": 0.16577840112201964, + "grad_norm": 1.7487858238668246, + "learning_rate": 9.524367327710396e-06, + "loss": 0.4205, + "step": 1182 + }, + { + "epoch": 0.1659186535764376, + "grad_norm": 2.627491994277561, + "learning_rate": 9.523400033899957e-06, + "loss": 0.4342, + "step": 1183 + }, + { + "epoch": 0.16605890603085555, + "grad_norm": 2.5171275893599323, + "learning_rate": 9.522431806717523e-06, + "loss": 0.4001, + "step": 1184 + }, + { + "epoch": 0.1661991584852735, + "grad_norm": 3.483885863316731, + "learning_rate": 9.52146264636288e-06, + "loss": 0.4341, + "step": 1185 + }, + { + "epoch": 0.16633941093969146, + "grad_norm": 2.3925726442858144, + "learning_rate": 9.520492553036012e-06, + "loss": 0.3603, + "step": 1186 + }, + { + "epoch": 0.16647966339410938, + "grad_norm": 2.4015746831740015, + "learning_rate": 9.519521526937087e-06, + "loss": 0.4177, + "step": 1187 + }, + { + "epoch": 0.16661991584852734, + "grad_norm": 2.448855043020823, + "learning_rate": 9.518549568266474e-06, + "loss": 0.4601, + "step": 1188 + }, + { + "epoch": 0.1667601683029453, + "grad_norm": 2.044396217210435, + "learning_rate": 9.517576677224723e-06, + "loss": 0.436, + "step": 1189 + }, + { + "epoch": 0.16690042075736325, + "grad_norm": 2.4423940384525595, + "learning_rate": 9.516602854012587e-06, + "loss": 0.3716, + "step": 1190 + }, + { + "epoch": 0.1670406732117812, + "grad_norm": 1.9781591427524672, + "learning_rate": 9.515628098831009e-06, + "loss": 0.3487, + "step": 1191 + }, + { + "epoch": 0.16718092566619916, + "grad_norm": 2.1303713735352496, + "learning_rate": 9.514652411881122e-06, + "loss": 0.4224, + "step": 1192 + }, + { + "epoch": 0.16732117812061711, + "grad_norm": 2.2392595160536475, + "learning_rate": 9.51367579336425e-06, + "loss": 0.4361, + "step": 1193 + }, + { + "epoch": 0.16746143057503507, + "grad_norm": 1.9880941186765244, + "learning_rate": 9.512698243481914e-06, + "loss": 0.3987, + "step": 1194 + }, + { + "epoch": 0.16760168302945302, + "grad_norm": 2.2046618140418803, + "learning_rate": 9.511719762435822e-06, + "loss": 0.395, + "step": 1195 + }, + { + "epoch": 0.16774193548387098, + "grad_norm": 4.369878550572421, + "learning_rate": 9.51074035042788e-06, + "loss": 0.4797, + "step": 1196 + }, + { + "epoch": 0.1678821879382889, + "grad_norm": 1.816185656862949, + "learning_rate": 9.509760007660182e-06, + "loss": 0.389, + "step": 1197 + }, + { + "epoch": 0.16802244039270686, + "grad_norm": 2.4334654062389744, + "learning_rate": 9.508778734335013e-06, + "loss": 0.3873, + "step": 1198 + }, + { + "epoch": 0.16816269284712482, + "grad_norm": 2.1890452785209344, + "learning_rate": 9.507796530654854e-06, + "loss": 0.4145, + "step": 1199 + }, + { + "epoch": 0.16830294530154277, + "grad_norm": 3.2225224877542797, + "learning_rate": 9.506813396822373e-06, + "loss": 0.4182, + "step": 1200 + }, + { + "epoch": 0.16844319775596073, + "grad_norm": 1.96718074685317, + "learning_rate": 9.505829333040437e-06, + "loss": 0.4093, + "step": 1201 + }, + { + "epoch": 0.16858345021037868, + "grad_norm": 2.340145136252884, + "learning_rate": 9.504844339512096e-06, + "loss": 0.3925, + "step": 1202 + }, + { + "epoch": 0.16872370266479664, + "grad_norm": 1.9437631465553142, + "learning_rate": 9.503858416440602e-06, + "loss": 0.4138, + "step": 1203 + }, + { + "epoch": 0.1688639551192146, + "grad_norm": 1.9766972311186057, + "learning_rate": 9.502871564029386e-06, + "loss": 0.4369, + "step": 1204 + }, + { + "epoch": 0.16900420757363255, + "grad_norm": 1.729387897295848, + "learning_rate": 9.501883782482084e-06, + "loss": 0.4281, + "step": 1205 + }, + { + "epoch": 0.1691444600280505, + "grad_norm": 2.628780639731081, + "learning_rate": 9.500895072002517e-06, + "loss": 0.4124, + "step": 1206 + }, + { + "epoch": 0.16928471248246843, + "grad_norm": 7.411661352154686, + "learning_rate": 9.499905432794699e-06, + "loss": 0.405, + "step": 1207 + }, + { + "epoch": 0.16942496493688639, + "grad_norm": 2.783183165361544, + "learning_rate": 9.498914865062831e-06, + "loss": 0.4386, + "step": 1208 + }, + { + "epoch": 0.16956521739130434, + "grad_norm": 2.982545141884103, + "learning_rate": 9.497923369011312e-06, + "loss": 0.3758, + "step": 1209 + }, + { + "epoch": 0.1697054698457223, + "grad_norm": 2.345994307131417, + "learning_rate": 9.496930944844733e-06, + "loss": 0.3261, + "step": 1210 + }, + { + "epoch": 0.16984572230014025, + "grad_norm": 3.5091945584820152, + "learning_rate": 9.495937592767873e-06, + "loss": 0.4107, + "step": 1211 + }, + { + "epoch": 0.1699859747545582, + "grad_norm": 2.065666657689991, + "learning_rate": 9.494943312985698e-06, + "loss": 0.4038, + "step": 1212 + }, + { + "epoch": 0.17012622720897616, + "grad_norm": 2.5238550744140418, + "learning_rate": 9.493948105703376e-06, + "loss": 0.4376, + "step": 1213 + }, + { + "epoch": 0.17026647966339412, + "grad_norm": 4.103484715123352, + "learning_rate": 9.49295197112626e-06, + "loss": 0.4272, + "step": 1214 + }, + { + "epoch": 0.17040673211781207, + "grad_norm": 2.298970000135591, + "learning_rate": 9.491954909459895e-06, + "loss": 0.3997, + "step": 1215 + }, + { + "epoch": 0.17054698457223003, + "grad_norm": 4.247622329719606, + "learning_rate": 9.490956920910016e-06, + "loss": 0.3835, + "step": 1216 + }, + { + "epoch": 0.17068723702664795, + "grad_norm": 4.1277944174000885, + "learning_rate": 9.489958005682555e-06, + "loss": 0.4511, + "step": 1217 + }, + { + "epoch": 0.1708274894810659, + "grad_norm": 3.259215713661228, + "learning_rate": 9.488958163983629e-06, + "loss": 0.4328, + "step": 1218 + }, + { + "epoch": 0.17096774193548386, + "grad_norm": 2.9630929152371537, + "learning_rate": 9.487957396019547e-06, + "loss": 0.4259, + "step": 1219 + }, + { + "epoch": 0.17110799438990182, + "grad_norm": 2.0584259254335158, + "learning_rate": 9.486955701996811e-06, + "loss": 0.378, + "step": 1220 + }, + { + "epoch": 0.17124824684431977, + "grad_norm": 2.4670107545653037, + "learning_rate": 9.485953082122116e-06, + "loss": 0.4353, + "step": 1221 + }, + { + "epoch": 0.17138849929873773, + "grad_norm": 2.3662240362178717, + "learning_rate": 9.484949536602343e-06, + "loss": 0.4423, + "step": 1222 + }, + { + "epoch": 0.17152875175315568, + "grad_norm": 2.2723231779997004, + "learning_rate": 9.48394506564457e-06, + "loss": 0.3982, + "step": 1223 + }, + { + "epoch": 0.17166900420757364, + "grad_norm": 2.559165925556135, + "learning_rate": 9.482939669456056e-06, + "loss": 0.4306, + "step": 1224 + }, + { + "epoch": 0.1718092566619916, + "grad_norm": 2.6037049965121444, + "learning_rate": 9.481933348244264e-06, + "loss": 0.424, + "step": 1225 + }, + { + "epoch": 0.17194950911640955, + "grad_norm": 2.553673499217684, + "learning_rate": 9.480926102216836e-06, + "loss": 0.428, + "step": 1226 + }, + { + "epoch": 0.17208976157082748, + "grad_norm": 6.408181992402674, + "learning_rate": 9.479917931581616e-06, + "loss": 0.4529, + "step": 1227 + }, + { + "epoch": 0.17223001402524543, + "grad_norm": 2.448997183334332, + "learning_rate": 9.478908836546629e-06, + "loss": 0.3834, + "step": 1228 + }, + { + "epoch": 0.1723702664796634, + "grad_norm": 2.8029912638152146, + "learning_rate": 9.477898817320094e-06, + "loss": 0.4123, + "step": 1229 + }, + { + "epoch": 0.17251051893408134, + "grad_norm": 3.202817129830967, + "learning_rate": 9.476887874110426e-06, + "loss": 0.3825, + "step": 1230 + }, + { + "epoch": 0.1726507713884993, + "grad_norm": 2.8321866538719878, + "learning_rate": 9.475876007126222e-06, + "loss": 0.4591, + "step": 1231 + }, + { + "epoch": 0.17279102384291725, + "grad_norm": 2.4171645101133303, + "learning_rate": 9.474863216576276e-06, + "loss": 0.4224, + "step": 1232 + }, + { + "epoch": 0.1729312762973352, + "grad_norm": 7.168051504546139, + "learning_rate": 9.473849502669568e-06, + "loss": 0.3713, + "step": 1233 + }, + { + "epoch": 0.17307152875175316, + "grad_norm": 2.78534482769613, + "learning_rate": 9.472834865615271e-06, + "loss": 0.3697, + "step": 1234 + }, + { + "epoch": 0.17321178120617112, + "grad_norm": 2.8460343795812273, + "learning_rate": 9.47181930562275e-06, + "loss": 0.4576, + "step": 1235 + }, + { + "epoch": 0.17335203366058907, + "grad_norm": 2.5761921428493375, + "learning_rate": 9.470802822901558e-06, + "loss": 0.4131, + "step": 1236 + }, + { + "epoch": 0.173492286115007, + "grad_norm": 4.779160984416179, + "learning_rate": 9.469785417661439e-06, + "loss": 0.4242, + "step": 1237 + }, + { + "epoch": 0.17363253856942495, + "grad_norm": 2.2333713224605662, + "learning_rate": 9.468767090112328e-06, + "loss": 0.4347, + "step": 1238 + }, + { + "epoch": 0.1737727910238429, + "grad_norm": 2.323396322453706, + "learning_rate": 9.467747840464348e-06, + "loss": 0.4057, + "step": 1239 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 2.201717928401065, + "learning_rate": 9.466727668927817e-06, + "loss": 0.4482, + "step": 1240 + }, + { + "epoch": 0.17405329593267882, + "grad_norm": 3.6871073885602232, + "learning_rate": 9.465706575713235e-06, + "loss": 0.3794, + "step": 1241 + }, + { + "epoch": 0.17419354838709677, + "grad_norm": 2.4971196748213007, + "learning_rate": 9.464684561031306e-06, + "loss": 0.4636, + "step": 1242 + }, + { + "epoch": 0.17433380084151473, + "grad_norm": 3.0448723887495732, + "learning_rate": 9.463661625092907e-06, + "loss": 0.3944, + "step": 1243 + }, + { + "epoch": 0.17447405329593269, + "grad_norm": 2.6748980101634925, + "learning_rate": 9.462637768109119e-06, + "loss": 0.3785, + "step": 1244 + }, + { + "epoch": 0.17461430575035064, + "grad_norm": 3.436399942722793, + "learning_rate": 9.461612990291205e-06, + "loss": 0.4122, + "step": 1245 + }, + { + "epoch": 0.1747545582047686, + "grad_norm": 3.2400271652401527, + "learning_rate": 9.460587291850623e-06, + "loss": 0.3889, + "step": 1246 + }, + { + "epoch": 0.17489481065918652, + "grad_norm": 1.816902207786625, + "learning_rate": 9.459560672999016e-06, + "loss": 0.3969, + "step": 1247 + }, + { + "epoch": 0.17503506311360448, + "grad_norm": 1.916401146727593, + "learning_rate": 9.458533133948223e-06, + "loss": 0.4011, + "step": 1248 + }, + { + "epoch": 0.17517531556802243, + "grad_norm": 2.0131263326920577, + "learning_rate": 9.457504674910265e-06, + "loss": 0.3718, + "step": 1249 + }, + { + "epoch": 0.1753155680224404, + "grad_norm": 2.584319377701908, + "learning_rate": 9.45647529609736e-06, + "loss": 0.382, + "step": 1250 + }, + { + "epoch": 0.17545582047685834, + "grad_norm": 2.230159597801251, + "learning_rate": 9.455444997721916e-06, + "loss": 0.4054, + "step": 1251 + }, + { + "epoch": 0.1755960729312763, + "grad_norm": 2.29695378382502, + "learning_rate": 9.454413779996523e-06, + "loss": 0.4555, + "step": 1252 + }, + { + "epoch": 0.17573632538569425, + "grad_norm": 2.1711072044875364, + "learning_rate": 9.453381643133968e-06, + "loss": 0.4293, + "step": 1253 + }, + { + "epoch": 0.1758765778401122, + "grad_norm": 1.8950240605218354, + "learning_rate": 9.452348587347224e-06, + "loss": 0.4058, + "step": 1254 + }, + { + "epoch": 0.17601683029453016, + "grad_norm": 2.1408903973751197, + "learning_rate": 9.451314612849456e-06, + "loss": 0.444, + "step": 1255 + }, + { + "epoch": 0.17615708274894812, + "grad_norm": 1.8594666786993295, + "learning_rate": 9.450279719854016e-06, + "loss": 0.4166, + "step": 1256 + }, + { + "epoch": 0.17629733520336605, + "grad_norm": 1.9891962070879203, + "learning_rate": 9.44924390857445e-06, + "loss": 0.396, + "step": 1257 + }, + { + "epoch": 0.176437587657784, + "grad_norm": 2.8283465388238334, + "learning_rate": 9.448207179224487e-06, + "loss": 0.395, + "step": 1258 + }, + { + "epoch": 0.17657784011220196, + "grad_norm": 6.6509716614753405, + "learning_rate": 9.44716953201805e-06, + "loss": 0.4057, + "step": 1259 + }, + { + "epoch": 0.1767180925666199, + "grad_norm": 2.4270617657359415, + "learning_rate": 9.446130967169251e-06, + "loss": 0.4031, + "step": 1260 + }, + { + "epoch": 0.17685834502103787, + "grad_norm": 1.956520546921998, + "learning_rate": 9.44509148489239e-06, + "loss": 0.3916, + "step": 1261 + }, + { + "epoch": 0.17699859747545582, + "grad_norm": 2.619777024659837, + "learning_rate": 9.444051085401957e-06, + "loss": 0.3839, + "step": 1262 + }, + { + "epoch": 0.17713884992987378, + "grad_norm": 2.361267240510828, + "learning_rate": 9.44300976891263e-06, + "loss": 0.3769, + "step": 1263 + }, + { + "epoch": 0.17727910238429173, + "grad_norm": 2.458383160964393, + "learning_rate": 9.44196753563928e-06, + "loss": 0.4219, + "step": 1264 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 1.9114083708869307, + "learning_rate": 9.440924385796964e-06, + "loss": 0.392, + "step": 1265 + }, + { + "epoch": 0.17755960729312764, + "grad_norm": 1.7480757897399084, + "learning_rate": 9.439880319600924e-06, + "loss": 0.336, + "step": 1266 + }, + { + "epoch": 0.17769985974754557, + "grad_norm": 2.565847304048959, + "learning_rate": 9.438835337266603e-06, + "loss": 0.4076, + "step": 1267 + }, + { + "epoch": 0.17784011220196352, + "grad_norm": 2.56245878232976, + "learning_rate": 9.43778943900962e-06, + "loss": 0.4294, + "step": 1268 + }, + { + "epoch": 0.17798036465638148, + "grad_norm": 2.130491565880132, + "learning_rate": 9.436742625045794e-06, + "loss": 0.3948, + "step": 1269 + }, + { + "epoch": 0.17812061711079943, + "grad_norm": 4.078736942608223, + "learning_rate": 9.435694895591124e-06, + "loss": 0.4387, + "step": 1270 + }, + { + "epoch": 0.1782608695652174, + "grad_norm": 2.231195388253836, + "learning_rate": 9.434646250861801e-06, + "loss": 0.3925, + "step": 1271 + }, + { + "epoch": 0.17840112201963534, + "grad_norm": 2.37259783479781, + "learning_rate": 9.433596691074207e-06, + "loss": 0.3996, + "step": 1272 + }, + { + "epoch": 0.1785413744740533, + "grad_norm": 2.0671959793943393, + "learning_rate": 9.432546216444912e-06, + "loss": 0.401, + "step": 1273 + }, + { + "epoch": 0.17868162692847125, + "grad_norm": 2.1230714686003656, + "learning_rate": 9.431494827190673e-06, + "loss": 0.4037, + "step": 1274 + }, + { + "epoch": 0.1788218793828892, + "grad_norm": 2.3101467840615477, + "learning_rate": 9.430442523528437e-06, + "loss": 0.4683, + "step": 1275 + }, + { + "epoch": 0.17896213183730716, + "grad_norm": 2.1702603712199924, + "learning_rate": 9.429389305675342e-06, + "loss": 0.3653, + "step": 1276 + }, + { + "epoch": 0.1791023842917251, + "grad_norm": 1.916233248702735, + "learning_rate": 9.428335173848708e-06, + "loss": 0.4099, + "step": 1277 + }, + { + "epoch": 0.17924263674614305, + "grad_norm": 2.026866349476185, + "learning_rate": 9.427280128266049e-06, + "loss": 0.4409, + "step": 1278 + }, + { + "epoch": 0.179382889200561, + "grad_norm": 1.7998743649188929, + "learning_rate": 9.42622416914507e-06, + "loss": 0.3886, + "step": 1279 + }, + { + "epoch": 0.17952314165497896, + "grad_norm": 2.4149245417398375, + "learning_rate": 9.425167296703655e-06, + "loss": 0.3896, + "step": 1280 + }, + { + "epoch": 0.1796633941093969, + "grad_norm": 3.1346046858024827, + "learning_rate": 9.424109511159887e-06, + "loss": 0.3891, + "step": 1281 + }, + { + "epoch": 0.17980364656381487, + "grad_norm": 1.8760595506842588, + "learning_rate": 9.423050812732029e-06, + "loss": 0.4091, + "step": 1282 + }, + { + "epoch": 0.17994389901823282, + "grad_norm": 2.065049762818974, + "learning_rate": 9.421991201638539e-06, + "loss": 0.3934, + "step": 1283 + }, + { + "epoch": 0.18008415147265078, + "grad_norm": 3.8631468345470634, + "learning_rate": 9.420930678098057e-06, + "loss": 0.4096, + "step": 1284 + }, + { + "epoch": 0.18022440392706873, + "grad_norm": 2.5088842840896723, + "learning_rate": 9.419869242329417e-06, + "loss": 0.484, + "step": 1285 + }, + { + "epoch": 0.1803646563814867, + "grad_norm": 2.3024466272519746, + "learning_rate": 9.41880689455164e-06, + "loss": 0.3983, + "step": 1286 + }, + { + "epoch": 0.18050490883590461, + "grad_norm": 5.286998802581762, + "learning_rate": 9.417743634983933e-06, + "loss": 0.3722, + "step": 1287 + }, + { + "epoch": 0.18064516129032257, + "grad_norm": 1.9964600467711022, + "learning_rate": 9.416679463845691e-06, + "loss": 0.4233, + "step": 1288 + }, + { + "epoch": 0.18078541374474053, + "grad_norm": 1.7774773516063807, + "learning_rate": 9.415614381356496e-06, + "loss": 0.4363, + "step": 1289 + }, + { + "epoch": 0.18092566619915848, + "grad_norm": 2.0877998410820227, + "learning_rate": 9.414548387736127e-06, + "loss": 0.377, + "step": 1290 + }, + { + "epoch": 0.18106591865357644, + "grad_norm": 2.008161699973019, + "learning_rate": 9.413481483204541e-06, + "loss": 0.3891, + "step": 1291 + }, + { + "epoch": 0.1812061711079944, + "grad_norm": 1.9249842234373735, + "learning_rate": 9.412413667981884e-06, + "loss": 0.4526, + "step": 1292 + }, + { + "epoch": 0.18134642356241235, + "grad_norm": 2.222781963849238, + "learning_rate": 9.411344942288493e-06, + "loss": 0.3976, + "step": 1293 + }, + { + "epoch": 0.1814866760168303, + "grad_norm": 1.9944357359414755, + "learning_rate": 9.410275306344895e-06, + "loss": 0.4038, + "step": 1294 + }, + { + "epoch": 0.18162692847124826, + "grad_norm": 2.1647634094154253, + "learning_rate": 9.409204760371803e-06, + "loss": 0.4259, + "step": 1295 + }, + { + "epoch": 0.1817671809256662, + "grad_norm": 2.0766959188410454, + "learning_rate": 9.40813330459011e-06, + "loss": 0.4083, + "step": 1296 + }, + { + "epoch": 0.18190743338008414, + "grad_norm": 1.8051194503864763, + "learning_rate": 9.407060939220907e-06, + "loss": 0.4057, + "step": 1297 + }, + { + "epoch": 0.1820476858345021, + "grad_norm": 2.2887919920356232, + "learning_rate": 9.405987664485472e-06, + "loss": 0.4404, + "step": 1298 + }, + { + "epoch": 0.18218793828892005, + "grad_norm": 2.82084857780661, + "learning_rate": 9.404913480605264e-06, + "loss": 0.3819, + "step": 1299 + }, + { + "epoch": 0.182328190743338, + "grad_norm": 2.7994823556334794, + "learning_rate": 9.403838387801933e-06, + "loss": 0.477, + "step": 1300 + }, + { + "epoch": 0.18246844319775596, + "grad_norm": 5.310348793255512, + "learning_rate": 9.40276238629732e-06, + "loss": 0.4191, + "step": 1301 + }, + { + "epoch": 0.1826086956521739, + "grad_norm": 2.2485722144238562, + "learning_rate": 9.401685476313447e-06, + "loss": 0.4196, + "step": 1302 + }, + { + "epoch": 0.18274894810659187, + "grad_norm": 2.3517466121343826, + "learning_rate": 9.400607658072531e-06, + "loss": 0.3727, + "step": 1303 + }, + { + "epoch": 0.18288920056100982, + "grad_norm": 2.7313407411494257, + "learning_rate": 9.399528931796968e-06, + "loss": 0.4151, + "step": 1304 + }, + { + "epoch": 0.18302945301542778, + "grad_norm": 2.6340662294488815, + "learning_rate": 9.398449297709349e-06, + "loss": 0.4394, + "step": 1305 + }, + { + "epoch": 0.18316970546984573, + "grad_norm": 2.001794963264513, + "learning_rate": 9.397368756032445e-06, + "loss": 0.4329, + "step": 1306 + }, + { + "epoch": 0.18330995792426366, + "grad_norm": 2.3730756090724356, + "learning_rate": 9.396287306989224e-06, + "loss": 0.4592, + "step": 1307 + }, + { + "epoch": 0.18345021037868162, + "grad_norm": 2.6745381607240053, + "learning_rate": 9.39520495080283e-06, + "loss": 0.4108, + "step": 1308 + }, + { + "epoch": 0.18359046283309957, + "grad_norm": 1.9216225931297322, + "learning_rate": 9.394121687696602e-06, + "loss": 0.4359, + "step": 1309 + }, + { + "epoch": 0.18373071528751753, + "grad_norm": 1.9238677199883212, + "learning_rate": 9.393037517894063e-06, + "loss": 0.3791, + "step": 1310 + }, + { + "epoch": 0.18387096774193548, + "grad_norm": 2.4132352232441616, + "learning_rate": 9.391952441618926e-06, + "loss": 0.4041, + "step": 1311 + }, + { + "epoch": 0.18401122019635344, + "grad_norm": 1.878841343510048, + "learning_rate": 9.390866459095085e-06, + "loss": 0.3719, + "step": 1312 + }, + { + "epoch": 0.1841514726507714, + "grad_norm": 5.015415460600611, + "learning_rate": 9.389779570546628e-06, + "loss": 0.3606, + "step": 1313 + }, + { + "epoch": 0.18429172510518935, + "grad_norm": 3.078994550699514, + "learning_rate": 9.388691776197827e-06, + "loss": 0.4732, + "step": 1314 + }, + { + "epoch": 0.1844319775596073, + "grad_norm": 2.5962899881514936, + "learning_rate": 9.38760307627314e-06, + "loss": 0.3743, + "step": 1315 + }, + { + "epoch": 0.18457223001402526, + "grad_norm": 1.8070094376503871, + "learning_rate": 9.38651347099721e-06, + "loss": 0.3603, + "step": 1316 + }, + { + "epoch": 0.18471248246844318, + "grad_norm": 2.8092288173370177, + "learning_rate": 9.385422960594875e-06, + "loss": 0.4026, + "step": 1317 + }, + { + "epoch": 0.18485273492286114, + "grad_norm": 2.2046945811049135, + "learning_rate": 9.384331545291149e-06, + "loss": 0.3708, + "step": 1318 + }, + { + "epoch": 0.1849929873772791, + "grad_norm": 2.891487776383314, + "learning_rate": 9.38323922531124e-06, + "loss": 0.4461, + "step": 1319 + }, + { + "epoch": 0.18513323983169705, + "grad_norm": 2.8598962527577965, + "learning_rate": 9.38214600088054e-06, + "loss": 0.3994, + "step": 1320 + }, + { + "epoch": 0.185273492286115, + "grad_norm": 2.067061725922081, + "learning_rate": 9.381051872224632e-06, + "loss": 0.4338, + "step": 1321 + }, + { + "epoch": 0.18541374474053296, + "grad_norm": 3.6242260600018565, + "learning_rate": 9.379956839569275e-06, + "loss": 0.4035, + "step": 1322 + }, + { + "epoch": 0.18555399719495091, + "grad_norm": 2.6338959680345133, + "learning_rate": 9.378860903140428e-06, + "loss": 0.3675, + "step": 1323 + }, + { + "epoch": 0.18569424964936887, + "grad_norm": 2.3073136190112105, + "learning_rate": 9.377764063164224e-06, + "loss": 0.3736, + "step": 1324 + }, + { + "epoch": 0.18583450210378682, + "grad_norm": 2.775240116212779, + "learning_rate": 9.376666319866993e-06, + "loss": 0.4577, + "step": 1325 + }, + { + "epoch": 0.18597475455820478, + "grad_norm": 2.0588283803225518, + "learning_rate": 9.375567673475246e-06, + "loss": 0.4372, + "step": 1326 + }, + { + "epoch": 0.1861150070126227, + "grad_norm": 1.9292353667954527, + "learning_rate": 9.374468124215676e-06, + "loss": 0.4161, + "step": 1327 + }, + { + "epoch": 0.18625525946704066, + "grad_norm": 1.9314512771358503, + "learning_rate": 9.373367672315174e-06, + "loss": 0.4031, + "step": 1328 + }, + { + "epoch": 0.18639551192145862, + "grad_norm": 1.967248131845537, + "learning_rate": 9.372266318000806e-06, + "loss": 0.3874, + "step": 1329 + }, + { + "epoch": 0.18653576437587657, + "grad_norm": 2.249186156693607, + "learning_rate": 9.371164061499831e-06, + "loss": 0.4165, + "step": 1330 + }, + { + "epoch": 0.18667601683029453, + "grad_norm": 2.184013421908207, + "learning_rate": 9.37006090303969e-06, + "loss": 0.3871, + "step": 1331 + }, + { + "epoch": 0.18681626928471248, + "grad_norm": 2.6464915820935078, + "learning_rate": 9.368956842848014e-06, + "loss": 0.4071, + "step": 1332 + }, + { + "epoch": 0.18695652173913044, + "grad_norm": 1.945588119162185, + "learning_rate": 9.367851881152618e-06, + "loss": 0.3855, + "step": 1333 + }, + { + "epoch": 0.1870967741935484, + "grad_norm": 2.4132731607041604, + "learning_rate": 9.366746018181503e-06, + "loss": 0.4268, + "step": 1334 + }, + { + "epoch": 0.18723702664796635, + "grad_norm": 2.343077703056088, + "learning_rate": 9.365639254162855e-06, + "loss": 0.3824, + "step": 1335 + }, + { + "epoch": 0.1873772791023843, + "grad_norm": 2.504778015443031, + "learning_rate": 9.364531589325048e-06, + "loss": 0.4229, + "step": 1336 + }, + { + "epoch": 0.18751753155680223, + "grad_norm": 4.257230106385012, + "learning_rate": 9.363423023896641e-06, + "loss": 0.4264, + "step": 1337 + }, + { + "epoch": 0.18765778401122019, + "grad_norm": 2.5754928561633474, + "learning_rate": 9.362313558106376e-06, + "loss": 0.3747, + "step": 1338 + }, + { + "epoch": 0.18779803646563814, + "grad_norm": 2.2302334689761425, + "learning_rate": 9.361203192183188e-06, + "loss": 0.3789, + "step": 1339 + }, + { + "epoch": 0.1879382889200561, + "grad_norm": 2.0942703283752526, + "learning_rate": 9.36009192635619e-06, + "loss": 0.4049, + "step": 1340 + }, + { + "epoch": 0.18807854137447405, + "grad_norm": 5.170522276656714, + "learning_rate": 9.358979760854686e-06, + "loss": 0.3775, + "step": 1341 + }, + { + "epoch": 0.188218793828892, + "grad_norm": 2.752491082810207, + "learning_rate": 9.357866695908162e-06, + "loss": 0.418, + "step": 1342 + }, + { + "epoch": 0.18835904628330996, + "grad_norm": 2.6952625214739037, + "learning_rate": 9.356752731746292e-06, + "loss": 0.4513, + "step": 1343 + }, + { + "epoch": 0.18849929873772792, + "grad_norm": 2.7231864248811504, + "learning_rate": 9.355637868598935e-06, + "loss": 0.4096, + "step": 1344 + }, + { + "epoch": 0.18863955119214587, + "grad_norm": 2.953142761499993, + "learning_rate": 9.354522106696133e-06, + "loss": 0.3857, + "step": 1345 + }, + { + "epoch": 0.18877980364656383, + "grad_norm": 2.09650422185122, + "learning_rate": 9.353405446268119e-06, + "loss": 0.3789, + "step": 1346 + }, + { + "epoch": 0.18892005610098175, + "grad_norm": 2.269377238683035, + "learning_rate": 9.352287887545305e-06, + "loss": 0.3962, + "step": 1347 + }, + { + "epoch": 0.1890603085553997, + "grad_norm": 2.515045383567181, + "learning_rate": 9.351169430758293e-06, + "loss": 0.4204, + "step": 1348 + }, + { + "epoch": 0.18920056100981766, + "grad_norm": 2.3111362044284456, + "learning_rate": 9.350050076137871e-06, + "loss": 0.4028, + "step": 1349 + }, + { + "epoch": 0.18934081346423562, + "grad_norm": 2.980511786273907, + "learning_rate": 9.348929823915005e-06, + "loss": 0.4341, + "step": 1350 + }, + { + "epoch": 0.18948106591865357, + "grad_norm": 1.936557048096184, + "learning_rate": 9.347808674320855e-06, + "loss": 0.4075, + "step": 1351 + }, + { + "epoch": 0.18962131837307153, + "grad_norm": 2.6976251825200275, + "learning_rate": 9.346686627586761e-06, + "loss": 0.4315, + "step": 1352 + }, + { + "epoch": 0.18976157082748948, + "grad_norm": 2.407604480349974, + "learning_rate": 9.345563683944249e-06, + "loss": 0.3889, + "step": 1353 + }, + { + "epoch": 0.18990182328190744, + "grad_norm": 2.501290465127207, + "learning_rate": 9.344439843625034e-06, + "loss": 0.4287, + "step": 1354 + }, + { + "epoch": 0.1900420757363254, + "grad_norm": 2.50757177035308, + "learning_rate": 9.343315106861008e-06, + "loss": 0.3932, + "step": 1355 + }, + { + "epoch": 0.19018232819074335, + "grad_norm": 1.9337898540273668, + "learning_rate": 9.342189473884254e-06, + "loss": 0.4143, + "step": 1356 + }, + { + "epoch": 0.19032258064516128, + "grad_norm": 3.0467529272420832, + "learning_rate": 9.341062944927039e-06, + "loss": 0.4613, + "step": 1357 + }, + { + "epoch": 0.19046283309957923, + "grad_norm": 2.042810262509889, + "learning_rate": 9.339935520221816e-06, + "loss": 0.4288, + "step": 1358 + }, + { + "epoch": 0.1906030855539972, + "grad_norm": 2.2597999141534757, + "learning_rate": 9.338807200001218e-06, + "loss": 0.3858, + "step": 1359 + }, + { + "epoch": 0.19074333800841514, + "grad_norm": 2.200438031025038, + "learning_rate": 9.337677984498069e-06, + "loss": 0.389, + "step": 1360 + }, + { + "epoch": 0.1908835904628331, + "grad_norm": 2.014207089888786, + "learning_rate": 9.336547873945372e-06, + "loss": 0.3699, + "step": 1361 + }, + { + "epoch": 0.19102384291725105, + "grad_norm": 1.9686073978186245, + "learning_rate": 9.33541686857632e-06, + "loss": 0.3764, + "step": 1362 + }, + { + "epoch": 0.191164095371669, + "grad_norm": 2.473033713506233, + "learning_rate": 9.334284968624286e-06, + "loss": 0.435, + "step": 1363 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 2.099282996071567, + "learning_rate": 9.33315217432283e-06, + "loss": 0.4149, + "step": 1364 + }, + { + "epoch": 0.19144460028050492, + "grad_norm": 2.178538467372289, + "learning_rate": 9.332018485905699e-06, + "loss": 0.386, + "step": 1365 + }, + { + "epoch": 0.19158485273492287, + "grad_norm": 2.190162346453066, + "learning_rate": 9.330883903606816e-06, + "loss": 0.3973, + "step": 1366 + }, + { + "epoch": 0.1917251051893408, + "grad_norm": 2.5280598446969447, + "learning_rate": 9.329748427660299e-06, + "loss": 0.411, + "step": 1367 + }, + { + "epoch": 0.19186535764375875, + "grad_norm": 2.2984575282403537, + "learning_rate": 9.328612058300443e-06, + "loss": 0.4265, + "step": 1368 + }, + { + "epoch": 0.1920056100981767, + "grad_norm": 2.063347151201084, + "learning_rate": 9.327474795761734e-06, + "loss": 0.4387, + "step": 1369 + }, + { + "epoch": 0.19214586255259467, + "grad_norm": 2.006159595579826, + "learning_rate": 9.326336640278831e-06, + "loss": 0.3644, + "step": 1370 + }, + { + "epoch": 0.19228611500701262, + "grad_norm": 3.438110158694435, + "learning_rate": 9.32519759208659e-06, + "loss": 0.3748, + "step": 1371 + }, + { + "epoch": 0.19242636746143058, + "grad_norm": 2.0239572925208282, + "learning_rate": 9.324057651420045e-06, + "loss": 0.4758, + "step": 1372 + }, + { + "epoch": 0.19256661991584853, + "grad_norm": 2.21528236311621, + "learning_rate": 9.322916818514414e-06, + "loss": 0.383, + "step": 1373 + }, + { + "epoch": 0.19270687237026649, + "grad_norm": 2.3135107769736756, + "learning_rate": 9.321775093605097e-06, + "loss": 0.3693, + "step": 1374 + }, + { + "epoch": 0.19284712482468444, + "grad_norm": 2.2661269059498035, + "learning_rate": 9.320632476927687e-06, + "loss": 0.4537, + "step": 1375 + }, + { + "epoch": 0.1929873772791024, + "grad_norm": 3.500027656445907, + "learning_rate": 9.31948896871795e-06, + "loss": 0.4226, + "step": 1376 + }, + { + "epoch": 0.19312762973352032, + "grad_norm": 2.7175424459137743, + "learning_rate": 9.318344569211843e-06, + "loss": 0.3953, + "step": 1377 + }, + { + "epoch": 0.19326788218793828, + "grad_norm": 2.4276732899721574, + "learning_rate": 9.317199278645506e-06, + "loss": 0.447, + "step": 1378 + }, + { + "epoch": 0.19340813464235623, + "grad_norm": 3.0923585170740013, + "learning_rate": 9.316053097255258e-06, + "loss": 0.4311, + "step": 1379 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 2.499313641744383, + "learning_rate": 9.314906025277609e-06, + "loss": 0.4257, + "step": 1380 + }, + { + "epoch": 0.19368863955119214, + "grad_norm": 2.2872057991322574, + "learning_rate": 9.313758062949246e-06, + "loss": 0.4493, + "step": 1381 + }, + { + "epoch": 0.1938288920056101, + "grad_norm": 4.697298515347429, + "learning_rate": 9.312609210507046e-06, + "loss": 0.4156, + "step": 1382 + }, + { + "epoch": 0.19396914446002805, + "grad_norm": 2.701639285300845, + "learning_rate": 9.311459468188066e-06, + "loss": 0.3711, + "step": 1383 + }, + { + "epoch": 0.194109396914446, + "grad_norm": 2.3189878439442775, + "learning_rate": 9.310308836229548e-06, + "loss": 0.4267, + "step": 1384 + }, + { + "epoch": 0.19424964936886396, + "grad_norm": 4.523857874599961, + "learning_rate": 9.309157314868916e-06, + "loss": 0.4061, + "step": 1385 + }, + { + "epoch": 0.19438990182328192, + "grad_norm": 2.713873600686242, + "learning_rate": 9.308004904343776e-06, + "loss": 0.4249, + "step": 1386 + }, + { + "epoch": 0.19453015427769985, + "grad_norm": 1.772783738684182, + "learning_rate": 9.306851604891925e-06, + "loss": 0.4243, + "step": 1387 + }, + { + "epoch": 0.1946704067321178, + "grad_norm": 2.4064009978241576, + "learning_rate": 9.305697416751333e-06, + "loss": 0.4165, + "step": 1388 + }, + { + "epoch": 0.19481065918653576, + "grad_norm": 2.1153757148502077, + "learning_rate": 9.304542340160162e-06, + "loss": 0.3876, + "step": 1389 + }, + { + "epoch": 0.1949509116409537, + "grad_norm": 2.384502474475808, + "learning_rate": 9.303386375356752e-06, + "loss": 0.4414, + "step": 1390 + }, + { + "epoch": 0.19509116409537167, + "grad_norm": 2.489382896707716, + "learning_rate": 9.302229522579631e-06, + "loss": 0.4028, + "step": 1391 + }, + { + "epoch": 0.19523141654978962, + "grad_norm": 1.977480588448759, + "learning_rate": 9.301071782067504e-06, + "loss": 0.4126, + "step": 1392 + }, + { + "epoch": 0.19537166900420758, + "grad_norm": 2.118782709369969, + "learning_rate": 9.299913154059265e-06, + "loss": 0.4234, + "step": 1393 + }, + { + "epoch": 0.19551192145862553, + "grad_norm": 2.380494137444876, + "learning_rate": 9.29875363879399e-06, + "loss": 0.4174, + "step": 1394 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 2.6250699351849307, + "learning_rate": 9.297593236510933e-06, + "loss": 0.458, + "step": 1395 + }, + { + "epoch": 0.19579242636746144, + "grad_norm": 2.9620154841686075, + "learning_rate": 9.296431947449538e-06, + "loss": 0.4409, + "step": 1396 + }, + { + "epoch": 0.19593267882187937, + "grad_norm": 2.3089261662840843, + "learning_rate": 9.295269771849426e-06, + "loss": 0.4022, + "step": 1397 + }, + { + "epoch": 0.19607293127629732, + "grad_norm": 1.9510896377110778, + "learning_rate": 9.294106709950408e-06, + "loss": 0.4083, + "step": 1398 + }, + { + "epoch": 0.19621318373071528, + "grad_norm": 2.206352196938933, + "learning_rate": 9.292942761992471e-06, + "loss": 0.3777, + "step": 1399 + }, + { + "epoch": 0.19635343618513323, + "grad_norm": 1.8893318482776549, + "learning_rate": 9.291777928215787e-06, + "loss": 0.4069, + "step": 1400 + }, + { + "epoch": 0.1964936886395512, + "grad_norm": 2.592875505582783, + "learning_rate": 9.290612208860713e-06, + "loss": 0.3711, + "step": 1401 + }, + { + "epoch": 0.19663394109396914, + "grad_norm": 2.2361284325075563, + "learning_rate": 9.289445604167786e-06, + "loss": 0.4059, + "step": 1402 + }, + { + "epoch": 0.1967741935483871, + "grad_norm": 1.8215574010599955, + "learning_rate": 9.288278114377727e-06, + "loss": 0.4116, + "step": 1403 + }, + { + "epoch": 0.19691444600280505, + "grad_norm": 2.4257357434904807, + "learning_rate": 9.28710973973144e-06, + "loss": 0.4123, + "step": 1404 + }, + { + "epoch": 0.197054698457223, + "grad_norm": 2.92372537864309, + "learning_rate": 9.28594048047001e-06, + "loss": 0.4824, + "step": 1405 + }, + { + "epoch": 0.19719495091164096, + "grad_norm": 2.236373541091785, + "learning_rate": 9.284770336834706e-06, + "loss": 0.393, + "step": 1406 + }, + { + "epoch": 0.1973352033660589, + "grad_norm": 2.3508061304674217, + "learning_rate": 9.283599309066977e-06, + "loss": 0.4082, + "step": 1407 + }, + { + "epoch": 0.19747545582047685, + "grad_norm": 6.242480828527329, + "learning_rate": 9.28242739740846e-06, + "loss": 0.3843, + "step": 1408 + }, + { + "epoch": 0.1976157082748948, + "grad_norm": 2.343863014039593, + "learning_rate": 9.281254602100968e-06, + "loss": 0.415, + "step": 1409 + }, + { + "epoch": 0.19775596072931276, + "grad_norm": 2.5285849022063736, + "learning_rate": 9.280080923386501e-06, + "loss": 0.4191, + "step": 1410 + }, + { + "epoch": 0.1978962131837307, + "grad_norm": 2.886189454816124, + "learning_rate": 9.278906361507238e-06, + "loss": 0.3954, + "step": 1411 + }, + { + "epoch": 0.19803646563814867, + "grad_norm": 2.471271434453646, + "learning_rate": 9.277730916705544e-06, + "loss": 0.3775, + "step": 1412 + }, + { + "epoch": 0.19817671809256662, + "grad_norm": 3.3561141400392924, + "learning_rate": 9.276554589223958e-06, + "loss": 0.4067, + "step": 1413 + }, + { + "epoch": 0.19831697054698458, + "grad_norm": 2.1284411000943964, + "learning_rate": 9.275377379305214e-06, + "loss": 0.3623, + "step": 1414 + }, + { + "epoch": 0.19845722300140253, + "grad_norm": 2.1647020627436877, + "learning_rate": 9.274199287192215e-06, + "loss": 0.3959, + "step": 1415 + }, + { + "epoch": 0.1985974754558205, + "grad_norm": 1.9409602859970183, + "learning_rate": 9.273020313128059e-06, + "loss": 0.3607, + "step": 1416 + }, + { + "epoch": 0.19873772791023842, + "grad_norm": 2.194122643623745, + "learning_rate": 9.271840457356013e-06, + "loss": 0.3979, + "step": 1417 + }, + { + "epoch": 0.19887798036465637, + "grad_norm": 2.058356197194156, + "learning_rate": 9.270659720119533e-06, + "loss": 0.4303, + "step": 1418 + }, + { + "epoch": 0.19901823281907433, + "grad_norm": 2.9008734407166363, + "learning_rate": 9.26947810166226e-06, + "loss": 0.4401, + "step": 1419 + }, + { + "epoch": 0.19915848527349228, + "grad_norm": 2.3008698477145173, + "learning_rate": 9.268295602228005e-06, + "loss": 0.3912, + "step": 1420 + }, + { + "epoch": 0.19929873772791024, + "grad_norm": 1.9900117249838303, + "learning_rate": 9.267112222060777e-06, + "loss": 0.4056, + "step": 1421 + }, + { + "epoch": 0.1994389901823282, + "grad_norm": 1.6835911301594684, + "learning_rate": 9.265927961404754e-06, + "loss": 0.4402, + "step": 1422 + }, + { + "epoch": 0.19957924263674615, + "grad_norm": 2.769281760789626, + "learning_rate": 9.2647428205043e-06, + "loss": 0.4287, + "step": 1423 + }, + { + "epoch": 0.1997194950911641, + "grad_norm": 2.4279233163856357, + "learning_rate": 9.26355679960396e-06, + "loss": 0.391, + "step": 1424 + }, + { + "epoch": 0.19985974754558206, + "grad_norm": 2.7814875297698856, + "learning_rate": 9.262369898948462e-06, + "loss": 0.3719, + "step": 1425 + }, + { + "epoch": 0.2, + "grad_norm": 2.462462712504218, + "learning_rate": 9.261182118782714e-06, + "loss": 0.3894, + "step": 1426 + }, + { + "epoch": 0.20014025245441797, + "grad_norm": 2.2714046028017347, + "learning_rate": 9.259993459351809e-06, + "loss": 0.3685, + "step": 1427 + }, + { + "epoch": 0.2002805049088359, + "grad_norm": 1.8996433776661321, + "learning_rate": 9.258803920901014e-06, + "loss": 0.3699, + "step": 1428 + }, + { + "epoch": 0.20042075736325385, + "grad_norm": 2.4279677999268627, + "learning_rate": 9.257613503675787e-06, + "loss": 0.3952, + "step": 1429 + }, + { + "epoch": 0.2005610098176718, + "grad_norm": 2.5510466877109206, + "learning_rate": 9.256422207921757e-06, + "loss": 0.3901, + "step": 1430 + }, + { + "epoch": 0.20070126227208976, + "grad_norm": 2.087034815506554, + "learning_rate": 9.255230033884743e-06, + "loss": 0.3665, + "step": 1431 + }, + { + "epoch": 0.2008415147265077, + "grad_norm": 1.796151056823173, + "learning_rate": 9.254036981810741e-06, + "loss": 0.4049, + "step": 1432 + }, + { + "epoch": 0.20098176718092567, + "grad_norm": 2.5513892858407226, + "learning_rate": 9.252843051945928e-06, + "loss": 0.3709, + "step": 1433 + }, + { + "epoch": 0.20112201963534362, + "grad_norm": 2.0344492440219453, + "learning_rate": 9.251648244536665e-06, + "loss": 0.4165, + "step": 1434 + }, + { + "epoch": 0.20126227208976158, + "grad_norm": 2.146802689446459, + "learning_rate": 9.25045255982949e-06, + "loss": 0.4006, + "step": 1435 + }, + { + "epoch": 0.20140252454417953, + "grad_norm": 2.0697541257630774, + "learning_rate": 9.249255998071127e-06, + "loss": 0.4222, + "step": 1436 + }, + { + "epoch": 0.2015427769985975, + "grad_norm": 2.232451778288349, + "learning_rate": 9.248058559508476e-06, + "loss": 0.3758, + "step": 1437 + }, + { + "epoch": 0.20168302945301542, + "grad_norm": 3.4215996339688517, + "learning_rate": 9.246860244388621e-06, + "loss": 0.4274, + "step": 1438 + }, + { + "epoch": 0.20182328190743337, + "grad_norm": 2.863454607910686, + "learning_rate": 9.245661052958823e-06, + "loss": 0.3895, + "step": 1439 + }, + { + "epoch": 0.20196353436185133, + "grad_norm": 2.1933680709978995, + "learning_rate": 9.244460985466532e-06, + "loss": 0.3714, + "step": 1440 + }, + { + "epoch": 0.20210378681626928, + "grad_norm": 2.669707104765996, + "learning_rate": 9.24326004215937e-06, + "loss": 0.4157, + "step": 1441 + }, + { + "epoch": 0.20224403927068724, + "grad_norm": 3.4700354866170096, + "learning_rate": 9.242058223285143e-06, + "loss": 0.3998, + "step": 1442 + }, + { + "epoch": 0.2023842917251052, + "grad_norm": 2.505279592873909, + "learning_rate": 9.24085552909184e-06, + "loss": 0.4437, + "step": 1443 + }, + { + "epoch": 0.20252454417952315, + "grad_norm": 3.434467434528662, + "learning_rate": 9.239651959827627e-06, + "loss": 0.3708, + "step": 1444 + }, + { + "epoch": 0.2026647966339411, + "grad_norm": 2.5130967413682725, + "learning_rate": 9.238447515740853e-06, + "loss": 0.4148, + "step": 1445 + }, + { + "epoch": 0.20280504908835906, + "grad_norm": 2.105501022516173, + "learning_rate": 9.237242197080045e-06, + "loss": 0.411, + "step": 1446 + }, + { + "epoch": 0.202945301542777, + "grad_norm": 1.8854235513747604, + "learning_rate": 9.236036004093916e-06, + "loss": 0.3427, + "step": 1447 + }, + { + "epoch": 0.20308555399719494, + "grad_norm": 2.55042155652096, + "learning_rate": 9.23482893703135e-06, + "loss": 0.3727, + "step": 1448 + }, + { + "epoch": 0.2032258064516129, + "grad_norm": 2.420095368232234, + "learning_rate": 9.233620996141421e-06, + "loss": 0.4375, + "step": 1449 + }, + { + "epoch": 0.20336605890603085, + "grad_norm": 2.0828050960972937, + "learning_rate": 9.232412181673378e-06, + "loss": 0.3818, + "step": 1450 + }, + { + "epoch": 0.2035063113604488, + "grad_norm": 4.233784426787666, + "learning_rate": 9.231202493876652e-06, + "loss": 0.434, + "step": 1451 + }, + { + "epoch": 0.20364656381486676, + "grad_norm": 2.1116800782405782, + "learning_rate": 9.229991933000852e-06, + "loss": 0.4015, + "step": 1452 + }, + { + "epoch": 0.20378681626928472, + "grad_norm": 2.615944911849222, + "learning_rate": 9.22878049929577e-06, + "loss": 0.3836, + "step": 1453 + }, + { + "epoch": 0.20392706872370267, + "grad_norm": 4.375384504587943, + "learning_rate": 9.227568193011375e-06, + "loss": 0.4528, + "step": 1454 + }, + { + "epoch": 0.20406732117812063, + "grad_norm": 3.3040938328864926, + "learning_rate": 9.226355014397823e-06, + "loss": 0.4139, + "step": 1455 + }, + { + "epoch": 0.20420757363253858, + "grad_norm": 2.314455803334421, + "learning_rate": 9.225140963705439e-06, + "loss": 0.3664, + "step": 1456 + }, + { + "epoch": 0.20434782608695654, + "grad_norm": 2.0065778803061414, + "learning_rate": 9.223926041184737e-06, + "loss": 0.385, + "step": 1457 + }, + { + "epoch": 0.20448807854137446, + "grad_norm": 2.186802998262635, + "learning_rate": 9.222710247086405e-06, + "loss": 0.4081, + "step": 1458 + }, + { + "epoch": 0.20462833099579242, + "grad_norm": 2.475878696323528, + "learning_rate": 9.221493581661318e-06, + "loss": 0.4083, + "step": 1459 + }, + { + "epoch": 0.20476858345021037, + "grad_norm": 2.719296630627898, + "learning_rate": 9.220276045160524e-06, + "loss": 0.4588, + "step": 1460 + }, + { + "epoch": 0.20490883590462833, + "grad_norm": 2.2386737366194165, + "learning_rate": 9.219057637835252e-06, + "loss": 0.3836, + "step": 1461 + }, + { + "epoch": 0.20504908835904628, + "grad_norm": 2.3176780670132886, + "learning_rate": 9.217838359936914e-06, + "loss": 0.4386, + "step": 1462 + }, + { + "epoch": 0.20518934081346424, + "grad_norm": 2.8205283127317022, + "learning_rate": 9.216618211717098e-06, + "loss": 0.404, + "step": 1463 + }, + { + "epoch": 0.2053295932678822, + "grad_norm": 2.394257123729619, + "learning_rate": 9.215397193427575e-06, + "loss": 0.3781, + "step": 1464 + }, + { + "epoch": 0.20546984572230015, + "grad_norm": 3.109693654663509, + "learning_rate": 9.21417530532029e-06, + "loss": 0.4384, + "step": 1465 + }, + { + "epoch": 0.2056100981767181, + "grad_norm": 2.2488498927221303, + "learning_rate": 9.212952547647375e-06, + "loss": 0.4156, + "step": 1466 + }, + { + "epoch": 0.20575035063113606, + "grad_norm": 2.986240782374211, + "learning_rate": 9.211728920661136e-06, + "loss": 0.4524, + "step": 1467 + }, + { + "epoch": 0.20589060308555399, + "grad_norm": 2.354129745959072, + "learning_rate": 9.21050442461406e-06, + "loss": 0.3689, + "step": 1468 + }, + { + "epoch": 0.20603085553997194, + "grad_norm": 2.8101150573307514, + "learning_rate": 9.20927905975881e-06, + "loss": 0.427, + "step": 1469 + }, + { + "epoch": 0.2061711079943899, + "grad_norm": 2.3664697748765042, + "learning_rate": 9.208052826348238e-06, + "loss": 0.4078, + "step": 1470 + }, + { + "epoch": 0.20631136044880785, + "grad_norm": 3.456054549528466, + "learning_rate": 9.206825724635363e-06, + "loss": 0.4205, + "step": 1471 + }, + { + "epoch": 0.2064516129032258, + "grad_norm": 2.734210548905228, + "learning_rate": 9.205597754873391e-06, + "loss": 0.3855, + "step": 1472 + }, + { + "epoch": 0.20659186535764376, + "grad_norm": 2.876762679064388, + "learning_rate": 9.204368917315706e-06, + "loss": 0.4278, + "step": 1473 + }, + { + "epoch": 0.20673211781206172, + "grad_norm": 2.3318016156438435, + "learning_rate": 9.203139212215868e-06, + "loss": 0.4172, + "step": 1474 + }, + { + "epoch": 0.20687237026647967, + "grad_norm": 2.9254057252330936, + "learning_rate": 9.201908639827619e-06, + "loss": 0.452, + "step": 1475 + }, + { + "epoch": 0.20701262272089763, + "grad_norm": 2.717553500269121, + "learning_rate": 9.20067720040488e-06, + "loss": 0.4106, + "step": 1476 + }, + { + "epoch": 0.20715287517531558, + "grad_norm": 2.114915029383557, + "learning_rate": 9.199444894201748e-06, + "loss": 0.4107, + "step": 1477 + }, + { + "epoch": 0.2072931276297335, + "grad_norm": 2.726447772478814, + "learning_rate": 9.198211721472503e-06, + "loss": 0.4128, + "step": 1478 + }, + { + "epoch": 0.20743338008415146, + "grad_norm": 2.5159765910043626, + "learning_rate": 9.1969776824716e-06, + "loss": 0.3829, + "step": 1479 + }, + { + "epoch": 0.20757363253856942, + "grad_norm": 2.1955806256204426, + "learning_rate": 9.195742777453674e-06, + "loss": 0.3776, + "step": 1480 + }, + { + "epoch": 0.20771388499298737, + "grad_norm": 2.57472504194049, + "learning_rate": 9.19450700667354e-06, + "loss": 0.4003, + "step": 1481 + }, + { + "epoch": 0.20785413744740533, + "grad_norm": 2.0983922935041828, + "learning_rate": 9.193270370386188e-06, + "loss": 0.3897, + "step": 1482 + }, + { + "epoch": 0.20799438990182328, + "grad_norm": 2.236613825909605, + "learning_rate": 9.192032868846794e-06, + "loss": 0.4157, + "step": 1483 + }, + { + "epoch": 0.20813464235624124, + "grad_norm": 2.1168935649137723, + "learning_rate": 9.190794502310704e-06, + "loss": 0.4028, + "step": 1484 + }, + { + "epoch": 0.2082748948106592, + "grad_norm": 2.199760736112584, + "learning_rate": 9.18955527103345e-06, + "loss": 0.3974, + "step": 1485 + }, + { + "epoch": 0.20841514726507715, + "grad_norm": 1.7899034014801607, + "learning_rate": 9.188315175270735e-06, + "loss": 0.4169, + "step": 1486 + }, + { + "epoch": 0.2085553997194951, + "grad_norm": 2.1197719213767376, + "learning_rate": 9.187074215278444e-06, + "loss": 0.3827, + "step": 1487 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 6.672248347465336, + "learning_rate": 9.185832391312644e-06, + "loss": 0.4774, + "step": 1488 + }, + { + "epoch": 0.208835904628331, + "grad_norm": 2.302661276604301, + "learning_rate": 9.184589703629575e-06, + "loss": 0.4104, + "step": 1489 + }, + { + "epoch": 0.20897615708274894, + "grad_norm": 2.700757969321983, + "learning_rate": 9.183346152485654e-06, + "loss": 0.4013, + "step": 1490 + }, + { + "epoch": 0.2091164095371669, + "grad_norm": 1.9589725087073622, + "learning_rate": 9.182101738137483e-06, + "loss": 0.4001, + "step": 1491 + }, + { + "epoch": 0.20925666199158485, + "grad_norm": 3.34356817303709, + "learning_rate": 9.180856460841836e-06, + "loss": 0.3882, + "step": 1492 + }, + { + "epoch": 0.2093969144460028, + "grad_norm": 3.005874445371172, + "learning_rate": 9.17961032085567e-06, + "loss": 0.3971, + "step": 1493 + }, + { + "epoch": 0.20953716690042076, + "grad_norm": 1.9845622417389834, + "learning_rate": 9.178363318436115e-06, + "loss": 0.3454, + "step": 1494 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 2.2403692123422614, + "learning_rate": 9.17711545384048e-06, + "loss": 0.3917, + "step": 1495 + }, + { + "epoch": 0.20981767180925667, + "grad_norm": 2.483576808318475, + "learning_rate": 9.175866727326255e-06, + "loss": 0.3951, + "step": 1496 + }, + { + "epoch": 0.20995792426367463, + "grad_norm": 2.048815091712597, + "learning_rate": 9.174617139151108e-06, + "loss": 0.3927, + "step": 1497 + }, + { + "epoch": 0.21009817671809256, + "grad_norm": 1.9829325201506038, + "learning_rate": 9.173366689572878e-06, + "loss": 0.3798, + "step": 1498 + }, + { + "epoch": 0.2102384291725105, + "grad_norm": 1.8619798638046845, + "learning_rate": 9.172115378849588e-06, + "loss": 0.3571, + "step": 1499 + }, + { + "epoch": 0.21037868162692847, + "grad_norm": 1.9489584464218077, + "learning_rate": 9.170863207239442e-06, + "loss": 0.3732, + "step": 1500 + }, + { + "epoch": 0.21051893408134642, + "grad_norm": 2.0950831254295657, + "learning_rate": 9.169610175000812e-06, + "loss": 0.4218, + "step": 1501 + }, + { + "epoch": 0.21065918653576438, + "grad_norm": 1.8420837761407387, + "learning_rate": 9.168356282392253e-06, + "loss": 0.3985, + "step": 1502 + }, + { + "epoch": 0.21079943899018233, + "grad_norm": 2.015921282616536, + "learning_rate": 9.167101529672496e-06, + "loss": 0.4444, + "step": 1503 + }, + { + "epoch": 0.21093969144460029, + "grad_norm": 2.042364494283169, + "learning_rate": 9.165845917100454e-06, + "loss": 0.3971, + "step": 1504 + }, + { + "epoch": 0.21107994389901824, + "grad_norm": 2.4184792615984407, + "learning_rate": 9.164589444935211e-06, + "loss": 0.4013, + "step": 1505 + }, + { + "epoch": 0.2112201963534362, + "grad_norm": 2.601623167871354, + "learning_rate": 9.163332113436031e-06, + "loss": 0.3685, + "step": 1506 + }, + { + "epoch": 0.21136044880785415, + "grad_norm": 2.190409659822618, + "learning_rate": 9.16207392286236e-06, + "loss": 0.3305, + "step": 1507 + }, + { + "epoch": 0.21150070126227208, + "grad_norm": 2.541480031483082, + "learning_rate": 9.160814873473811e-06, + "loss": 0.4558, + "step": 1508 + }, + { + "epoch": 0.21164095371669003, + "grad_norm": 2.623477494262688, + "learning_rate": 9.159554965530184e-06, + "loss": 0.4059, + "step": 1509 + }, + { + "epoch": 0.211781206171108, + "grad_norm": 2.6626097893801357, + "learning_rate": 9.15829419929145e-06, + "loss": 0.4111, + "step": 1510 + }, + { + "epoch": 0.21192145862552594, + "grad_norm": 2.600870401235836, + "learning_rate": 9.157032575017762e-06, + "loss": 0.4285, + "step": 1511 + }, + { + "epoch": 0.2120617110799439, + "grad_norm": 1.8254412653221328, + "learning_rate": 9.155770092969443e-06, + "loss": 0.4262, + "step": 1512 + }, + { + "epoch": 0.21220196353436185, + "grad_norm": 2.2689426729131585, + "learning_rate": 9.154506753407e-06, + "loss": 0.4277, + "step": 1513 + }, + { + "epoch": 0.2123422159887798, + "grad_norm": 2.63141592775659, + "learning_rate": 9.153242556591115e-06, + "loss": 0.3921, + "step": 1514 + }, + { + "epoch": 0.21248246844319776, + "grad_norm": 2.239057209996259, + "learning_rate": 9.151977502782645e-06, + "loss": 0.3789, + "step": 1515 + }, + { + "epoch": 0.21262272089761572, + "grad_norm": 3.3126547435388036, + "learning_rate": 9.150711592242627e-06, + "loss": 0.4585, + "step": 1516 + }, + { + "epoch": 0.21276297335203367, + "grad_norm": 3.024333180475233, + "learning_rate": 9.149444825232269e-06, + "loss": 0.3752, + "step": 1517 + }, + { + "epoch": 0.2129032258064516, + "grad_norm": 2.314355054620543, + "learning_rate": 9.148177202012962e-06, + "loss": 0.4336, + "step": 1518 + }, + { + "epoch": 0.21304347826086956, + "grad_norm": 2.9054186872829684, + "learning_rate": 9.146908722846271e-06, + "loss": 0.3722, + "step": 1519 + }, + { + "epoch": 0.2131837307152875, + "grad_norm": 2.0111219391200548, + "learning_rate": 9.145639387993939e-06, + "loss": 0.4274, + "step": 1520 + }, + { + "epoch": 0.21332398316970547, + "grad_norm": 3.9138090923457325, + "learning_rate": 9.14436919771788e-06, + "loss": 0.4008, + "step": 1521 + }, + { + "epoch": 0.21346423562412342, + "grad_norm": 3.6613886009438525, + "learning_rate": 9.143098152280195e-06, + "loss": 0.427, + "step": 1522 + }, + { + "epoch": 0.21360448807854138, + "grad_norm": 2.242507007974925, + "learning_rate": 9.14182625194315e-06, + "loss": 0.4512, + "step": 1523 + }, + { + "epoch": 0.21374474053295933, + "grad_norm": 2.700592265691038, + "learning_rate": 9.140553496969195e-06, + "loss": 0.4084, + "step": 1524 + }, + { + "epoch": 0.2138849929873773, + "grad_norm": 2.586223206194609, + "learning_rate": 9.139279887620955e-06, + "loss": 0.344, + "step": 1525 + }, + { + "epoch": 0.21402524544179524, + "grad_norm": 2.480895190490221, + "learning_rate": 9.13800542416123e-06, + "loss": 0.377, + "step": 1526 + }, + { + "epoch": 0.2141654978962132, + "grad_norm": 2.6939844370210495, + "learning_rate": 9.136730106852995e-06, + "loss": 0.4272, + "step": 1527 + }, + { + "epoch": 0.21430575035063112, + "grad_norm": 2.335792970811034, + "learning_rate": 9.135453935959405e-06, + "loss": 0.3912, + "step": 1528 + }, + { + "epoch": 0.21444600280504908, + "grad_norm": 2.0814241307818375, + "learning_rate": 9.134176911743787e-06, + "loss": 0.418, + "step": 1529 + }, + { + "epoch": 0.21458625525946703, + "grad_norm": 2.561429055816967, + "learning_rate": 9.132899034469648e-06, + "loss": 0.4307, + "step": 1530 + }, + { + "epoch": 0.214726507713885, + "grad_norm": 2.2047880132244133, + "learning_rate": 9.131620304400667e-06, + "loss": 0.4589, + "step": 1531 + }, + { + "epoch": 0.21486676016830294, + "grad_norm": 2.092484504523836, + "learning_rate": 9.1303407218007e-06, + "loss": 0.4, + "step": 1532 + }, + { + "epoch": 0.2150070126227209, + "grad_norm": 2.417213531531512, + "learning_rate": 9.129060286933786e-06, + "loss": 0.4325, + "step": 1533 + }, + { + "epoch": 0.21514726507713886, + "grad_norm": 3.479223985637649, + "learning_rate": 9.127779000064127e-06, + "loss": 0.4306, + "step": 1534 + }, + { + "epoch": 0.2152875175315568, + "grad_norm": 2.045423968886132, + "learning_rate": 9.126496861456113e-06, + "loss": 0.3936, + "step": 1535 + }, + { + "epoch": 0.21542776998597477, + "grad_norm": 3.1986736350206884, + "learning_rate": 9.125213871374298e-06, + "loss": 0.3774, + "step": 1536 + }, + { + "epoch": 0.21556802244039272, + "grad_norm": 3.076003684638189, + "learning_rate": 9.123930030083425e-06, + "loss": 0.4076, + "step": 1537 + }, + { + "epoch": 0.21570827489481065, + "grad_norm": 2.372546484665667, + "learning_rate": 9.1226453378484e-06, + "loss": 0.4431, + "step": 1538 + }, + { + "epoch": 0.2158485273492286, + "grad_norm": 4.21751781412375, + "learning_rate": 9.121359794934312e-06, + "loss": 0.3694, + "step": 1539 + }, + { + "epoch": 0.21598877980364656, + "grad_norm": 2.9005920134733683, + "learning_rate": 9.120073401606427e-06, + "loss": 0.4202, + "step": 1540 + }, + { + "epoch": 0.2161290322580645, + "grad_norm": 1.9262216308207813, + "learning_rate": 9.11878615813018e-06, + "loss": 0.3786, + "step": 1541 + }, + { + "epoch": 0.21626928471248247, + "grad_norm": 3.183721359597388, + "learning_rate": 9.117498064771185e-06, + "loss": 0.3864, + "step": 1542 + }, + { + "epoch": 0.21640953716690042, + "grad_norm": 2.303215979509217, + "learning_rate": 9.11620912179523e-06, + "loss": 0.4329, + "step": 1543 + }, + { + "epoch": 0.21654978962131838, + "grad_norm": 2.265762745846327, + "learning_rate": 9.114919329468283e-06, + "loss": 0.4413, + "step": 1544 + }, + { + "epoch": 0.21669004207573633, + "grad_norm": 2.1847844298146994, + "learning_rate": 9.113628688056481e-06, + "loss": 0.3797, + "step": 1545 + }, + { + "epoch": 0.2168302945301543, + "grad_norm": 2.411927201654379, + "learning_rate": 9.112337197826138e-06, + "loss": 0.4022, + "step": 1546 + }, + { + "epoch": 0.21697054698457224, + "grad_norm": 3.038010442096778, + "learning_rate": 9.111044859043747e-06, + "loss": 0.3552, + "step": 1547 + }, + { + "epoch": 0.21711079943899017, + "grad_norm": 2.491823941203527, + "learning_rate": 9.10975167197597e-06, + "loss": 0.3809, + "step": 1548 + }, + { + "epoch": 0.21725105189340813, + "grad_norm": 2.0882758703137543, + "learning_rate": 9.10845763688965e-06, + "loss": 0.4056, + "step": 1549 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 1.8782589882074907, + "learning_rate": 9.107162754051799e-06, + "loss": 0.3244, + "step": 1550 + }, + { + "epoch": 0.21753155680224404, + "grad_norm": 1.838473382677785, + "learning_rate": 9.10586702372961e-06, + "loss": 0.3706, + "step": 1551 + }, + { + "epoch": 0.217671809256662, + "grad_norm": 5.9059399720218115, + "learning_rate": 9.104570446190445e-06, + "loss": 0.4006, + "step": 1552 + }, + { + "epoch": 0.21781206171107995, + "grad_norm": 4.79985167751032, + "learning_rate": 9.103273021701846e-06, + "loss": 0.3793, + "step": 1553 + }, + { + "epoch": 0.2179523141654979, + "grad_norm": 2.490988703153485, + "learning_rate": 9.101974750531526e-06, + "loss": 0.4171, + "step": 1554 + }, + { + "epoch": 0.21809256661991586, + "grad_norm": 2.8877136415655196, + "learning_rate": 9.100675632947376e-06, + "loss": 0.4251, + "step": 1555 + }, + { + "epoch": 0.2182328190743338, + "grad_norm": 2.589451088345622, + "learning_rate": 9.099375669217458e-06, + "loss": 0.4597, + "step": 1556 + }, + { + "epoch": 0.21837307152875177, + "grad_norm": 2.4756500777563692, + "learning_rate": 9.098074859610012e-06, + "loss": 0.3863, + "step": 1557 + }, + { + "epoch": 0.2185133239831697, + "grad_norm": 2.5113724489108415, + "learning_rate": 9.09677320439345e-06, + "loss": 0.3952, + "step": 1558 + }, + { + "epoch": 0.21865357643758765, + "grad_norm": 2.2457956663616634, + "learning_rate": 9.095470703836358e-06, + "loss": 0.3877, + "step": 1559 + }, + { + "epoch": 0.2187938288920056, + "grad_norm": 2.5690051061179324, + "learning_rate": 9.094167358207502e-06, + "loss": 0.3553, + "step": 1560 + }, + { + "epoch": 0.21893408134642356, + "grad_norm": 2.1940285399883583, + "learning_rate": 9.092863167775813e-06, + "loss": 0.3706, + "step": 1561 + }, + { + "epoch": 0.21907433380084151, + "grad_norm": 2.022564673975247, + "learning_rate": 9.091558132810407e-06, + "loss": 0.467, + "step": 1562 + }, + { + "epoch": 0.21921458625525947, + "grad_norm": 2.0309817797295735, + "learning_rate": 9.090252253580565e-06, + "loss": 0.3954, + "step": 1563 + }, + { + "epoch": 0.21935483870967742, + "grad_norm": 2.645635332122965, + "learning_rate": 9.088945530355746e-06, + "loss": 0.4259, + "step": 1564 + }, + { + "epoch": 0.21949509116409538, + "grad_norm": 2.6097401803141893, + "learning_rate": 9.087637963405586e-06, + "loss": 0.4323, + "step": 1565 + }, + { + "epoch": 0.21963534361851333, + "grad_norm": 2.6476916497937775, + "learning_rate": 9.08632955299989e-06, + "loss": 0.3635, + "step": 1566 + }, + { + "epoch": 0.2197755960729313, + "grad_norm": 2.8157562055935133, + "learning_rate": 9.085020299408642e-06, + "loss": 0.3463, + "step": 1567 + }, + { + "epoch": 0.21991584852734922, + "grad_norm": 2.0668448720004484, + "learning_rate": 9.083710202901994e-06, + "loss": 0.3498, + "step": 1568 + }, + { + "epoch": 0.22005610098176717, + "grad_norm": 2.5532363363057446, + "learning_rate": 9.082399263750276e-06, + "loss": 0.4385, + "step": 1569 + }, + { + "epoch": 0.22019635343618513, + "grad_norm": 2.5028252849616686, + "learning_rate": 9.081087482223993e-06, + "loss": 0.4157, + "step": 1570 + }, + { + "epoch": 0.22033660589060308, + "grad_norm": 3.2169142738140164, + "learning_rate": 9.07977485859382e-06, + "loss": 0.3789, + "step": 1571 + }, + { + "epoch": 0.22047685834502104, + "grad_norm": 2.9239779977349, + "learning_rate": 9.07846139313061e-06, + "loss": 0.3983, + "step": 1572 + }, + { + "epoch": 0.220617110799439, + "grad_norm": 1.8895485075321286, + "learning_rate": 9.077147086105382e-06, + "loss": 0.3849, + "step": 1573 + }, + { + "epoch": 0.22075736325385695, + "grad_norm": 2.5063486550717617, + "learning_rate": 9.075831937789341e-06, + "loss": 0.3572, + "step": 1574 + }, + { + "epoch": 0.2208976157082749, + "grad_norm": 4.627223949308136, + "learning_rate": 9.074515948453855e-06, + "loss": 0.3991, + "step": 1575 + }, + { + "epoch": 0.22103786816269286, + "grad_norm": 3.150065642384388, + "learning_rate": 9.073199118370471e-06, + "loss": 0.4272, + "step": 1576 + }, + { + "epoch": 0.2211781206171108, + "grad_norm": 2.62013666044059, + "learning_rate": 9.071881447810907e-06, + "loss": 0.395, + "step": 1577 + }, + { + "epoch": 0.22131837307152874, + "grad_norm": 2.704967422148151, + "learning_rate": 9.070562937047052e-06, + "loss": 0.3499, + "step": 1578 + }, + { + "epoch": 0.2214586255259467, + "grad_norm": 2.4416971506379226, + "learning_rate": 9.069243586350976e-06, + "loss": 0.3922, + "step": 1579 + }, + { + "epoch": 0.22159887798036465, + "grad_norm": 2.5300621730719137, + "learning_rate": 9.067923395994916e-06, + "loss": 0.3629, + "step": 1580 + }, + { + "epoch": 0.2217391304347826, + "grad_norm": 2.3300566372516127, + "learning_rate": 9.066602366251283e-06, + "loss": 0.4092, + "step": 1581 + }, + { + "epoch": 0.22187938288920056, + "grad_norm": 2.9805736998232613, + "learning_rate": 9.065280497392663e-06, + "loss": 0.3782, + "step": 1582 + }, + { + "epoch": 0.22201963534361852, + "grad_norm": 2.3213024147044465, + "learning_rate": 9.063957789691816e-06, + "loss": 0.3972, + "step": 1583 + }, + { + "epoch": 0.22215988779803647, + "grad_norm": 2.28036189101728, + "learning_rate": 9.06263424342167e-06, + "loss": 0.3893, + "step": 1584 + }, + { + "epoch": 0.22230014025245443, + "grad_norm": 3.1747847116124097, + "learning_rate": 9.061309858855334e-06, + "loss": 0.3958, + "step": 1585 + }, + { + "epoch": 0.22244039270687238, + "grad_norm": 2.012031960409847, + "learning_rate": 9.059984636266082e-06, + "loss": 0.3837, + "step": 1586 + }, + { + "epoch": 0.22258064516129034, + "grad_norm": 3.2309730809565287, + "learning_rate": 9.058658575927368e-06, + "loss": 0.3639, + "step": 1587 + }, + { + "epoch": 0.22272089761570826, + "grad_norm": 4.4926515555145805, + "learning_rate": 9.057331678112809e-06, + "loss": 0.3915, + "step": 1588 + }, + { + "epoch": 0.22286115007012622, + "grad_norm": 2.5871076903812176, + "learning_rate": 9.056003943096208e-06, + "loss": 0.4108, + "step": 1589 + }, + { + "epoch": 0.22300140252454417, + "grad_norm": 2.19299193758337, + "learning_rate": 9.05467537115153e-06, + "loss": 0.4002, + "step": 1590 + }, + { + "epoch": 0.22314165497896213, + "grad_norm": 2.6996796983124174, + "learning_rate": 9.053345962552915e-06, + "loss": 0.3931, + "step": 1591 + }, + { + "epoch": 0.22328190743338008, + "grad_norm": 2.591372951860609, + "learning_rate": 9.052015717574683e-06, + "loss": 0.4354, + "step": 1592 + }, + { + "epoch": 0.22342215988779804, + "grad_norm": 2.7543098316312573, + "learning_rate": 9.050684636491317e-06, + "loss": 0.4613, + "step": 1593 + }, + { + "epoch": 0.223562412342216, + "grad_norm": 2.5999256746965926, + "learning_rate": 9.049352719577474e-06, + "loss": 0.4052, + "step": 1594 + }, + { + "epoch": 0.22370266479663395, + "grad_norm": 3.433717206833776, + "learning_rate": 9.04801996710799e-06, + "loss": 0.3687, + "step": 1595 + }, + { + "epoch": 0.2238429172510519, + "grad_norm": 3.500176016604993, + "learning_rate": 9.046686379357867e-06, + "loss": 0.3675, + "step": 1596 + }, + { + "epoch": 0.22398316970546986, + "grad_norm": 2.290175652243173, + "learning_rate": 9.045351956602282e-06, + "loss": 0.3665, + "step": 1597 + }, + { + "epoch": 0.2241234221598878, + "grad_norm": 2.117190653105355, + "learning_rate": 9.044016699116584e-06, + "loss": 0.3596, + "step": 1598 + }, + { + "epoch": 0.22426367461430574, + "grad_norm": 2.229228782382571, + "learning_rate": 9.042680607176296e-06, + "loss": 0.4171, + "step": 1599 + }, + { + "epoch": 0.2244039270687237, + "grad_norm": 2.2541234062213307, + "learning_rate": 9.041343681057106e-06, + "loss": 0.3792, + "step": 1600 + }, + { + "epoch": 0.22454417952314165, + "grad_norm": 2.4005378200012486, + "learning_rate": 9.040005921034884e-06, + "loss": 0.4065, + "step": 1601 + }, + { + "epoch": 0.2246844319775596, + "grad_norm": 2.676985554573149, + "learning_rate": 9.038667327385664e-06, + "loss": 0.4133, + "step": 1602 + }, + { + "epoch": 0.22482468443197756, + "grad_norm": 3.708125990882431, + "learning_rate": 9.03732790038566e-06, + "loss": 0.4195, + "step": 1603 + }, + { + "epoch": 0.22496493688639552, + "grad_norm": 2.197735192880922, + "learning_rate": 9.03598764031125e-06, + "loss": 0.4185, + "step": 1604 + }, + { + "epoch": 0.22510518934081347, + "grad_norm": 1.8357170459605696, + "learning_rate": 9.034646547438987e-06, + "loss": 0.3766, + "step": 1605 + }, + { + "epoch": 0.22524544179523143, + "grad_norm": 2.702345542763752, + "learning_rate": 9.033304622045597e-06, + "loss": 0.4174, + "step": 1606 + }, + { + "epoch": 0.22538569424964938, + "grad_norm": 2.1375341446140386, + "learning_rate": 9.03196186440798e-06, + "loss": 0.3466, + "step": 1607 + }, + { + "epoch": 0.2255259467040673, + "grad_norm": 1.946357844506364, + "learning_rate": 9.0306182748032e-06, + "loss": 0.4189, + "step": 1608 + }, + { + "epoch": 0.22566619915848526, + "grad_norm": 1.8715071251397501, + "learning_rate": 9.029273853508498e-06, + "loss": 0.4265, + "step": 1609 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 2.531521347007128, + "learning_rate": 9.027928600801288e-06, + "loss": 0.4044, + "step": 1610 + }, + { + "epoch": 0.22594670406732117, + "grad_norm": 2.910527844354641, + "learning_rate": 9.026582516959153e-06, + "loss": 0.3976, + "step": 1611 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 2.219301289586148, + "learning_rate": 9.025235602259848e-06, + "loss": 0.332, + "step": 1612 + }, + { + "epoch": 0.22622720897615708, + "grad_norm": 3.07740054827471, + "learning_rate": 9.023887856981298e-06, + "loss": 0.3935, + "step": 1613 + }, + { + "epoch": 0.22636746143057504, + "grad_norm": 3.6821033622079256, + "learning_rate": 9.022539281401601e-06, + "loss": 0.4112, + "step": 1614 + }, + { + "epoch": 0.226507713884993, + "grad_norm": 3.452672592147198, + "learning_rate": 9.021189875799027e-06, + "loss": 0.4322, + "step": 1615 + }, + { + "epoch": 0.22664796633941095, + "grad_norm": 2.746822775907403, + "learning_rate": 9.019839640452018e-06, + "loss": 0.3953, + "step": 1616 + }, + { + "epoch": 0.2267882187938289, + "grad_norm": 1.9746478913698404, + "learning_rate": 9.018488575639184e-06, + "loss": 0.4149, + "step": 1617 + }, + { + "epoch": 0.22692847124824683, + "grad_norm": 1.9926586715909664, + "learning_rate": 9.017136681639307e-06, + "loss": 0.3755, + "step": 1618 + }, + { + "epoch": 0.2270687237026648, + "grad_norm": 2.296329251802608, + "learning_rate": 9.01578395873134e-06, + "loss": 0.3892, + "step": 1619 + }, + { + "epoch": 0.22720897615708274, + "grad_norm": 1.9822364156630172, + "learning_rate": 9.014430407194413e-06, + "loss": 0.4096, + "step": 1620 + }, + { + "epoch": 0.2273492286115007, + "grad_norm": 2.296627732026712, + "learning_rate": 9.013076027307817e-06, + "loss": 0.4187, + "step": 1621 + }, + { + "epoch": 0.22748948106591865, + "grad_norm": 1.9803310248586163, + "learning_rate": 9.01172081935102e-06, + "loss": 0.3624, + "step": 1622 + }, + { + "epoch": 0.2276297335203366, + "grad_norm": 2.3947345591325178, + "learning_rate": 9.01036478360366e-06, + "loss": 0.4036, + "step": 1623 + }, + { + "epoch": 0.22776998597475456, + "grad_norm": 2.034718296132587, + "learning_rate": 9.009007920345547e-06, + "loss": 0.3788, + "step": 1624 + }, + { + "epoch": 0.22791023842917252, + "grad_norm": 2.2989757039967036, + "learning_rate": 9.007650229856658e-06, + "loss": 0.3868, + "step": 1625 + }, + { + "epoch": 0.22805049088359047, + "grad_norm": 1.897762790922765, + "learning_rate": 9.006291712417143e-06, + "loss": 0.4124, + "step": 1626 + }, + { + "epoch": 0.22819074333800843, + "grad_norm": 2.0790352799585916, + "learning_rate": 9.004932368307324e-06, + "loss": 0.3878, + "step": 1627 + }, + { + "epoch": 0.22833099579242636, + "grad_norm": 4.544013417412197, + "learning_rate": 9.00357219780769e-06, + "loss": 0.4067, + "step": 1628 + }, + { + "epoch": 0.2284712482468443, + "grad_norm": 2.85240733551978, + "learning_rate": 9.002211201198906e-06, + "loss": 0.349, + "step": 1629 + }, + { + "epoch": 0.22861150070126227, + "grad_norm": 3.3276788116949794, + "learning_rate": 9.000849378761802e-06, + "loss": 0.3851, + "step": 1630 + }, + { + "epoch": 0.22875175315568022, + "grad_norm": 1.88933222685364, + "learning_rate": 8.99948673077738e-06, + "loss": 0.419, + "step": 1631 + }, + { + "epoch": 0.22889200561009818, + "grad_norm": 2.0304971326757504, + "learning_rate": 8.998123257526814e-06, + "loss": 0.3542, + "step": 1632 + }, + { + "epoch": 0.22903225806451613, + "grad_norm": 2.4956064718711297, + "learning_rate": 8.996758959291447e-06, + "loss": 0.4525, + "step": 1633 + }, + { + "epoch": 0.22917251051893409, + "grad_norm": 2.8405366285823304, + "learning_rate": 8.995393836352793e-06, + "loss": 0.3752, + "step": 1634 + }, + { + "epoch": 0.22931276297335204, + "grad_norm": 2.260088133486008, + "learning_rate": 8.994027888992533e-06, + "loss": 0.4087, + "step": 1635 + }, + { + "epoch": 0.22945301542777, + "grad_norm": 1.9000110776477925, + "learning_rate": 8.992661117492526e-06, + "loss": 0.3813, + "step": 1636 + }, + { + "epoch": 0.22959326788218795, + "grad_norm": 2.5853449669875075, + "learning_rate": 8.991293522134789e-06, + "loss": 0.3683, + "step": 1637 + }, + { + "epoch": 0.22973352033660588, + "grad_norm": 2.003331747112609, + "learning_rate": 8.98992510320152e-06, + "loss": 0.3777, + "step": 1638 + }, + { + "epoch": 0.22987377279102383, + "grad_norm": 2.5589198712724035, + "learning_rate": 8.988555860975082e-06, + "loss": 0.4048, + "step": 1639 + }, + { + "epoch": 0.2300140252454418, + "grad_norm": 2.2356720474236806, + "learning_rate": 8.987185795738007e-06, + "loss": 0.4177, + "step": 1640 + }, + { + "epoch": 0.23015427769985974, + "grad_norm": 2.441388183526904, + "learning_rate": 8.985814907773004e-06, + "loss": 0.4235, + "step": 1641 + }, + { + "epoch": 0.2302945301542777, + "grad_norm": 2.151370068867226, + "learning_rate": 8.984443197362938e-06, + "loss": 0.3803, + "step": 1642 + }, + { + "epoch": 0.23043478260869565, + "grad_norm": 2.267550367239667, + "learning_rate": 8.983070664790856e-06, + "loss": 0.4109, + "step": 1643 + }, + { + "epoch": 0.2305750350631136, + "grad_norm": 2.1053224417219716, + "learning_rate": 8.981697310339972e-06, + "loss": 0.3954, + "step": 1644 + }, + { + "epoch": 0.23071528751753156, + "grad_norm": 1.7570527872179347, + "learning_rate": 8.980323134293664e-06, + "loss": 0.3774, + "step": 1645 + }, + { + "epoch": 0.23085553997194952, + "grad_norm": 2.434043291677052, + "learning_rate": 8.978948136935488e-06, + "loss": 0.381, + "step": 1646 + }, + { + "epoch": 0.23099579242636747, + "grad_norm": 1.9277178638991808, + "learning_rate": 8.977572318549164e-06, + "loss": 0.3794, + "step": 1647 + }, + { + "epoch": 0.2311360448807854, + "grad_norm": 2.8909639726819867, + "learning_rate": 8.97619567941858e-06, + "loss": 0.3999, + "step": 1648 + }, + { + "epoch": 0.23127629733520336, + "grad_norm": 1.8991705338741542, + "learning_rate": 8.974818219827796e-06, + "loss": 0.3801, + "step": 1649 + }, + { + "epoch": 0.2314165497896213, + "grad_norm": 2.3324845223357937, + "learning_rate": 8.973439940061044e-06, + "loss": 0.3954, + "step": 1650 + }, + { + "epoch": 0.23155680224403927, + "grad_norm": 2.116780146863398, + "learning_rate": 8.972060840402721e-06, + "loss": 0.3853, + "step": 1651 + }, + { + "epoch": 0.23169705469845722, + "grad_norm": 2.162627036712677, + "learning_rate": 8.970680921137396e-06, + "loss": 0.387, + "step": 1652 + }, + { + "epoch": 0.23183730715287518, + "grad_norm": 1.992855244346062, + "learning_rate": 8.969300182549802e-06, + "loss": 0.3881, + "step": 1653 + }, + { + "epoch": 0.23197755960729313, + "grad_norm": 1.900349850819968, + "learning_rate": 8.967918624924849e-06, + "loss": 0.4256, + "step": 1654 + }, + { + "epoch": 0.2321178120617111, + "grad_norm": 1.9903624907971762, + "learning_rate": 8.966536248547608e-06, + "loss": 0.4404, + "step": 1655 + }, + { + "epoch": 0.23225806451612904, + "grad_norm": 1.8558296575397093, + "learning_rate": 8.965153053703325e-06, + "loss": 0.4001, + "step": 1656 + }, + { + "epoch": 0.232398316970547, + "grad_norm": 1.9107665040809345, + "learning_rate": 8.963769040677413e-06, + "loss": 0.4166, + "step": 1657 + }, + { + "epoch": 0.23253856942496492, + "grad_norm": 2.299496562426827, + "learning_rate": 8.962384209755453e-06, + "loss": 0.4118, + "step": 1658 + }, + { + "epoch": 0.23267882187938288, + "grad_norm": 2.2810366805161437, + "learning_rate": 8.960998561223193e-06, + "loss": 0.3915, + "step": 1659 + }, + { + "epoch": 0.23281907433380084, + "grad_norm": 2.0407852553902295, + "learning_rate": 8.959612095366556e-06, + "loss": 0.3591, + "step": 1660 + }, + { + "epoch": 0.2329593267882188, + "grad_norm": 2.159094109601291, + "learning_rate": 8.958224812471625e-06, + "loss": 0.3823, + "step": 1661 + }, + { + "epoch": 0.23309957924263675, + "grad_norm": 2.0192802465057254, + "learning_rate": 8.95683671282466e-06, + "loss": 0.4396, + "step": 1662 + }, + { + "epoch": 0.2332398316970547, + "grad_norm": 2.8927017587310693, + "learning_rate": 8.955447796712083e-06, + "loss": 0.3687, + "step": 1663 + }, + { + "epoch": 0.23338008415147266, + "grad_norm": 2.728144582916274, + "learning_rate": 8.954058064420487e-06, + "loss": 0.4356, + "step": 1664 + }, + { + "epoch": 0.2335203366058906, + "grad_norm": 1.8896194179780765, + "learning_rate": 8.952667516236635e-06, + "loss": 0.3628, + "step": 1665 + }, + { + "epoch": 0.23366058906030857, + "grad_norm": 2.0293034543324526, + "learning_rate": 8.951276152447458e-06, + "loss": 0.41, + "step": 1666 + }, + { + "epoch": 0.23380084151472652, + "grad_norm": 2.2500371929909355, + "learning_rate": 8.949883973340051e-06, + "loss": 0.3951, + "step": 1667 + }, + { + "epoch": 0.23394109396914445, + "grad_norm": 2.3940381387144067, + "learning_rate": 8.948490979201683e-06, + "loss": 0.4154, + "step": 1668 + }, + { + "epoch": 0.2340813464235624, + "grad_norm": 1.7812550193314356, + "learning_rate": 8.947097170319789e-06, + "loss": 0.4004, + "step": 1669 + }, + { + "epoch": 0.23422159887798036, + "grad_norm": 2.1472347707684687, + "learning_rate": 8.94570254698197e-06, + "loss": 0.4362, + "step": 1670 + }, + { + "epoch": 0.2343618513323983, + "grad_norm": 2.2844318513279873, + "learning_rate": 8.944307109475996e-06, + "loss": 0.3723, + "step": 1671 + }, + { + "epoch": 0.23450210378681627, + "grad_norm": 3.0805714755788256, + "learning_rate": 8.942910858089806e-06, + "loss": 0.4107, + "step": 1672 + }, + { + "epoch": 0.23464235624123422, + "grad_norm": 1.9922744432812676, + "learning_rate": 8.94151379311151e-06, + "loss": 0.3842, + "step": 1673 + }, + { + "epoch": 0.23478260869565218, + "grad_norm": 3.341890192206006, + "learning_rate": 8.940115914829382e-06, + "loss": 0.4123, + "step": 1674 + }, + { + "epoch": 0.23492286115007013, + "grad_norm": 3.807837715425462, + "learning_rate": 8.93871722353186e-06, + "loss": 0.4424, + "step": 1675 + }, + { + "epoch": 0.2350631136044881, + "grad_norm": 2.428124713283998, + "learning_rate": 8.937317719507556e-06, + "loss": 0.3734, + "step": 1676 + }, + { + "epoch": 0.23520336605890604, + "grad_norm": 1.8818099649138518, + "learning_rate": 8.935917403045251e-06, + "loss": 0.4008, + "step": 1677 + }, + { + "epoch": 0.23534361851332397, + "grad_norm": 1.8918695058197808, + "learning_rate": 8.934516274433889e-06, + "loss": 0.4031, + "step": 1678 + }, + { + "epoch": 0.23548387096774193, + "grad_norm": 1.8727454936470198, + "learning_rate": 8.93311433396258e-06, + "loss": 0.3772, + "step": 1679 + }, + { + "epoch": 0.23562412342215988, + "grad_norm": 1.8489457322528533, + "learning_rate": 8.93171158192061e-06, + "loss": 0.3969, + "step": 1680 + }, + { + "epoch": 0.23576437587657784, + "grad_norm": 1.782463329735169, + "learning_rate": 8.930308018597422e-06, + "loss": 0.3341, + "step": 1681 + }, + { + "epoch": 0.2359046283309958, + "grad_norm": 2.447395680496483, + "learning_rate": 8.928903644282635e-06, + "loss": 0.4359, + "step": 1682 + }, + { + "epoch": 0.23604488078541375, + "grad_norm": 1.724058035035235, + "learning_rate": 8.92749845926603e-06, + "loss": 0.4187, + "step": 1683 + }, + { + "epoch": 0.2361851332398317, + "grad_norm": 2.5647117559867008, + "learning_rate": 8.926092463837557e-06, + "loss": 0.3799, + "step": 1684 + }, + { + "epoch": 0.23632538569424966, + "grad_norm": 1.9698294450542004, + "learning_rate": 8.924685658287334e-06, + "loss": 0.3827, + "step": 1685 + }, + { + "epoch": 0.2364656381486676, + "grad_norm": 2.074864748487916, + "learning_rate": 8.923278042905647e-06, + "loss": 0.4297, + "step": 1686 + }, + { + "epoch": 0.23660589060308557, + "grad_norm": 1.9723837484857996, + "learning_rate": 8.921869617982945e-06, + "loss": 0.4084, + "step": 1687 + }, + { + "epoch": 0.2367461430575035, + "grad_norm": 2.181385103867383, + "learning_rate": 8.920460383809847e-06, + "loss": 0.4329, + "step": 1688 + }, + { + "epoch": 0.23688639551192145, + "grad_norm": 1.458910836813871, + "learning_rate": 8.91905034067714e-06, + "loss": 0.3816, + "step": 1689 + }, + { + "epoch": 0.2370266479663394, + "grad_norm": 2.040082303799142, + "learning_rate": 8.917639488875776e-06, + "loss": 0.3957, + "step": 1690 + }, + { + "epoch": 0.23716690042075736, + "grad_norm": 2.0712335804833493, + "learning_rate": 8.916227828696873e-06, + "loss": 0.369, + "step": 1691 + }, + { + "epoch": 0.23730715287517531, + "grad_norm": 2.2525521744624557, + "learning_rate": 8.91481536043172e-06, + "loss": 0.4085, + "step": 1692 + }, + { + "epoch": 0.23744740532959327, + "grad_norm": 2.846440143223222, + "learning_rate": 8.913402084371767e-06, + "loss": 0.3944, + "step": 1693 + }, + { + "epoch": 0.23758765778401122, + "grad_norm": 2.1343981539139207, + "learning_rate": 8.911988000808636e-06, + "loss": 0.4309, + "step": 1694 + }, + { + "epoch": 0.23772791023842918, + "grad_norm": 1.7394265505861504, + "learning_rate": 8.910573110034113e-06, + "loss": 0.3643, + "step": 1695 + }, + { + "epoch": 0.23786816269284713, + "grad_norm": 1.9952329209006225, + "learning_rate": 8.90915741234015e-06, + "loss": 0.3924, + "step": 1696 + }, + { + "epoch": 0.2380084151472651, + "grad_norm": 3.0299040962376123, + "learning_rate": 8.907740908018866e-06, + "loss": 0.3975, + "step": 1697 + }, + { + "epoch": 0.23814866760168302, + "grad_norm": 1.9531878041660242, + "learning_rate": 8.906323597362547e-06, + "loss": 0.4184, + "step": 1698 + }, + { + "epoch": 0.23828892005610097, + "grad_norm": 2.2203150857243257, + "learning_rate": 8.904905480663646e-06, + "loss": 0.4144, + "step": 1699 + }, + { + "epoch": 0.23842917251051893, + "grad_norm": 2.222344294480045, + "learning_rate": 8.90348655821478e-06, + "loss": 0.378, + "step": 1700 + }, + { + "epoch": 0.23856942496493688, + "grad_norm": 2.1053773651390606, + "learning_rate": 8.902066830308735e-06, + "loss": 0.4399, + "step": 1701 + }, + { + "epoch": 0.23870967741935484, + "grad_norm": 3.228736865245466, + "learning_rate": 8.900646297238462e-06, + "loss": 0.3865, + "step": 1702 + }, + { + "epoch": 0.2388499298737728, + "grad_norm": 2.168370774943011, + "learning_rate": 8.899224959297078e-06, + "loss": 0.404, + "step": 1703 + }, + { + "epoch": 0.23899018232819075, + "grad_norm": 4.8979443266863845, + "learning_rate": 8.897802816777866e-06, + "loss": 0.4161, + "step": 1704 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 2.6500597461227207, + "learning_rate": 8.896379869974273e-06, + "loss": 0.4087, + "step": 1705 + }, + { + "epoch": 0.23927068723702666, + "grad_norm": 3.057828649167323, + "learning_rate": 8.894956119179918e-06, + "loss": 0.4235, + "step": 1706 + }, + { + "epoch": 0.2394109396914446, + "grad_norm": 2.5599749149643207, + "learning_rate": 8.89353156468858e-06, + "loss": 0.387, + "step": 1707 + }, + { + "epoch": 0.23955119214586254, + "grad_norm": 2.6350365365841326, + "learning_rate": 8.892106206794204e-06, + "loss": 0.3877, + "step": 1708 + }, + { + "epoch": 0.2396914446002805, + "grad_norm": 3.1900285153923176, + "learning_rate": 8.890680045790907e-06, + "loss": 0.4292, + "step": 1709 + }, + { + "epoch": 0.23983169705469845, + "grad_norm": 2.82073413518562, + "learning_rate": 8.889253081972963e-06, + "loss": 0.4004, + "step": 1710 + }, + { + "epoch": 0.2399719495091164, + "grad_norm": 1.8333597470316982, + "learning_rate": 8.88782531563482e-06, + "loss": 0.3766, + "step": 1711 + }, + { + "epoch": 0.24011220196353436, + "grad_norm": 2.232362067221606, + "learning_rate": 8.886396747071085e-06, + "loss": 0.4563, + "step": 1712 + }, + { + "epoch": 0.24025245441795232, + "grad_norm": 2.48197667701577, + "learning_rate": 8.884967376576534e-06, + "loss": 0.4349, + "step": 1713 + }, + { + "epoch": 0.24039270687237027, + "grad_norm": 2.744797988295589, + "learning_rate": 8.883537204446105e-06, + "loss": 0.4015, + "step": 1714 + }, + { + "epoch": 0.24053295932678823, + "grad_norm": 2.4051826388667767, + "learning_rate": 8.88210623097491e-06, + "loss": 0.3699, + "step": 1715 + }, + { + "epoch": 0.24067321178120618, + "grad_norm": 2.164231166394999, + "learning_rate": 8.880674456458214e-06, + "loss": 0.4416, + "step": 1716 + }, + { + "epoch": 0.24081346423562414, + "grad_norm": 2.5652527328817194, + "learning_rate": 8.879241881191458e-06, + "loss": 0.368, + "step": 1717 + }, + { + "epoch": 0.24095371669004206, + "grad_norm": 1.7471053843339004, + "learning_rate": 8.877808505470242e-06, + "loss": 0.3543, + "step": 1718 + }, + { + "epoch": 0.24109396914446002, + "grad_norm": 2.8804187295417862, + "learning_rate": 8.876374329590331e-06, + "loss": 0.3405, + "step": 1719 + }, + { + "epoch": 0.24123422159887797, + "grad_norm": 2.2787610415443846, + "learning_rate": 8.874939353847662e-06, + "loss": 0.3809, + "step": 1720 + }, + { + "epoch": 0.24137447405329593, + "grad_norm": 3.084608192479369, + "learning_rate": 8.87350357853833e-06, + "loss": 0.4129, + "step": 1721 + }, + { + "epoch": 0.24151472650771388, + "grad_norm": 5.009866706765327, + "learning_rate": 8.872067003958597e-06, + "loss": 0.4014, + "step": 1722 + }, + { + "epoch": 0.24165497896213184, + "grad_norm": 1.772390518681213, + "learning_rate": 8.87062963040489e-06, + "loss": 0.4027, + "step": 1723 + }, + { + "epoch": 0.2417952314165498, + "grad_norm": 2.332087377131572, + "learning_rate": 8.869191458173801e-06, + "loss": 0.3496, + "step": 1724 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 2.1971708963955283, + "learning_rate": 8.867752487562087e-06, + "loss": 0.3459, + "step": 1725 + }, + { + "epoch": 0.2420757363253857, + "grad_norm": 3.087015702415595, + "learning_rate": 8.866312718866669e-06, + "loss": 0.3949, + "step": 1726 + }, + { + "epoch": 0.24221598877980366, + "grad_norm": 2.1088482234309263, + "learning_rate": 8.864872152384635e-06, + "loss": 0.4081, + "step": 1727 + }, + { + "epoch": 0.2423562412342216, + "grad_norm": 2.2054834390354463, + "learning_rate": 8.863430788413232e-06, + "loss": 0.3819, + "step": 1728 + }, + { + "epoch": 0.24249649368863954, + "grad_norm": 3.2512741525611624, + "learning_rate": 8.86198862724988e-06, + "loss": 0.3699, + "step": 1729 + }, + { + "epoch": 0.2426367461430575, + "grad_norm": 3.234062142836094, + "learning_rate": 8.860545669192155e-06, + "loss": 0.3883, + "step": 1730 + }, + { + "epoch": 0.24277699859747545, + "grad_norm": 2.039208885579974, + "learning_rate": 8.859101914537804e-06, + "loss": 0.3905, + "step": 1731 + }, + { + "epoch": 0.2429172510518934, + "grad_norm": 2.293178672214395, + "learning_rate": 8.857657363584736e-06, + "loss": 0.3662, + "step": 1732 + }, + { + "epoch": 0.24305750350631136, + "grad_norm": 2.0828516848427414, + "learning_rate": 8.85621201663102e-06, + "loss": 0.3828, + "step": 1733 + }, + { + "epoch": 0.24319775596072932, + "grad_norm": 2.207938704828507, + "learning_rate": 8.854765873974898e-06, + "loss": 0.4198, + "step": 1734 + }, + { + "epoch": 0.24333800841514727, + "grad_norm": 2.9174165215925933, + "learning_rate": 8.85331893591477e-06, + "loss": 0.4653, + "step": 1735 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 2.2664337819270406, + "learning_rate": 8.851871202749201e-06, + "loss": 0.3783, + "step": 1736 + }, + { + "epoch": 0.24361851332398318, + "grad_norm": 2.146807353857163, + "learning_rate": 8.850422674776918e-06, + "loss": 0.3834, + "step": 1737 + }, + { + "epoch": 0.2437587657784011, + "grad_norm": 3.4281839537789196, + "learning_rate": 8.84897335229682e-06, + "loss": 0.3939, + "step": 1738 + }, + { + "epoch": 0.24389901823281906, + "grad_norm": 2.4417787801718664, + "learning_rate": 8.84752323560796e-06, + "loss": 0.4119, + "step": 1739 + }, + { + "epoch": 0.24403927068723702, + "grad_norm": 2.0966569448871355, + "learning_rate": 8.846072325009562e-06, + "loss": 0.3511, + "step": 1740 + }, + { + "epoch": 0.24417952314165497, + "grad_norm": 2.258088197342238, + "learning_rate": 8.84462062080101e-06, + "loss": 0.4303, + "step": 1741 + }, + { + "epoch": 0.24431977559607293, + "grad_norm": 2.5054597364976625, + "learning_rate": 8.843168123281855e-06, + "loss": 0.3977, + "step": 1742 + }, + { + "epoch": 0.24446002805049089, + "grad_norm": 3.92578222170386, + "learning_rate": 8.841714832751806e-06, + "loss": 0.4287, + "step": 1743 + }, + { + "epoch": 0.24460028050490884, + "grad_norm": 2.6346756769185817, + "learning_rate": 8.840260749510744e-06, + "loss": 0.4227, + "step": 1744 + }, + { + "epoch": 0.2447405329593268, + "grad_norm": 2.7527575971896794, + "learning_rate": 8.838805873858704e-06, + "loss": 0.4165, + "step": 1745 + }, + { + "epoch": 0.24488078541374475, + "grad_norm": 2.2262771155854875, + "learning_rate": 8.837350206095894e-06, + "loss": 0.3821, + "step": 1746 + }, + { + "epoch": 0.2450210378681627, + "grad_norm": 2.6176660811848116, + "learning_rate": 8.83589374652268e-06, + "loss": 0.4032, + "step": 1747 + }, + { + "epoch": 0.24516129032258063, + "grad_norm": 2.314385959600325, + "learning_rate": 8.834436495439588e-06, + "loss": 0.4427, + "step": 1748 + }, + { + "epoch": 0.2453015427769986, + "grad_norm": 2.089073195961481, + "learning_rate": 8.832978453147316e-06, + "loss": 0.3704, + "step": 1749 + }, + { + "epoch": 0.24544179523141654, + "grad_norm": 3.0954830055318703, + "learning_rate": 8.83151961994672e-06, + "loss": 0.4065, + "step": 1750 + }, + { + "epoch": 0.2455820476858345, + "grad_norm": 3.7937963743690544, + "learning_rate": 8.830059996138818e-06, + "loss": 0.388, + "step": 1751 + }, + { + "epoch": 0.24572230014025245, + "grad_norm": 2.2588277995692123, + "learning_rate": 8.828599582024794e-06, + "loss": 0.4209, + "step": 1752 + }, + { + "epoch": 0.2458625525946704, + "grad_norm": 2.246356981857827, + "learning_rate": 8.827138377905999e-06, + "loss": 0.413, + "step": 1753 + }, + { + "epoch": 0.24600280504908836, + "grad_norm": 2.2246019800346253, + "learning_rate": 8.825676384083936e-06, + "loss": 0.3786, + "step": 1754 + }, + { + "epoch": 0.24614305750350632, + "grad_norm": 2.3258634502119815, + "learning_rate": 8.824213600860278e-06, + "loss": 0.4072, + "step": 1755 + }, + { + "epoch": 0.24628330995792427, + "grad_norm": 2.453444441874324, + "learning_rate": 8.822750028536863e-06, + "loss": 0.4087, + "step": 1756 + }, + { + "epoch": 0.24642356241234223, + "grad_norm": 2.850109325788278, + "learning_rate": 8.821285667415688e-06, + "loss": 0.4049, + "step": 1757 + }, + { + "epoch": 0.24656381486676016, + "grad_norm": 5.25191844220837, + "learning_rate": 8.819820517798911e-06, + "loss": 0.3922, + "step": 1758 + }, + { + "epoch": 0.2467040673211781, + "grad_norm": 2.8776234805924714, + "learning_rate": 8.81835457998886e-06, + "loss": 0.3401, + "step": 1759 + }, + { + "epoch": 0.24684431977559607, + "grad_norm": 2.72167408110743, + "learning_rate": 8.816887854288018e-06, + "loss": 0.3746, + "step": 1760 + }, + { + "epoch": 0.24698457223001402, + "grad_norm": 2.03635211742382, + "learning_rate": 8.815420340999034e-06, + "loss": 0.3548, + "step": 1761 + }, + { + "epoch": 0.24712482468443198, + "grad_norm": 2.228948017936398, + "learning_rate": 8.813952040424718e-06, + "loss": 0.3908, + "step": 1762 + }, + { + "epoch": 0.24726507713884993, + "grad_norm": 2.272067361739735, + "learning_rate": 8.812482952868047e-06, + "loss": 0.3677, + "step": 1763 + }, + { + "epoch": 0.2474053295932679, + "grad_norm": 2.5296451520290995, + "learning_rate": 8.811013078632154e-06, + "loss": 0.3995, + "step": 1764 + }, + { + "epoch": 0.24754558204768584, + "grad_norm": 3.1448746961609175, + "learning_rate": 8.809542418020335e-06, + "loss": 0.4081, + "step": 1765 + }, + { + "epoch": 0.2476858345021038, + "grad_norm": 1.991928323080467, + "learning_rate": 8.808070971336058e-06, + "loss": 0.4135, + "step": 1766 + }, + { + "epoch": 0.24782608695652175, + "grad_norm": 2.233814729458205, + "learning_rate": 8.80659873888294e-06, + "loss": 0.3954, + "step": 1767 + }, + { + "epoch": 0.24796633941093968, + "grad_norm": 2.0783321879653203, + "learning_rate": 8.805125720964766e-06, + "loss": 0.4052, + "step": 1768 + }, + { + "epoch": 0.24810659186535763, + "grad_norm": 2.197283528572893, + "learning_rate": 8.803651917885486e-06, + "loss": 0.4264, + "step": 1769 + }, + { + "epoch": 0.2482468443197756, + "grad_norm": 2.3898712166601066, + "learning_rate": 8.802177329949205e-06, + "loss": 0.3857, + "step": 1770 + }, + { + "epoch": 0.24838709677419354, + "grad_norm": 2.399110581578432, + "learning_rate": 8.800701957460199e-06, + "loss": 0.4092, + "step": 1771 + }, + { + "epoch": 0.2485273492286115, + "grad_norm": 1.976417144134966, + "learning_rate": 8.799225800722895e-06, + "loss": 0.4004, + "step": 1772 + }, + { + "epoch": 0.24866760168302945, + "grad_norm": 3.164961623551263, + "learning_rate": 8.797748860041891e-06, + "loss": 0.4035, + "step": 1773 + }, + { + "epoch": 0.2488078541374474, + "grad_norm": 2.3482842898566494, + "learning_rate": 8.796271135721944e-06, + "loss": 0.3948, + "step": 1774 + }, + { + "epoch": 0.24894810659186536, + "grad_norm": 1.6693538301954394, + "learning_rate": 8.79479262806797e-06, + "loss": 0.4039, + "step": 1775 + }, + { + "epoch": 0.24908835904628332, + "grad_norm": 2.7814138985678643, + "learning_rate": 8.79331333738505e-06, + "loss": 0.4242, + "step": 1776 + }, + { + "epoch": 0.24922861150070127, + "grad_norm": 2.2315645482366797, + "learning_rate": 8.791833263978426e-06, + "loss": 0.3643, + "step": 1777 + }, + { + "epoch": 0.2493688639551192, + "grad_norm": 2.0523285178732666, + "learning_rate": 8.7903524081535e-06, + "loss": 0.403, + "step": 1778 + }, + { + "epoch": 0.24950911640953716, + "grad_norm": 2.436298759054723, + "learning_rate": 8.788870770215835e-06, + "loss": 0.4465, + "step": 1779 + }, + { + "epoch": 0.2496493688639551, + "grad_norm": 2.5515507564052506, + "learning_rate": 8.787388350471158e-06, + "loss": 0.3948, + "step": 1780 + }, + { + "epoch": 0.24978962131837307, + "grad_norm": 2.25264055226364, + "learning_rate": 8.785905149225356e-06, + "loss": 0.4201, + "step": 1781 + }, + { + "epoch": 0.24992987377279102, + "grad_norm": 1.8616700954058476, + "learning_rate": 8.784421166784476e-06, + "loss": 0.4022, + "step": 1782 + }, + { + "epoch": 0.250070126227209, + "grad_norm": 3.0806936013153834, + "learning_rate": 8.782936403454729e-06, + "loss": 0.4205, + "step": 1783 + }, + { + "epoch": 0.25021037868162693, + "grad_norm": 2.1931853390924596, + "learning_rate": 8.781450859542484e-06, + "loss": 0.3928, + "step": 1784 + }, + { + "epoch": 0.2503506311360449, + "grad_norm": 2.413569131452635, + "learning_rate": 8.779964535354274e-06, + "loss": 0.374, + "step": 1785 + }, + { + "epoch": 0.25049088359046284, + "grad_norm": 2.478009977232865, + "learning_rate": 8.778477431196792e-06, + "loss": 0.4174, + "step": 1786 + }, + { + "epoch": 0.2506311360448808, + "grad_norm": 1.8967975707488671, + "learning_rate": 8.77698954737689e-06, + "loss": 0.3483, + "step": 1787 + }, + { + "epoch": 0.25077138849929875, + "grad_norm": 1.7110772620124224, + "learning_rate": 8.775500884201582e-06, + "loss": 0.3989, + "step": 1788 + }, + { + "epoch": 0.2509116409537167, + "grad_norm": 2.1189182991094255, + "learning_rate": 8.774011441978046e-06, + "loss": 0.4062, + "step": 1789 + }, + { + "epoch": 0.25105189340813466, + "grad_norm": 2.0172415468287266, + "learning_rate": 8.772521221013615e-06, + "loss": 0.4258, + "step": 1790 + }, + { + "epoch": 0.2511921458625526, + "grad_norm": 2.2282825986706833, + "learning_rate": 8.771030221615786e-06, + "loss": 0.4526, + "step": 1791 + }, + { + "epoch": 0.2513323983169706, + "grad_norm": 2.1082367262990678, + "learning_rate": 8.769538444092219e-06, + "loss": 0.4004, + "step": 1792 + }, + { + "epoch": 0.2514726507713885, + "grad_norm": 2.023116627588351, + "learning_rate": 8.768045888750729e-06, + "loss": 0.3842, + "step": 1793 + }, + { + "epoch": 0.25161290322580643, + "grad_norm": 2.230766423499367, + "learning_rate": 8.766552555899297e-06, + "loss": 0.4112, + "step": 1794 + }, + { + "epoch": 0.2517531556802244, + "grad_norm": 12.372882700420277, + "learning_rate": 8.76505844584606e-06, + "loss": 0.385, + "step": 1795 + }, + { + "epoch": 0.25189340813464234, + "grad_norm": 2.230432299635121, + "learning_rate": 8.763563558899317e-06, + "loss": 0.3769, + "step": 1796 + }, + { + "epoch": 0.2520336605890603, + "grad_norm": 2.3401758025226753, + "learning_rate": 8.762067895367527e-06, + "loss": 0.4169, + "step": 1797 + }, + { + "epoch": 0.25217391304347825, + "grad_norm": 2.3017206265889416, + "learning_rate": 8.760571455559313e-06, + "loss": 0.3613, + "step": 1798 + }, + { + "epoch": 0.2523141654978962, + "grad_norm": 1.8519542939496811, + "learning_rate": 8.759074239783451e-06, + "loss": 0.3539, + "step": 1799 + }, + { + "epoch": 0.25245441795231416, + "grad_norm": 1.795128811435546, + "learning_rate": 8.757576248348883e-06, + "loss": 0.4556, + "step": 1800 + }, + { + "epoch": 0.2525946704067321, + "grad_norm": 2.5662300130639015, + "learning_rate": 8.756077481564708e-06, + "loss": 0.3916, + "step": 1801 + }, + { + "epoch": 0.25273492286115007, + "grad_norm": 2.0147116314161866, + "learning_rate": 8.754577939740188e-06, + "loss": 0.4047, + "step": 1802 + }, + { + "epoch": 0.252875175315568, + "grad_norm": 2.114643553328905, + "learning_rate": 8.75307762318474e-06, + "loss": 0.4217, + "step": 1803 + }, + { + "epoch": 0.253015427769986, + "grad_norm": 2.1305930638178814, + "learning_rate": 8.751576532207947e-06, + "loss": 0.3663, + "step": 1804 + }, + { + "epoch": 0.25315568022440393, + "grad_norm": 2.031597870602574, + "learning_rate": 8.750074667119546e-06, + "loss": 0.4082, + "step": 1805 + }, + { + "epoch": 0.2532959326788219, + "grad_norm": 2.1795838033969406, + "learning_rate": 8.748572028229438e-06, + "loss": 0.3633, + "step": 1806 + }, + { + "epoch": 0.25343618513323984, + "grad_norm": 1.9731163168260866, + "learning_rate": 8.747068615847683e-06, + "loss": 0.3441, + "step": 1807 + }, + { + "epoch": 0.2535764375876578, + "grad_norm": 2.18130979701933, + "learning_rate": 8.745564430284495e-06, + "loss": 0.434, + "step": 1808 + }, + { + "epoch": 0.25371669004207575, + "grad_norm": 2.289373799791576, + "learning_rate": 8.744059471850258e-06, + "loss": 0.416, + "step": 1809 + }, + { + "epoch": 0.2538569424964937, + "grad_norm": 2.9871894387706694, + "learning_rate": 8.742553740855507e-06, + "loss": 0.4362, + "step": 1810 + }, + { + "epoch": 0.25399719495091166, + "grad_norm": 2.4547691909532494, + "learning_rate": 8.741047237610938e-06, + "loss": 0.4176, + "step": 1811 + }, + { + "epoch": 0.2541374474053296, + "grad_norm": 2.42179840951222, + "learning_rate": 8.739539962427408e-06, + "loss": 0.4296, + "step": 1812 + }, + { + "epoch": 0.2542776998597475, + "grad_norm": 2.4302580430339495, + "learning_rate": 8.738031915615934e-06, + "loss": 0.3877, + "step": 1813 + }, + { + "epoch": 0.2544179523141655, + "grad_norm": 2.459516423774877, + "learning_rate": 8.736523097487693e-06, + "loss": 0.358, + "step": 1814 + }, + { + "epoch": 0.25455820476858343, + "grad_norm": 1.7997585214126197, + "learning_rate": 8.735013508354012e-06, + "loss": 0.3501, + "step": 1815 + }, + { + "epoch": 0.2546984572230014, + "grad_norm": 2.916415085614378, + "learning_rate": 8.73350314852639e-06, + "loss": 0.4427, + "step": 1816 + }, + { + "epoch": 0.25483870967741934, + "grad_norm": 2.031047400127565, + "learning_rate": 8.731992018316478e-06, + "loss": 0.3813, + "step": 1817 + }, + { + "epoch": 0.2549789621318373, + "grad_norm": 1.7100093187669403, + "learning_rate": 8.730480118036087e-06, + "loss": 0.3838, + "step": 1818 + }, + { + "epoch": 0.25511921458625525, + "grad_norm": 2.324098112678016, + "learning_rate": 8.728967447997185e-06, + "loss": 0.3654, + "step": 1819 + }, + { + "epoch": 0.2552594670406732, + "grad_norm": 1.8097878256938136, + "learning_rate": 8.727454008511905e-06, + "loss": 0.3925, + "step": 1820 + }, + { + "epoch": 0.25539971949509116, + "grad_norm": 2.521438799149979, + "learning_rate": 8.72593979989253e-06, + "loss": 0.3573, + "step": 1821 + }, + { + "epoch": 0.2555399719495091, + "grad_norm": 1.9643452734401718, + "learning_rate": 8.724424822451512e-06, + "loss": 0.3934, + "step": 1822 + }, + { + "epoch": 0.25568022440392707, + "grad_norm": 2.1646705626999068, + "learning_rate": 8.722909076501451e-06, + "loss": 0.3542, + "step": 1823 + }, + { + "epoch": 0.255820476858345, + "grad_norm": 2.1582689646348654, + "learning_rate": 8.721392562355113e-06, + "loss": 0.3713, + "step": 1824 + }, + { + "epoch": 0.255960729312763, + "grad_norm": 2.401757840295143, + "learning_rate": 8.719875280325418e-06, + "loss": 0.4492, + "step": 1825 + }, + { + "epoch": 0.25610098176718094, + "grad_norm": 2.8804397536024426, + "learning_rate": 8.71835723072545e-06, + "loss": 0.3875, + "step": 1826 + }, + { + "epoch": 0.2562412342215989, + "grad_norm": 2.343629249005314, + "learning_rate": 8.716838413868445e-06, + "loss": 0.3836, + "step": 1827 + }, + { + "epoch": 0.25638148667601685, + "grad_norm": 2.16886022985698, + "learning_rate": 8.715318830067801e-06, + "loss": 0.4179, + "step": 1828 + }, + { + "epoch": 0.2565217391304348, + "grad_norm": 2.2775460050356195, + "learning_rate": 8.713798479637073e-06, + "loss": 0.4342, + "step": 1829 + }, + { + "epoch": 0.25666199158485276, + "grad_norm": 2.115526737427353, + "learning_rate": 8.712277362889975e-06, + "loss": 0.4431, + "step": 1830 + }, + { + "epoch": 0.2568022440392707, + "grad_norm": 2.5964463707644923, + "learning_rate": 8.71075548014038e-06, + "loss": 0.4239, + "step": 1831 + }, + { + "epoch": 0.25694249649368867, + "grad_norm": 2.096060431774018, + "learning_rate": 8.709232831702319e-06, + "loss": 0.381, + "step": 1832 + }, + { + "epoch": 0.25708274894810657, + "grad_norm": 2.1111865141715045, + "learning_rate": 8.707709417889975e-06, + "loss": 0.3954, + "step": 1833 + }, + { + "epoch": 0.2572230014025245, + "grad_norm": 1.9854325007327296, + "learning_rate": 8.706185239017699e-06, + "loss": 0.3876, + "step": 1834 + }, + { + "epoch": 0.2573632538569425, + "grad_norm": 2.8123602938391947, + "learning_rate": 8.704660295399991e-06, + "loss": 0.4228, + "step": 1835 + }, + { + "epoch": 0.25750350631136043, + "grad_norm": 1.9994820877879265, + "learning_rate": 8.703134587351514e-06, + "loss": 0.3928, + "step": 1836 + }, + { + "epoch": 0.2576437587657784, + "grad_norm": 1.5917584560883244, + "learning_rate": 8.701608115187087e-06, + "loss": 0.3915, + "step": 1837 + }, + { + "epoch": 0.25778401122019634, + "grad_norm": 1.7824601864213876, + "learning_rate": 8.700080879221689e-06, + "loss": 0.3607, + "step": 1838 + }, + { + "epoch": 0.2579242636746143, + "grad_norm": 1.7969392847918233, + "learning_rate": 8.69855287977045e-06, + "loss": 0.397, + "step": 1839 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 3.2935680418767985, + "learning_rate": 8.697024117148665e-06, + "loss": 0.4094, + "step": 1840 + }, + { + "epoch": 0.2582047685834502, + "grad_norm": 2.3393088415535592, + "learning_rate": 8.695494591671782e-06, + "loss": 0.4297, + "step": 1841 + }, + { + "epoch": 0.25834502103786816, + "grad_norm": 2.1157633929157216, + "learning_rate": 8.69396430365541e-06, + "loss": 0.3899, + "step": 1842 + }, + { + "epoch": 0.2584852734922861, + "grad_norm": 1.9184366498424226, + "learning_rate": 8.69243325341531e-06, + "loss": 0.3735, + "step": 1843 + }, + { + "epoch": 0.25862552594670407, + "grad_norm": 2.456632888109686, + "learning_rate": 8.690901441267409e-06, + "loss": 0.3914, + "step": 1844 + }, + { + "epoch": 0.258765778401122, + "grad_norm": 1.7217606965219199, + "learning_rate": 8.689368867527781e-06, + "loss": 0.4023, + "step": 1845 + }, + { + "epoch": 0.25890603085554, + "grad_norm": 1.9828561692355213, + "learning_rate": 8.687835532512662e-06, + "loss": 0.3669, + "step": 1846 + }, + { + "epoch": 0.25904628330995794, + "grad_norm": 2.5495032107057494, + "learning_rate": 8.686301436538446e-06, + "loss": 0.3849, + "step": 1847 + }, + { + "epoch": 0.2591865357643759, + "grad_norm": 1.8525053432454466, + "learning_rate": 8.684766579921684e-06, + "loss": 0.387, + "step": 1848 + }, + { + "epoch": 0.25932678821879385, + "grad_norm": 1.740659171681587, + "learning_rate": 8.683230962979082e-06, + "loss": 0.3819, + "step": 1849 + }, + { + "epoch": 0.2594670406732118, + "grad_norm": 2.3032506569443543, + "learning_rate": 8.681694586027506e-06, + "loss": 0.3923, + "step": 1850 + }, + { + "epoch": 0.25960729312762976, + "grad_norm": 1.754521659154257, + "learning_rate": 8.68015744938397e-06, + "loss": 0.3646, + "step": 1851 + }, + { + "epoch": 0.2597475455820477, + "grad_norm": 2.0436054936921844, + "learning_rate": 8.67861955336566e-06, + "loss": 0.3578, + "step": 1852 + }, + { + "epoch": 0.2598877980364656, + "grad_norm": 1.640286074552872, + "learning_rate": 8.677080898289903e-06, + "loss": 0.3667, + "step": 1853 + }, + { + "epoch": 0.26002805049088357, + "grad_norm": 2.330152000413651, + "learning_rate": 8.675541484474195e-06, + "loss": 0.4138, + "step": 1854 + }, + { + "epoch": 0.2601683029453015, + "grad_norm": 2.5643937161399717, + "learning_rate": 8.67400131223618e-06, + "loss": 0.4348, + "step": 1855 + }, + { + "epoch": 0.2603085553997195, + "grad_norm": 2.342728964608888, + "learning_rate": 8.672460381893662e-06, + "loss": 0.4137, + "step": 1856 + }, + { + "epoch": 0.26044880785413743, + "grad_norm": 2.4091909692049387, + "learning_rate": 8.670918693764603e-06, + "loss": 0.4115, + "step": 1857 + }, + { + "epoch": 0.2605890603085554, + "grad_norm": 1.9344513195555226, + "learning_rate": 8.669376248167118e-06, + "loss": 0.3515, + "step": 1858 + }, + { + "epoch": 0.26072931276297334, + "grad_norm": 2.2932831582783164, + "learning_rate": 8.667833045419483e-06, + "loss": 0.4041, + "step": 1859 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 1.8623588406111193, + "learning_rate": 8.666289085840122e-06, + "loss": 0.3698, + "step": 1860 + }, + { + "epoch": 0.26100981767180925, + "grad_norm": 1.697984900790457, + "learning_rate": 8.664744369747622e-06, + "loss": 0.3902, + "step": 1861 + }, + { + "epoch": 0.2611500701262272, + "grad_norm": 2.102819471003304, + "learning_rate": 8.663198897460727e-06, + "loss": 0.4029, + "step": 1862 + }, + { + "epoch": 0.26129032258064516, + "grad_norm": 2.098543856631733, + "learning_rate": 8.661652669298332e-06, + "loss": 0.4179, + "step": 1863 + }, + { + "epoch": 0.2614305750350631, + "grad_norm": 2.1064507453859007, + "learning_rate": 8.660105685579493e-06, + "loss": 0.3965, + "step": 1864 + }, + { + "epoch": 0.2615708274894811, + "grad_norm": 2.968723337154117, + "learning_rate": 8.658557946623416e-06, + "loss": 0.3631, + "step": 1865 + }, + { + "epoch": 0.261711079943899, + "grad_norm": 2.1515177891780626, + "learning_rate": 8.657009452749466e-06, + "loss": 0.3782, + "step": 1866 + }, + { + "epoch": 0.261851332398317, + "grad_norm": 3.7319835203814775, + "learning_rate": 8.655460204277167e-06, + "loss": 0.4801, + "step": 1867 + }, + { + "epoch": 0.26199158485273494, + "grad_norm": 1.722021422763533, + "learning_rate": 8.653910201526195e-06, + "loss": 0.4096, + "step": 1868 + }, + { + "epoch": 0.2621318373071529, + "grad_norm": 2.2192863568259393, + "learning_rate": 8.652359444816379e-06, + "loss": 0.3775, + "step": 1869 + }, + { + "epoch": 0.26227208976157085, + "grad_norm": 1.970113978114331, + "learning_rate": 8.65080793446771e-06, + "loss": 0.4222, + "step": 1870 + }, + { + "epoch": 0.2624123422159888, + "grad_norm": 2.2785623473359298, + "learning_rate": 8.649255670800328e-06, + "loss": 0.3658, + "step": 1871 + }, + { + "epoch": 0.26255259467040676, + "grad_norm": 1.9152123559086334, + "learning_rate": 8.647702654134535e-06, + "loss": 0.4157, + "step": 1872 + }, + { + "epoch": 0.26269284712482466, + "grad_norm": 2.0864722320149585, + "learning_rate": 8.646148884790786e-06, + "loss": 0.4045, + "step": 1873 + }, + { + "epoch": 0.2628330995792426, + "grad_norm": 1.9043973456067995, + "learning_rate": 8.644594363089687e-06, + "loss": 0.4008, + "step": 1874 + }, + { + "epoch": 0.26297335203366057, + "grad_norm": 2.779080691478046, + "learning_rate": 8.643039089352005e-06, + "loss": 0.3749, + "step": 1875 + }, + { + "epoch": 0.2631136044880785, + "grad_norm": 2.770364181458188, + "learning_rate": 8.64148306389866e-06, + "loss": 0.3922, + "step": 1876 + }, + { + "epoch": 0.2632538569424965, + "grad_norm": 2.091318341111565, + "learning_rate": 8.639926287050726e-06, + "loss": 0.411, + "step": 1877 + }, + { + "epoch": 0.26339410939691443, + "grad_norm": 2.357830729581564, + "learning_rate": 8.638368759129433e-06, + "loss": 0.4514, + "step": 1878 + }, + { + "epoch": 0.2635343618513324, + "grad_norm": 2.0686976630744565, + "learning_rate": 8.636810480456165e-06, + "loss": 0.4019, + "step": 1879 + }, + { + "epoch": 0.26367461430575034, + "grad_norm": 2.765358432487882, + "learning_rate": 8.635251451352463e-06, + "loss": 0.3982, + "step": 1880 + }, + { + "epoch": 0.2638148667601683, + "grad_norm": 2.245786961045472, + "learning_rate": 8.633691672140022e-06, + "loss": 0.373, + "step": 1881 + }, + { + "epoch": 0.26395511921458625, + "grad_norm": 2.5532891882557145, + "learning_rate": 8.632131143140694e-06, + "loss": 0.3958, + "step": 1882 + }, + { + "epoch": 0.2640953716690042, + "grad_norm": 2.2262751879115914, + "learning_rate": 8.63056986467648e-06, + "loss": 0.3719, + "step": 1883 + }, + { + "epoch": 0.26423562412342216, + "grad_norm": 5.823579790256705, + "learning_rate": 8.629007837069537e-06, + "loss": 0.4001, + "step": 1884 + }, + { + "epoch": 0.2643758765778401, + "grad_norm": 2.0944670403046124, + "learning_rate": 8.627445060642182e-06, + "loss": 0.3603, + "step": 1885 + }, + { + "epoch": 0.2645161290322581, + "grad_norm": 2.336050177570873, + "learning_rate": 8.625881535716883e-06, + "loss": 0.4023, + "step": 1886 + }, + { + "epoch": 0.26465638148667603, + "grad_norm": 1.5570389490468364, + "learning_rate": 8.624317262616261e-06, + "loss": 0.4039, + "step": 1887 + }, + { + "epoch": 0.264796633941094, + "grad_norm": 2.610990926217625, + "learning_rate": 8.622752241663094e-06, + "loss": 0.4119, + "step": 1888 + }, + { + "epoch": 0.26493688639551194, + "grad_norm": 1.922161794213683, + "learning_rate": 8.621186473180312e-06, + "loss": 0.3569, + "step": 1889 + }, + { + "epoch": 0.2650771388499299, + "grad_norm": 1.79694260179815, + "learning_rate": 8.619619957491e-06, + "loss": 0.4201, + "step": 1890 + }, + { + "epoch": 0.26521739130434785, + "grad_norm": 2.6570526705436173, + "learning_rate": 8.6180526949184e-06, + "loss": 0.4005, + "step": 1891 + }, + { + "epoch": 0.2653576437587658, + "grad_norm": 2.0333176872818792, + "learning_rate": 8.616484685785905e-06, + "loss": 0.3845, + "step": 1892 + }, + { + "epoch": 0.2654978962131837, + "grad_norm": 2.3472710164833965, + "learning_rate": 8.614915930417058e-06, + "loss": 0.4486, + "step": 1893 + }, + { + "epoch": 0.26563814866760166, + "grad_norm": 2.113362591036528, + "learning_rate": 8.613346429135567e-06, + "loss": 0.3834, + "step": 1894 + }, + { + "epoch": 0.2657784011220196, + "grad_norm": 2.8160780293480343, + "learning_rate": 8.611776182265285e-06, + "loss": 0.4311, + "step": 1895 + }, + { + "epoch": 0.26591865357643757, + "grad_norm": 3.528094341729408, + "learning_rate": 8.610205190130223e-06, + "loss": 0.3854, + "step": 1896 + }, + { + "epoch": 0.2660589060308555, + "grad_norm": 2.3232927801934005, + "learning_rate": 8.608633453054541e-06, + "loss": 0.3791, + "step": 1897 + }, + { + "epoch": 0.2661991584852735, + "grad_norm": 1.9262337607578615, + "learning_rate": 8.607060971362557e-06, + "loss": 0.3894, + "step": 1898 + }, + { + "epoch": 0.26633941093969143, + "grad_norm": 1.8099675079717121, + "learning_rate": 8.605487745378745e-06, + "loss": 0.4007, + "step": 1899 + }, + { + "epoch": 0.2664796633941094, + "grad_norm": 2.195843181069265, + "learning_rate": 8.603913775427726e-06, + "loss": 0.3962, + "step": 1900 + }, + { + "epoch": 0.26661991584852734, + "grad_norm": 2.5239010789532843, + "learning_rate": 8.602339061834278e-06, + "loss": 0.3825, + "step": 1901 + }, + { + "epoch": 0.2667601683029453, + "grad_norm": 1.8738946199497548, + "learning_rate": 8.600763604923332e-06, + "loss": 0.4033, + "step": 1902 + }, + { + "epoch": 0.26690042075736325, + "grad_norm": 2.5849364956520606, + "learning_rate": 8.599187405019974e-06, + "loss": 0.4197, + "step": 1903 + }, + { + "epoch": 0.2670406732117812, + "grad_norm": 1.8922922648778524, + "learning_rate": 8.597610462449441e-06, + "loss": 0.4101, + "step": 1904 + }, + { + "epoch": 0.26718092566619916, + "grad_norm": 1.539743650328855, + "learning_rate": 8.596032777537124e-06, + "loss": 0.3873, + "step": 1905 + }, + { + "epoch": 0.2673211781206171, + "grad_norm": 1.5646290578988398, + "learning_rate": 8.594454350608565e-06, + "loss": 0.3506, + "step": 1906 + }, + { + "epoch": 0.2674614305750351, + "grad_norm": 2.002680889060512, + "learning_rate": 8.592875181989466e-06, + "loss": 0.4104, + "step": 1907 + }, + { + "epoch": 0.26760168302945303, + "grad_norm": 1.8974059013988245, + "learning_rate": 8.591295272005674e-06, + "loss": 0.419, + "step": 1908 + }, + { + "epoch": 0.267741935483871, + "grad_norm": 2.5530264117514223, + "learning_rate": 8.589714620983195e-06, + "loss": 0.4579, + "step": 1909 + }, + { + "epoch": 0.26788218793828894, + "grad_norm": 2.8885530211350114, + "learning_rate": 8.588133229248182e-06, + "loss": 0.3681, + "step": 1910 + }, + { + "epoch": 0.2680224403927069, + "grad_norm": 1.921906323681632, + "learning_rate": 8.586551097126945e-06, + "loss": 0.3724, + "step": 1911 + }, + { + "epoch": 0.26816269284712485, + "grad_norm": 2.3039845574576705, + "learning_rate": 8.58496822494595e-06, + "loss": 0.394, + "step": 1912 + }, + { + "epoch": 0.26830294530154275, + "grad_norm": 2.4581419552379047, + "learning_rate": 8.583384613031804e-06, + "loss": 0.3755, + "step": 1913 + }, + { + "epoch": 0.2684431977559607, + "grad_norm": 2.03704652478998, + "learning_rate": 8.581800261711281e-06, + "loss": 0.3682, + "step": 1914 + }, + { + "epoch": 0.26858345021037866, + "grad_norm": 2.230393283188611, + "learning_rate": 8.5802151713113e-06, + "loss": 0.3738, + "step": 1915 + }, + { + "epoch": 0.2687237026647966, + "grad_norm": 2.844086344184844, + "learning_rate": 8.578629342158929e-06, + "loss": 0.387, + "step": 1916 + }, + { + "epoch": 0.26886395511921457, + "grad_norm": 4.858322569288278, + "learning_rate": 8.577042774581397e-06, + "loss": 0.4563, + "step": 1917 + }, + { + "epoch": 0.2690042075736325, + "grad_norm": 2.102151781019419, + "learning_rate": 8.57545546890608e-06, + "loss": 0.3915, + "step": 1918 + }, + { + "epoch": 0.2691444600280505, + "grad_norm": 2.125545543878169, + "learning_rate": 8.573867425460506e-06, + "loss": 0.422, + "step": 1919 + }, + { + "epoch": 0.26928471248246844, + "grad_norm": 1.9576893687820685, + "learning_rate": 8.572278644572358e-06, + "loss": 0.3709, + "step": 1920 + }, + { + "epoch": 0.2694249649368864, + "grad_norm": 2.8934073589307046, + "learning_rate": 8.57068912656947e-06, + "loss": 0.4329, + "step": 1921 + }, + { + "epoch": 0.26956521739130435, + "grad_norm": 3.0953975105454576, + "learning_rate": 8.569098871779828e-06, + "loss": 0.3901, + "step": 1922 + }, + { + "epoch": 0.2697054698457223, + "grad_norm": 2.26550734641981, + "learning_rate": 8.567507880531567e-06, + "loss": 0.45, + "step": 1923 + }, + { + "epoch": 0.26984572230014026, + "grad_norm": 2.302109236305386, + "learning_rate": 8.565916153152982e-06, + "loss": 0.3977, + "step": 1924 + }, + { + "epoch": 0.2699859747545582, + "grad_norm": 2.491833317859881, + "learning_rate": 8.564323689972512e-06, + "loss": 0.3613, + "step": 1925 + }, + { + "epoch": 0.27012622720897617, + "grad_norm": 3.1616424724783947, + "learning_rate": 8.56273049131875e-06, + "loss": 0.3399, + "step": 1926 + }, + { + "epoch": 0.2702664796633941, + "grad_norm": 2.516709846058792, + "learning_rate": 8.561136557520444e-06, + "loss": 0.3759, + "step": 1927 + }, + { + "epoch": 0.2704067321178121, + "grad_norm": 3.215223287656667, + "learning_rate": 8.559541888906486e-06, + "loss": 0.4654, + "step": 1928 + }, + { + "epoch": 0.27054698457223003, + "grad_norm": 1.869643571207829, + "learning_rate": 8.557946485805932e-06, + "loss": 0.3496, + "step": 1929 + }, + { + "epoch": 0.270687237026648, + "grad_norm": 2.191051107586194, + "learning_rate": 8.556350348547978e-06, + "loss": 0.393, + "step": 1930 + }, + { + "epoch": 0.27082748948106594, + "grad_norm": 1.9275542912339192, + "learning_rate": 8.554753477461972e-06, + "loss": 0.3902, + "step": 1931 + }, + { + "epoch": 0.2709677419354839, + "grad_norm": 1.7725834071196545, + "learning_rate": 8.553155872877425e-06, + "loss": 0.3906, + "step": 1932 + }, + { + "epoch": 0.2711079943899018, + "grad_norm": 2.804797412131901, + "learning_rate": 8.551557535123988e-06, + "loss": 0.438, + "step": 1933 + }, + { + "epoch": 0.27124824684431975, + "grad_norm": 1.8478280566222092, + "learning_rate": 8.549958464531465e-06, + "loss": 0.3667, + "step": 1934 + }, + { + "epoch": 0.2713884992987377, + "grad_norm": 1.9476510205066677, + "learning_rate": 8.548358661429817e-06, + "loss": 0.3832, + "step": 1935 + }, + { + "epoch": 0.27152875175315566, + "grad_norm": 2.193762180914032, + "learning_rate": 8.546758126149148e-06, + "loss": 0.403, + "step": 1936 + }, + { + "epoch": 0.2716690042075736, + "grad_norm": 2.686763396340027, + "learning_rate": 8.545156859019721e-06, + "loss": 0.357, + "step": 1937 + }, + { + "epoch": 0.27180925666199157, + "grad_norm": 2.4497636252837096, + "learning_rate": 8.543554860371942e-06, + "loss": 0.4097, + "step": 1938 + }, + { + "epoch": 0.2719495091164095, + "grad_norm": 2.2659394144711174, + "learning_rate": 8.541952130536377e-06, + "loss": 0.4214, + "step": 1939 + }, + { + "epoch": 0.2720897615708275, + "grad_norm": 2.116612317546439, + "learning_rate": 8.540348669843736e-06, + "loss": 0.3638, + "step": 1940 + }, + { + "epoch": 0.27223001402524544, + "grad_norm": 2.367573319521346, + "learning_rate": 8.538744478624883e-06, + "loss": 0.3828, + "step": 1941 + }, + { + "epoch": 0.2723702664796634, + "grad_norm": 1.912587122709218, + "learning_rate": 8.537139557210828e-06, + "loss": 0.3923, + "step": 1942 + }, + { + "epoch": 0.27251051893408135, + "grad_norm": 2.6533771911747186, + "learning_rate": 8.535533905932739e-06, + "loss": 0.3634, + "step": 1943 + }, + { + "epoch": 0.2726507713884993, + "grad_norm": 2.6608145595246246, + "learning_rate": 8.533927525121928e-06, + "loss": 0.3968, + "step": 1944 + }, + { + "epoch": 0.27279102384291726, + "grad_norm": 2.6831855741141495, + "learning_rate": 8.532320415109864e-06, + "loss": 0.4027, + "step": 1945 + }, + { + "epoch": 0.2729312762973352, + "grad_norm": 2.1465467976734285, + "learning_rate": 8.53071257622816e-06, + "loss": 0.3948, + "step": 1946 + }, + { + "epoch": 0.27307152875175317, + "grad_norm": 2.184859507777465, + "learning_rate": 8.529104008808584e-06, + "loss": 0.4376, + "step": 1947 + }, + { + "epoch": 0.2732117812061711, + "grad_norm": 2.385611569189652, + "learning_rate": 8.527494713183052e-06, + "loss": 0.3855, + "step": 1948 + }, + { + "epoch": 0.2733520336605891, + "grad_norm": 5.639806122660272, + "learning_rate": 8.525884689683632e-06, + "loss": 0.4211, + "step": 1949 + }, + { + "epoch": 0.27349228611500703, + "grad_norm": 2.061708356188572, + "learning_rate": 8.524273938642539e-06, + "loss": 0.3728, + "step": 1950 + }, + { + "epoch": 0.273632538569425, + "grad_norm": 3.2571343097957044, + "learning_rate": 8.522662460392141e-06, + "loss": 0.3731, + "step": 1951 + }, + { + "epoch": 0.27377279102384294, + "grad_norm": 5.704271008039003, + "learning_rate": 8.521050255264956e-06, + "loss": 0.412, + "step": 1952 + }, + { + "epoch": 0.27391304347826084, + "grad_norm": 2.0338436294132913, + "learning_rate": 8.51943732359365e-06, + "loss": 0.3768, + "step": 1953 + }, + { + "epoch": 0.2740532959326788, + "grad_norm": 2.2392067052879123, + "learning_rate": 8.517823665711043e-06, + "loss": 0.4045, + "step": 1954 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 1.8752295035729778, + "learning_rate": 8.516209281950098e-06, + "loss": 0.3631, + "step": 1955 + }, + { + "epoch": 0.2743338008415147, + "grad_norm": 1.9185941651523495, + "learning_rate": 8.514594172643934e-06, + "loss": 0.3905, + "step": 1956 + }, + { + "epoch": 0.27447405329593266, + "grad_norm": 2.0808382606473472, + "learning_rate": 8.512978338125818e-06, + "loss": 0.3623, + "step": 1957 + }, + { + "epoch": 0.2746143057503506, + "grad_norm": 2.179664420275715, + "learning_rate": 8.511361778729165e-06, + "loss": 0.3849, + "step": 1958 + }, + { + "epoch": 0.2747545582047686, + "grad_norm": 2.319733621333297, + "learning_rate": 8.509744494787543e-06, + "loss": 0.3628, + "step": 1959 + }, + { + "epoch": 0.27489481065918653, + "grad_norm": 1.9976060964698326, + "learning_rate": 8.508126486634664e-06, + "loss": 0.3813, + "step": 1960 + }, + { + "epoch": 0.2750350631136045, + "grad_norm": 1.7363586766373231, + "learning_rate": 8.506507754604393e-06, + "loss": 0.4096, + "step": 1961 + }, + { + "epoch": 0.27517531556802244, + "grad_norm": 1.7032192799082124, + "learning_rate": 8.504888299030748e-06, + "loss": 0.4375, + "step": 1962 + }, + { + "epoch": 0.2753155680224404, + "grad_norm": 2.1166125428296088, + "learning_rate": 8.503268120247888e-06, + "loss": 0.3722, + "step": 1963 + }, + { + "epoch": 0.27545582047685835, + "grad_norm": 2.6186934508270143, + "learning_rate": 8.501647218590127e-06, + "loss": 0.3877, + "step": 1964 + }, + { + "epoch": 0.2755960729312763, + "grad_norm": 2.5298676660479384, + "learning_rate": 8.500025594391927e-06, + "loss": 0.4027, + "step": 1965 + }, + { + "epoch": 0.27573632538569426, + "grad_norm": 2.400180344958265, + "learning_rate": 8.498403247987899e-06, + "loss": 0.3818, + "step": 1966 + }, + { + "epoch": 0.2758765778401122, + "grad_norm": 1.9054596309905054, + "learning_rate": 8.496780179712804e-06, + "loss": 0.3779, + "step": 1967 + }, + { + "epoch": 0.27601683029453017, + "grad_norm": 2.9199983006302577, + "learning_rate": 8.495156389901548e-06, + "loss": 0.3671, + "step": 1968 + }, + { + "epoch": 0.2761570827489481, + "grad_norm": 2.0482238518447207, + "learning_rate": 8.49353187888919e-06, + "loss": 0.3613, + "step": 1969 + }, + { + "epoch": 0.2762973352033661, + "grad_norm": 4.292017406958045, + "learning_rate": 8.491906647010937e-06, + "loss": 0.3514, + "step": 1970 + }, + { + "epoch": 0.27643758765778403, + "grad_norm": 2.3651074628226647, + "learning_rate": 8.490280694602142e-06, + "loss": 0.3675, + "step": 1971 + }, + { + "epoch": 0.276577840112202, + "grad_norm": 2.248692450473476, + "learning_rate": 8.488654021998313e-06, + "loss": 0.3944, + "step": 1972 + }, + { + "epoch": 0.2767180925666199, + "grad_norm": 2.194272918543235, + "learning_rate": 8.4870266295351e-06, + "loss": 0.415, + "step": 1973 + }, + { + "epoch": 0.27685834502103784, + "grad_norm": 2.5410939702264534, + "learning_rate": 8.485398517548303e-06, + "loss": 0.4024, + "step": 1974 + }, + { + "epoch": 0.2769985974754558, + "grad_norm": 1.7951171237487749, + "learning_rate": 8.483769686373872e-06, + "loss": 0.3656, + "step": 1975 + }, + { + "epoch": 0.27713884992987375, + "grad_norm": 2.072755804162117, + "learning_rate": 8.482140136347907e-06, + "loss": 0.3525, + "step": 1976 + }, + { + "epoch": 0.2772791023842917, + "grad_norm": 2.1762294889707223, + "learning_rate": 8.480509867806655e-06, + "loss": 0.4684, + "step": 1977 + }, + { + "epoch": 0.27741935483870966, + "grad_norm": 1.84907254864733, + "learning_rate": 8.478878881086505e-06, + "loss": 0.373, + "step": 1978 + }, + { + "epoch": 0.2775596072931276, + "grad_norm": 3.078471517664375, + "learning_rate": 8.477247176524007e-06, + "loss": 0.3638, + "step": 1979 + }, + { + "epoch": 0.2776998597475456, + "grad_norm": 2.149284722866258, + "learning_rate": 8.475614754455845e-06, + "loss": 0.4024, + "step": 1980 + }, + { + "epoch": 0.27784011220196353, + "grad_norm": 1.661085375916414, + "learning_rate": 8.473981615218863e-06, + "loss": 0.3863, + "step": 1981 + }, + { + "epoch": 0.2779803646563815, + "grad_norm": 2.911253528977741, + "learning_rate": 8.472347759150044e-06, + "loss": 0.4179, + "step": 1982 + }, + { + "epoch": 0.27812061711079944, + "grad_norm": 2.5618475106545042, + "learning_rate": 8.470713186586526e-06, + "loss": 0.3999, + "step": 1983 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 2.0787290684695052, + "learning_rate": 8.46907789786559e-06, + "loss": 0.3664, + "step": 1984 + }, + { + "epoch": 0.27840112201963535, + "grad_norm": 2.2599584815338614, + "learning_rate": 8.467441893324667e-06, + "loss": 0.3781, + "step": 1985 + }, + { + "epoch": 0.2785413744740533, + "grad_norm": 3.101759897095279, + "learning_rate": 8.465805173301333e-06, + "loss": 0.394, + "step": 1986 + }, + { + "epoch": 0.27868162692847126, + "grad_norm": 2.403749529143717, + "learning_rate": 8.464167738133317e-06, + "loss": 0.4104, + "step": 1987 + }, + { + "epoch": 0.2788218793828892, + "grad_norm": 2.042999209000056, + "learning_rate": 8.462529588158491e-06, + "loss": 0.3861, + "step": 1988 + }, + { + "epoch": 0.27896213183730717, + "grad_norm": 2.122307923413435, + "learning_rate": 8.460890723714874e-06, + "loss": 0.3736, + "step": 1989 + }, + { + "epoch": 0.2791023842917251, + "grad_norm": 1.6328002345158334, + "learning_rate": 8.459251145140639e-06, + "loss": 0.4034, + "step": 1990 + }, + { + "epoch": 0.2792426367461431, + "grad_norm": 2.0913696422657617, + "learning_rate": 8.457610852774097e-06, + "loss": 0.3582, + "step": 1991 + }, + { + "epoch": 0.27938288920056104, + "grad_norm": 1.8425595594533861, + "learning_rate": 8.455969846953711e-06, + "loss": 0.4194, + "step": 1992 + }, + { + "epoch": 0.27952314165497893, + "grad_norm": 2.6940249698787273, + "learning_rate": 8.454328128018093e-06, + "loss": 0.385, + "step": 1993 + }, + { + "epoch": 0.2796633941093969, + "grad_norm": 5.380494436637811, + "learning_rate": 8.452685696306e-06, + "loss": 0.3974, + "step": 1994 + }, + { + "epoch": 0.27980364656381485, + "grad_norm": 1.8393183968506612, + "learning_rate": 8.451042552156335e-06, + "loss": 0.4158, + "step": 1995 + }, + { + "epoch": 0.2799438990182328, + "grad_norm": 1.9801218304577122, + "learning_rate": 8.449398695908151e-06, + "loss": 0.3812, + "step": 1996 + }, + { + "epoch": 0.28008415147265076, + "grad_norm": 1.7930305154504615, + "learning_rate": 8.447754127900645e-06, + "loss": 0.4415, + "step": 1997 + }, + { + "epoch": 0.2802244039270687, + "grad_norm": 1.9863592124764318, + "learning_rate": 8.446108848473165e-06, + "loss": 0.3819, + "step": 1998 + }, + { + "epoch": 0.28036465638148667, + "grad_norm": 2.4154400398141744, + "learning_rate": 8.444462857965198e-06, + "loss": 0.4205, + "step": 1999 + }, + { + "epoch": 0.2805049088359046, + "grad_norm": 3.0162779738531658, + "learning_rate": 8.442816156716386e-06, + "loss": 0.3709, + "step": 2000 + }, + { + "epoch": 0.2806451612903226, + "grad_norm": 2.124307519615917, + "learning_rate": 8.441168745066513e-06, + "loss": 0.3945, + "step": 2001 + }, + { + "epoch": 0.28078541374474053, + "grad_norm": 1.965962086021889, + "learning_rate": 8.439520623355513e-06, + "loss": 0.4628, + "step": 2002 + }, + { + "epoch": 0.2809256661991585, + "grad_norm": 1.94843430874463, + "learning_rate": 8.43787179192346e-06, + "loss": 0.402, + "step": 2003 + }, + { + "epoch": 0.28106591865357644, + "grad_norm": 1.6164321978455929, + "learning_rate": 8.436222251110584e-06, + "loss": 0.4205, + "step": 2004 + }, + { + "epoch": 0.2812061711079944, + "grad_norm": 2.874593457212774, + "learning_rate": 8.434572001257253e-06, + "loss": 0.3794, + "step": 2005 + }, + { + "epoch": 0.28134642356241235, + "grad_norm": 1.9116949269346306, + "learning_rate": 8.432921042703985e-06, + "loss": 0.3831, + "step": 2006 + }, + { + "epoch": 0.2814866760168303, + "grad_norm": 1.930630102326828, + "learning_rate": 8.431269375791444e-06, + "loss": 0.3987, + "step": 2007 + }, + { + "epoch": 0.28162692847124826, + "grad_norm": 2.465397163568844, + "learning_rate": 8.429617000860441e-06, + "loss": 0.4027, + "step": 2008 + }, + { + "epoch": 0.2817671809256662, + "grad_norm": 1.7839813706832324, + "learning_rate": 8.427963918251932e-06, + "loss": 0.3749, + "step": 2009 + }, + { + "epoch": 0.28190743338008417, + "grad_norm": 2.410140224508614, + "learning_rate": 8.426310128307016e-06, + "loss": 0.378, + "step": 2010 + }, + { + "epoch": 0.2820476858345021, + "grad_norm": 1.982791479072939, + "learning_rate": 8.424655631366945e-06, + "loss": 0.3891, + "step": 2011 + }, + { + "epoch": 0.2821879382889201, + "grad_norm": 1.6456976082123294, + "learning_rate": 8.42300042777311e-06, + "loss": 0.4031, + "step": 2012 + }, + { + "epoch": 0.282328190743338, + "grad_norm": 1.7083401408486298, + "learning_rate": 8.42134451786705e-06, + "loss": 0.3772, + "step": 2013 + }, + { + "epoch": 0.28246844319775594, + "grad_norm": 2.1972682291651235, + "learning_rate": 8.419687901990454e-06, + "loss": 0.371, + "step": 2014 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 2.6676438945081284, + "learning_rate": 8.41803058048515e-06, + "loss": 0.409, + "step": 2015 + }, + { + "epoch": 0.28274894810659185, + "grad_norm": 2.1337667154150166, + "learning_rate": 8.416372553693118e-06, + "loss": 0.4004, + "step": 2016 + }, + { + "epoch": 0.2828892005610098, + "grad_norm": 2.1972137580683904, + "learning_rate": 8.414713821956477e-06, + "loss": 0.3757, + "step": 2017 + }, + { + "epoch": 0.28302945301542776, + "grad_norm": 1.8607969857182447, + "learning_rate": 8.413054385617495e-06, + "loss": 0.4189, + "step": 2018 + }, + { + "epoch": 0.2831697054698457, + "grad_norm": 3.6280329939867615, + "learning_rate": 8.411394245018589e-06, + "loss": 0.3783, + "step": 2019 + }, + { + "epoch": 0.28330995792426367, + "grad_norm": 2.1460758059889846, + "learning_rate": 8.409733400502311e-06, + "loss": 0.3876, + "step": 2020 + }, + { + "epoch": 0.2834502103786816, + "grad_norm": 2.0155866752352134, + "learning_rate": 8.40807185241137e-06, + "loss": 0.3831, + "step": 2021 + }, + { + "epoch": 0.2835904628330996, + "grad_norm": 1.9741713342682568, + "learning_rate": 8.406409601088612e-06, + "loss": 0.3919, + "step": 2022 + }, + { + "epoch": 0.28373071528751753, + "grad_norm": 2.0708475824962154, + "learning_rate": 8.404746646877033e-06, + "loss": 0.4318, + "step": 2023 + }, + { + "epoch": 0.2838709677419355, + "grad_norm": 2.147420746625381, + "learning_rate": 8.40308299011977e-06, + "loss": 0.4323, + "step": 2024 + }, + { + "epoch": 0.28401122019635344, + "grad_norm": 2.5912033815878286, + "learning_rate": 8.401418631160109e-06, + "loss": 0.3917, + "step": 2025 + }, + { + "epoch": 0.2841514726507714, + "grad_norm": 2.0905007728879967, + "learning_rate": 8.399753570341475e-06, + "loss": 0.3675, + "step": 2026 + }, + { + "epoch": 0.28429172510518935, + "grad_norm": 2.532844229798891, + "learning_rate": 8.398087808007447e-06, + "loss": 0.324, + "step": 2027 + }, + { + "epoch": 0.2844319775596073, + "grad_norm": 2.0360164179111884, + "learning_rate": 8.39642134450174e-06, + "loss": 0.429, + "step": 2028 + }, + { + "epoch": 0.28457223001402526, + "grad_norm": 1.9246541839703972, + "learning_rate": 8.394754180168218e-06, + "loss": 0.3939, + "step": 2029 + }, + { + "epoch": 0.2847124824684432, + "grad_norm": 2.034711382782424, + "learning_rate": 8.393086315350887e-06, + "loss": 0.368, + "step": 2030 + }, + { + "epoch": 0.2848527349228612, + "grad_norm": 1.8123260283012756, + "learning_rate": 8.391417750393903e-06, + "loss": 0.3866, + "step": 2031 + }, + { + "epoch": 0.28499298737727913, + "grad_norm": 2.9954588216066194, + "learning_rate": 8.38974848564156e-06, + "loss": 0.4104, + "step": 2032 + }, + { + "epoch": 0.285133239831697, + "grad_norm": 2.104818891404387, + "learning_rate": 8.388078521438299e-06, + "loss": 0.3898, + "step": 2033 + }, + { + "epoch": 0.285273492286115, + "grad_norm": 2.013439087311307, + "learning_rate": 8.386407858128707e-06, + "loss": 0.4037, + "step": 2034 + }, + { + "epoch": 0.28541374474053294, + "grad_norm": 2.5360782868609166, + "learning_rate": 8.38473649605751e-06, + "loss": 0.4272, + "step": 2035 + }, + { + "epoch": 0.2855539971949509, + "grad_norm": 1.9654114284233963, + "learning_rate": 8.383064435569587e-06, + "loss": 0.3522, + "step": 2036 + }, + { + "epoch": 0.28569424964936885, + "grad_norm": 2.695067109809957, + "learning_rate": 8.381391677009954e-06, + "loss": 0.3766, + "step": 2037 + }, + { + "epoch": 0.2858345021037868, + "grad_norm": 4.958714168679723, + "learning_rate": 8.379718220723772e-06, + "loss": 0.4305, + "step": 2038 + }, + { + "epoch": 0.28597475455820476, + "grad_norm": 2.2216902043020315, + "learning_rate": 8.378044067056348e-06, + "loss": 0.4076, + "step": 2039 + }, + { + "epoch": 0.2861150070126227, + "grad_norm": 3.0217510714755886, + "learning_rate": 8.376369216353132e-06, + "loss": 0.3769, + "step": 2040 + }, + { + "epoch": 0.28625525946704067, + "grad_norm": 2.4143684464932775, + "learning_rate": 8.374693668959717e-06, + "loss": 0.3704, + "step": 2041 + }, + { + "epoch": 0.2863955119214586, + "grad_norm": 2.8704516049537983, + "learning_rate": 8.373017425221841e-06, + "loss": 0.3753, + "step": 2042 + }, + { + "epoch": 0.2865357643758766, + "grad_norm": 2.452911562047353, + "learning_rate": 8.371340485485384e-06, + "loss": 0.4044, + "step": 2043 + }, + { + "epoch": 0.28667601683029453, + "grad_norm": 1.9048770900934886, + "learning_rate": 8.369662850096374e-06, + "loss": 0.4221, + "step": 2044 + }, + { + "epoch": 0.2868162692847125, + "grad_norm": 1.6405235622473011, + "learning_rate": 8.367984519400976e-06, + "loss": 0.3667, + "step": 2045 + }, + { + "epoch": 0.28695652173913044, + "grad_norm": 1.7648879681874103, + "learning_rate": 8.366305493745502e-06, + "loss": 0.3676, + "step": 2046 + }, + { + "epoch": 0.2870967741935484, + "grad_norm": 1.829144837315095, + "learning_rate": 8.36462577347641e-06, + "loss": 0.4257, + "step": 2047 + }, + { + "epoch": 0.28723702664796635, + "grad_norm": 2.2416230132474344, + "learning_rate": 8.362945358940295e-06, + "loss": 0.3743, + "step": 2048 + }, + { + "epoch": 0.2873772791023843, + "grad_norm": 1.9966126605432148, + "learning_rate": 8.361264250483903e-06, + "loss": 0.359, + "step": 2049 + }, + { + "epoch": 0.28751753155680226, + "grad_norm": 1.9181819261224626, + "learning_rate": 8.359582448454114e-06, + "loss": 0.3865, + "step": 2050 + }, + { + "epoch": 0.2876577840112202, + "grad_norm": 2.314355878758693, + "learning_rate": 8.357899953197959e-06, + "loss": 0.3974, + "step": 2051 + }, + { + "epoch": 0.2877980364656382, + "grad_norm": 3.978629723449816, + "learning_rate": 8.35621676506261e-06, + "loss": 0.4274, + "step": 2052 + }, + { + "epoch": 0.2879382889200561, + "grad_norm": 3.4541337278948343, + "learning_rate": 8.354532884395381e-06, + "loss": 0.4033, + "step": 2053 + }, + { + "epoch": 0.28807854137447403, + "grad_norm": 2.454124975794849, + "learning_rate": 8.352848311543726e-06, + "loss": 0.3851, + "step": 2054 + }, + { + "epoch": 0.288218793828892, + "grad_norm": 1.98882977127263, + "learning_rate": 8.351163046855246e-06, + "loss": 0.4103, + "step": 2055 + }, + { + "epoch": 0.28835904628330994, + "grad_norm": 2.0469990467018997, + "learning_rate": 8.349477090677686e-06, + "loss": 0.3498, + "step": 2056 + }, + { + "epoch": 0.2884992987377279, + "grad_norm": 1.9361363811415777, + "learning_rate": 8.34779044335893e-06, + "loss": 0.3742, + "step": 2057 + }, + { + "epoch": 0.28863955119214585, + "grad_norm": 1.6277148969495066, + "learning_rate": 8.346103105247004e-06, + "loss": 0.3689, + "step": 2058 + }, + { + "epoch": 0.2887798036465638, + "grad_norm": 3.7857220898982216, + "learning_rate": 8.34441507669008e-06, + "loss": 0.352, + "step": 2059 + }, + { + "epoch": 0.28892005610098176, + "grad_norm": 1.8252570977119367, + "learning_rate": 8.342726358036473e-06, + "loss": 0.3535, + "step": 2060 + }, + { + "epoch": 0.2890603085553997, + "grad_norm": 1.994422765606317, + "learning_rate": 8.341036949634633e-06, + "loss": 0.3991, + "step": 2061 + }, + { + "epoch": 0.28920056100981767, + "grad_norm": 2.1137938380193178, + "learning_rate": 8.339346851833163e-06, + "loss": 0.4185, + "step": 2062 + }, + { + "epoch": 0.2893408134642356, + "grad_norm": 2.6572276391949305, + "learning_rate": 8.337656064980801e-06, + "loss": 0.4079, + "step": 2063 + }, + { + "epoch": 0.2894810659186536, + "grad_norm": 2.529826387924585, + "learning_rate": 8.335964589426429e-06, + "loss": 0.3931, + "step": 2064 + }, + { + "epoch": 0.28962131837307153, + "grad_norm": 2.7859199835739554, + "learning_rate": 8.334272425519069e-06, + "loss": 0.4661, + "step": 2065 + }, + { + "epoch": 0.2897615708274895, + "grad_norm": 2.245219875009994, + "learning_rate": 8.33257957360789e-06, + "loss": 0.3807, + "step": 2066 + }, + { + "epoch": 0.28990182328190744, + "grad_norm": 1.846290777397913, + "learning_rate": 8.330886034042198e-06, + "loss": 0.3663, + "step": 2067 + }, + { + "epoch": 0.2900420757363254, + "grad_norm": 2.6281187513279423, + "learning_rate": 8.329191807171447e-06, + "loss": 0.3913, + "step": 2068 + }, + { + "epoch": 0.29018232819074335, + "grad_norm": 2.922661007507906, + "learning_rate": 8.327496893345223e-06, + "loss": 0.3935, + "step": 2069 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 1.9079077969431208, + "learning_rate": 8.325801292913265e-06, + "loss": 0.3935, + "step": 2070 + }, + { + "epoch": 0.29046283309957927, + "grad_norm": 3.1233806228993175, + "learning_rate": 8.324105006225444e-06, + "loss": 0.3871, + "step": 2071 + }, + { + "epoch": 0.2906030855539972, + "grad_norm": 2.8115247095071365, + "learning_rate": 8.32240803363178e-06, + "loss": 0.3786, + "step": 2072 + }, + { + "epoch": 0.2907433380084151, + "grad_norm": 2.0744786216459934, + "learning_rate": 8.320710375482432e-06, + "loss": 0.3892, + "step": 2073 + }, + { + "epoch": 0.2908835904628331, + "grad_norm": 2.0533894547104463, + "learning_rate": 8.319012032127698e-06, + "loss": 0.393, + "step": 2074 + }, + { + "epoch": 0.29102384291725103, + "grad_norm": 2.0383829321744065, + "learning_rate": 8.317313003918017e-06, + "loss": 0.3717, + "step": 2075 + }, + { + "epoch": 0.291164095371669, + "grad_norm": 2.0247831252017, + "learning_rate": 8.315613291203977e-06, + "loss": 0.4398, + "step": 2076 + }, + { + "epoch": 0.29130434782608694, + "grad_norm": 1.9283604556951024, + "learning_rate": 8.313912894336298e-06, + "loss": 0.3893, + "step": 2077 + }, + { + "epoch": 0.2914446002805049, + "grad_norm": 2.0797136014221835, + "learning_rate": 8.312211813665848e-06, + "loss": 0.3805, + "step": 2078 + }, + { + "epoch": 0.29158485273492285, + "grad_norm": 3.2149438668626296, + "learning_rate": 8.310510049543628e-06, + "loss": 0.3408, + "step": 2079 + }, + { + "epoch": 0.2917251051893408, + "grad_norm": 1.98311611102613, + "learning_rate": 8.30880760232079e-06, + "loss": 0.3638, + "step": 2080 + }, + { + "epoch": 0.29186535764375876, + "grad_norm": 2.448884153249969, + "learning_rate": 8.307104472348619e-06, + "loss": 0.3961, + "step": 2081 + }, + { + "epoch": 0.2920056100981767, + "grad_norm": 2.5653130512420512, + "learning_rate": 8.305400659978547e-06, + "loss": 0.4071, + "step": 2082 + }, + { + "epoch": 0.29214586255259467, + "grad_norm": 2.752057346295419, + "learning_rate": 8.303696165562141e-06, + "loss": 0.4087, + "step": 2083 + }, + { + "epoch": 0.2922861150070126, + "grad_norm": 2.0282528406417404, + "learning_rate": 8.301990989451114e-06, + "loss": 0.3831, + "step": 2084 + }, + { + "epoch": 0.2924263674614306, + "grad_norm": 1.9096606026929017, + "learning_rate": 8.300285131997315e-06, + "loss": 0.3793, + "step": 2085 + }, + { + "epoch": 0.29256661991584854, + "grad_norm": 1.9258908130360277, + "learning_rate": 8.298578593552737e-06, + "loss": 0.338, + "step": 2086 + }, + { + "epoch": 0.2927068723702665, + "grad_norm": 4.284687082622965, + "learning_rate": 8.296871374469511e-06, + "loss": 0.4034, + "step": 2087 + }, + { + "epoch": 0.29284712482468445, + "grad_norm": 2.629651444403835, + "learning_rate": 8.295163475099911e-06, + "loss": 0.3948, + "step": 2088 + }, + { + "epoch": 0.2929873772791024, + "grad_norm": 2.6515235587808332, + "learning_rate": 8.293454895796351e-06, + "loss": 0.3882, + "step": 2089 + }, + { + "epoch": 0.29312762973352036, + "grad_norm": 2.2166437171761464, + "learning_rate": 8.291745636911382e-06, + "loss": 0.3615, + "step": 2090 + }, + { + "epoch": 0.2932678821879383, + "grad_norm": 2.3002612007099756, + "learning_rate": 8.2900356987977e-06, + "loss": 0.3707, + "step": 2091 + }, + { + "epoch": 0.29340813464235627, + "grad_norm": 2.288243169913762, + "learning_rate": 8.288325081808134e-06, + "loss": 0.4152, + "step": 2092 + }, + { + "epoch": 0.29354838709677417, + "grad_norm": 1.7614318884773124, + "learning_rate": 8.286613786295666e-06, + "loss": 0.4022, + "step": 2093 + }, + { + "epoch": 0.2936886395511921, + "grad_norm": 1.6696173853202558, + "learning_rate": 8.284901812613403e-06, + "loss": 0.4109, + "step": 2094 + }, + { + "epoch": 0.2938288920056101, + "grad_norm": 4.319676054005422, + "learning_rate": 8.283189161114602e-06, + "loss": 0.4101, + "step": 2095 + }, + { + "epoch": 0.29396914446002803, + "grad_norm": 3.0241739327399584, + "learning_rate": 8.281475832152655e-06, + "loss": 0.3953, + "step": 2096 + }, + { + "epoch": 0.294109396914446, + "grad_norm": 2.3366791941496956, + "learning_rate": 8.279761826081096e-06, + "loss": 0.3794, + "step": 2097 + }, + { + "epoch": 0.29424964936886394, + "grad_norm": 2.1285467281718837, + "learning_rate": 8.2780471432536e-06, + "loss": 0.3705, + "step": 2098 + }, + { + "epoch": 0.2943899018232819, + "grad_norm": 1.898297214426797, + "learning_rate": 8.276331784023976e-06, + "loss": 0.3772, + "step": 2099 + }, + { + "epoch": 0.29453015427769985, + "grad_norm": 2.105764732717809, + "learning_rate": 8.27461574874618e-06, + "loss": 0.3903, + "step": 2100 + }, + { + "epoch": 0.2946704067321178, + "grad_norm": 2.126178414711259, + "learning_rate": 8.272899037774302e-06, + "loss": 0.412, + "step": 2101 + }, + { + "epoch": 0.29481065918653576, + "grad_norm": 4.847686349435665, + "learning_rate": 8.271181651462575e-06, + "loss": 0.3925, + "step": 2102 + }, + { + "epoch": 0.2949509116409537, + "grad_norm": 1.9651083796462991, + "learning_rate": 8.269463590165368e-06, + "loss": 0.3721, + "step": 2103 + }, + { + "epoch": 0.29509116409537167, + "grad_norm": 2.66404656995558, + "learning_rate": 8.26774485423719e-06, + "loss": 0.3814, + "step": 2104 + }, + { + "epoch": 0.2952314165497896, + "grad_norm": 2.0170938034221906, + "learning_rate": 8.266025444032694e-06, + "loss": 0.3834, + "step": 2105 + }, + { + "epoch": 0.2953716690042076, + "grad_norm": 2.699145082829306, + "learning_rate": 8.264305359906664e-06, + "loss": 0.3915, + "step": 2106 + }, + { + "epoch": 0.29551192145862554, + "grad_norm": 2.1751018675703127, + "learning_rate": 8.26258460221403e-06, + "loss": 0.3809, + "step": 2107 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 2.0102372427085955, + "learning_rate": 8.260863171309857e-06, + "loss": 0.3755, + "step": 2108 + }, + { + "epoch": 0.29579242636746145, + "grad_norm": 2.1862630344043756, + "learning_rate": 8.25914106754935e-06, + "loss": 0.3799, + "step": 2109 + }, + { + "epoch": 0.2959326788218794, + "grad_norm": 2.1306685966312675, + "learning_rate": 8.257418291287855e-06, + "loss": 0.3374, + "step": 2110 + }, + { + "epoch": 0.29607293127629736, + "grad_norm": 4.926736035785619, + "learning_rate": 8.255694842880854e-06, + "loss": 0.4177, + "step": 2111 + }, + { + "epoch": 0.2962131837307153, + "grad_norm": 3.0856561073742608, + "learning_rate": 8.253970722683968e-06, + "loss": 0.3788, + "step": 2112 + }, + { + "epoch": 0.2963534361851332, + "grad_norm": 1.8817922906807496, + "learning_rate": 8.252245931052958e-06, + "loss": 0.3877, + "step": 2113 + }, + { + "epoch": 0.29649368863955117, + "grad_norm": 7.795719772101657, + "learning_rate": 8.250520468343722e-06, + "loss": 0.3775, + "step": 2114 + }, + { + "epoch": 0.2966339410939691, + "grad_norm": 2.2941997284682962, + "learning_rate": 8.248794334912297e-06, + "loss": 0.3845, + "step": 2115 + }, + { + "epoch": 0.2967741935483871, + "grad_norm": 3.1427453318436727, + "learning_rate": 8.247067531114858e-06, + "loss": 0.3832, + "step": 2116 + }, + { + "epoch": 0.29691444600280503, + "grad_norm": 2.1658731866065994, + "learning_rate": 8.245340057307722e-06, + "loss": 0.4201, + "step": 2117 + }, + { + "epoch": 0.297054698457223, + "grad_norm": 2.1934600290656965, + "learning_rate": 8.243611913847337e-06, + "loss": 0.4464, + "step": 2118 + }, + { + "epoch": 0.29719495091164094, + "grad_norm": 1.7379775564239317, + "learning_rate": 8.241883101090296e-06, + "loss": 0.3596, + "step": 2119 + }, + { + "epoch": 0.2973352033660589, + "grad_norm": 2.020164641289784, + "learning_rate": 8.240153619393325e-06, + "loss": 0.3881, + "step": 2120 + }, + { + "epoch": 0.29747545582047685, + "grad_norm": 2.090539434988478, + "learning_rate": 8.238423469113294e-06, + "loss": 0.3836, + "step": 2121 + }, + { + "epoch": 0.2976157082748948, + "grad_norm": 1.844075578295086, + "learning_rate": 8.236692650607205e-06, + "loss": 0.3258, + "step": 2122 + }, + { + "epoch": 0.29775596072931276, + "grad_norm": 3.8265461059568042, + "learning_rate": 8.2349611642322e-06, + "loss": 0.4089, + "step": 2123 + }, + { + "epoch": 0.2978962131837307, + "grad_norm": 2.880623831737345, + "learning_rate": 8.233229010345561e-06, + "loss": 0.3996, + "step": 2124 + }, + { + "epoch": 0.2980364656381487, + "grad_norm": 3.206971775483844, + "learning_rate": 8.231496189304704e-06, + "loss": 0.4055, + "step": 2125 + }, + { + "epoch": 0.29817671809256663, + "grad_norm": 2.7135037195599483, + "learning_rate": 8.229762701467187e-06, + "loss": 0.3559, + "step": 2126 + }, + { + "epoch": 0.2983169705469846, + "grad_norm": 2.7865619295536117, + "learning_rate": 8.2280285471907e-06, + "loss": 0.4346, + "step": 2127 + }, + { + "epoch": 0.29845722300140254, + "grad_norm": 2.2144601748777024, + "learning_rate": 8.226293726833077e-06, + "loss": 0.4049, + "step": 2128 + }, + { + "epoch": 0.2985974754558205, + "grad_norm": 3.596369178661456, + "learning_rate": 8.224558240752282e-06, + "loss": 0.3634, + "step": 2129 + }, + { + "epoch": 0.29873772791023845, + "grad_norm": 2.2244717603122672, + "learning_rate": 8.222822089306423e-06, + "loss": 0.3657, + "step": 2130 + }, + { + "epoch": 0.2988779803646564, + "grad_norm": 2.0354943168558925, + "learning_rate": 8.221085272853743e-06, + "loss": 0.3231, + "step": 2131 + }, + { + "epoch": 0.29901823281907436, + "grad_norm": 2.221840009668665, + "learning_rate": 8.21934779175262e-06, + "loss": 0.4098, + "step": 2132 + }, + { + "epoch": 0.29915848527349226, + "grad_norm": 2.363376583982346, + "learning_rate": 8.217609646361574e-06, + "loss": 0.3773, + "step": 2133 + }, + { + "epoch": 0.2992987377279102, + "grad_norm": 2.9407561091546293, + "learning_rate": 8.215870837039258e-06, + "loss": 0.3736, + "step": 2134 + }, + { + "epoch": 0.29943899018232817, + "grad_norm": 2.092441890370004, + "learning_rate": 8.21413136414446e-06, + "loss": 0.4006, + "step": 2135 + }, + { + "epoch": 0.2995792426367461, + "grad_norm": 2.6833049946427825, + "learning_rate": 8.212391228036111e-06, + "loss": 0.417, + "step": 2136 + }, + { + "epoch": 0.2997194950911641, + "grad_norm": 2.937513635481983, + "learning_rate": 8.210650429073278e-06, + "loss": 0.3488, + "step": 2137 + }, + { + "epoch": 0.29985974754558203, + "grad_norm": 1.7572655399040793, + "learning_rate": 8.208908967615159e-06, + "loss": 0.3499, + "step": 2138 + }, + { + "epoch": 0.3, + "grad_norm": 2.4271575414312343, + "learning_rate": 8.207166844021093e-06, + "loss": 0.3656, + "step": 2139 + }, + { + "epoch": 0.30014025245441794, + "grad_norm": 1.9399610086828964, + "learning_rate": 8.205424058650557e-06, + "loss": 0.3795, + "step": 2140 + }, + { + "epoch": 0.3002805049088359, + "grad_norm": 2.1563743887143016, + "learning_rate": 8.203680611863161e-06, + "loss": 0.3787, + "step": 2141 + }, + { + "epoch": 0.30042075736325385, + "grad_norm": 1.9535460361614778, + "learning_rate": 8.201936504018653e-06, + "loss": 0.3215, + "step": 2142 + }, + { + "epoch": 0.3005610098176718, + "grad_norm": 2.0141089600501014, + "learning_rate": 8.200191735476918e-06, + "loss": 0.3664, + "step": 2143 + }, + { + "epoch": 0.30070126227208976, + "grad_norm": 2.088290826537911, + "learning_rate": 8.198446306597977e-06, + "loss": 0.3713, + "step": 2144 + }, + { + "epoch": 0.3008415147265077, + "grad_norm": 2.2860951638492764, + "learning_rate": 8.196700217741987e-06, + "loss": 0.3709, + "step": 2145 + }, + { + "epoch": 0.3009817671809257, + "grad_norm": 2.004121229749821, + "learning_rate": 8.19495346926924e-06, + "loss": 0.3616, + "step": 2146 + }, + { + "epoch": 0.30112201963534363, + "grad_norm": 2.6905026185385044, + "learning_rate": 8.193206061540167e-06, + "loss": 0.3639, + "step": 2147 + }, + { + "epoch": 0.3012622720897616, + "grad_norm": 2.6251891840018953, + "learning_rate": 8.191457994915334e-06, + "loss": 0.3954, + "step": 2148 + }, + { + "epoch": 0.30140252454417954, + "grad_norm": 3.0459803123514146, + "learning_rate": 8.18970926975544e-06, + "loss": 0.4426, + "step": 2149 + }, + { + "epoch": 0.3015427769985975, + "grad_norm": 2.8704062540507147, + "learning_rate": 8.187959886421322e-06, + "loss": 0.3841, + "step": 2150 + }, + { + "epoch": 0.30168302945301545, + "grad_norm": 1.7224156910232515, + "learning_rate": 8.186209845273954e-06, + "loss": 0.3872, + "step": 2151 + }, + { + "epoch": 0.3018232819074334, + "grad_norm": 1.9432145414583362, + "learning_rate": 8.184459146674447e-06, + "loss": 0.3739, + "step": 2152 + }, + { + "epoch": 0.30196353436185136, + "grad_norm": 1.9952429583804874, + "learning_rate": 8.182707790984043e-06, + "loss": 0.3958, + "step": 2153 + }, + { + "epoch": 0.30210378681626926, + "grad_norm": 2.0696880046481576, + "learning_rate": 8.180955778564122e-06, + "loss": 0.3964, + "step": 2154 + }, + { + "epoch": 0.3022440392706872, + "grad_norm": 1.6842580666232094, + "learning_rate": 8.1792031097762e-06, + "loss": 0.3527, + "step": 2155 + }, + { + "epoch": 0.30238429172510517, + "grad_norm": 2.6652135963082833, + "learning_rate": 8.177449784981927e-06, + "loss": 0.4207, + "step": 2156 + }, + { + "epoch": 0.3025245441795231, + "grad_norm": 1.9541580519930288, + "learning_rate": 8.175695804543093e-06, + "loss": 0.3816, + "step": 2157 + }, + { + "epoch": 0.3026647966339411, + "grad_norm": 1.9573398735060985, + "learning_rate": 8.173941168821615e-06, + "loss": 0.3649, + "step": 2158 + }, + { + "epoch": 0.30280504908835904, + "grad_norm": 1.8407808739765645, + "learning_rate": 8.172185878179553e-06, + "loss": 0.4146, + "step": 2159 + }, + { + "epoch": 0.302945301542777, + "grad_norm": 2.3113105395461866, + "learning_rate": 8.170429932979097e-06, + "loss": 0.398, + "step": 2160 + }, + { + "epoch": 0.30308555399719495, + "grad_norm": 3.030407660729817, + "learning_rate": 8.168673333582572e-06, + "loss": 0.3413, + "step": 2161 + }, + { + "epoch": 0.3032258064516129, + "grad_norm": 1.6785322607972446, + "learning_rate": 8.166916080352447e-06, + "loss": 0.3947, + "step": 2162 + }, + { + "epoch": 0.30336605890603086, + "grad_norm": 1.8131414133543353, + "learning_rate": 8.165158173651313e-06, + "loss": 0.4159, + "step": 2163 + }, + { + "epoch": 0.3035063113604488, + "grad_norm": 2.4342955041326495, + "learning_rate": 8.163399613841903e-06, + "loss": 0.4456, + "step": 2164 + }, + { + "epoch": 0.30364656381486677, + "grad_norm": 2.224569720550326, + "learning_rate": 8.161640401287084e-06, + "loss": 0.4068, + "step": 2165 + }, + { + "epoch": 0.3037868162692847, + "grad_norm": 1.893543607336071, + "learning_rate": 8.159880536349858e-06, + "loss": 0.3876, + "step": 2166 + }, + { + "epoch": 0.3039270687237027, + "grad_norm": 2.127696178101129, + "learning_rate": 8.15812001939336e-06, + "loss": 0.405, + "step": 2167 + }, + { + "epoch": 0.30406732117812063, + "grad_norm": 2.9861863316566577, + "learning_rate": 8.156358850780858e-06, + "loss": 0.342, + "step": 2168 + }, + { + "epoch": 0.3042075736325386, + "grad_norm": 2.193005636059241, + "learning_rate": 8.154597030875762e-06, + "loss": 0.433, + "step": 2169 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 2.3754694625250217, + "learning_rate": 8.152834560041607e-06, + "loss": 0.3775, + "step": 2170 + }, + { + "epoch": 0.3044880785413745, + "grad_norm": 2.673078477876989, + "learning_rate": 8.15107143864207e-06, + "loss": 0.3682, + "step": 2171 + }, + { + "epoch": 0.30462833099579245, + "grad_norm": 4.703118713983029, + "learning_rate": 8.149307667040954e-06, + "loss": 0.3452, + "step": 2172 + }, + { + "epoch": 0.3047685834502104, + "grad_norm": 2.277373796268076, + "learning_rate": 8.147543245602204e-06, + "loss": 0.4216, + "step": 2173 + }, + { + "epoch": 0.3049088359046283, + "grad_norm": 2.022083669091645, + "learning_rate": 8.145778174689897e-06, + "loss": 0.3599, + "step": 2174 + }, + { + "epoch": 0.30504908835904626, + "grad_norm": 2.2134787038188377, + "learning_rate": 8.144012454668241e-06, + "loss": 0.3841, + "step": 2175 + }, + { + "epoch": 0.3051893408134642, + "grad_norm": 2.2910215567977876, + "learning_rate": 8.142246085901581e-06, + "loss": 0.4187, + "step": 2176 + }, + { + "epoch": 0.30532959326788217, + "grad_norm": 2.451140457499104, + "learning_rate": 8.140479068754396e-06, + "loss": 0.3879, + "step": 2177 + }, + { + "epoch": 0.3054698457223001, + "grad_norm": 2.8276933372229385, + "learning_rate": 8.138711403591295e-06, + "loss": 0.3992, + "step": 2178 + }, + { + "epoch": 0.3056100981767181, + "grad_norm": 2.37082003124329, + "learning_rate": 8.136943090777025e-06, + "loss": 0.4225, + "step": 2179 + }, + { + "epoch": 0.30575035063113604, + "grad_norm": 2.1829502201529407, + "learning_rate": 8.135174130676464e-06, + "loss": 0.3544, + "step": 2180 + }, + { + "epoch": 0.305890603085554, + "grad_norm": 2.617295812386776, + "learning_rate": 8.133404523654626e-06, + "loss": 0.4068, + "step": 2181 + }, + { + "epoch": 0.30603085553997195, + "grad_norm": 2.0892847757488746, + "learning_rate": 8.131634270076657e-06, + "loss": 0.3957, + "step": 2182 + }, + { + "epoch": 0.3061711079943899, + "grad_norm": 2.2764629145868502, + "learning_rate": 8.129863370307833e-06, + "loss": 0.3807, + "step": 2183 + }, + { + "epoch": 0.30631136044880786, + "grad_norm": 2.8685846484698705, + "learning_rate": 8.128091824713571e-06, + "loss": 0.4161, + "step": 2184 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 2.1901891255692325, + "learning_rate": 8.126319633659416e-06, + "loss": 0.3775, + "step": 2185 + }, + { + "epoch": 0.30659186535764377, + "grad_norm": 2.3252623707387166, + "learning_rate": 8.124546797511046e-06, + "loss": 0.386, + "step": 2186 + }, + { + "epoch": 0.3067321178120617, + "grad_norm": 2.067826185222951, + "learning_rate": 8.122773316634276e-06, + "loss": 0.4171, + "step": 2187 + }, + { + "epoch": 0.3068723702664797, + "grad_norm": 2.724974248921923, + "learning_rate": 8.120999191395048e-06, + "loss": 0.4506, + "step": 2188 + }, + { + "epoch": 0.30701262272089763, + "grad_norm": 1.8373023621025466, + "learning_rate": 8.119224422159441e-06, + "loss": 0.3795, + "step": 2189 + }, + { + "epoch": 0.3071528751753156, + "grad_norm": 2.099796616832632, + "learning_rate": 8.117449009293668e-06, + "loss": 0.3593, + "step": 2190 + }, + { + "epoch": 0.30729312762973354, + "grad_norm": 2.260378320137247, + "learning_rate": 8.115672953164073e-06, + "loss": 0.3675, + "step": 2191 + }, + { + "epoch": 0.3074333800841515, + "grad_norm": 3.0535911680522076, + "learning_rate": 8.113896254137131e-06, + "loss": 0.4079, + "step": 2192 + }, + { + "epoch": 0.30757363253856945, + "grad_norm": 1.851037123944131, + "learning_rate": 8.112118912579452e-06, + "loss": 0.3492, + "step": 2193 + }, + { + "epoch": 0.30771388499298735, + "grad_norm": 2.0999790735564487, + "learning_rate": 8.110340928857779e-06, + "loss": 0.3966, + "step": 2194 + }, + { + "epoch": 0.3078541374474053, + "grad_norm": 2.2863134339690103, + "learning_rate": 8.108562303338987e-06, + "loss": 0.3676, + "step": 2195 + }, + { + "epoch": 0.30799438990182326, + "grad_norm": 1.616899252740245, + "learning_rate": 8.10678303639008e-06, + "loss": 0.3719, + "step": 2196 + }, + { + "epoch": 0.3081346423562412, + "grad_norm": 1.9026439516043294, + "learning_rate": 8.1050031283782e-06, + "loss": 0.3955, + "step": 2197 + }, + { + "epoch": 0.3082748948106592, + "grad_norm": 1.8766172428071146, + "learning_rate": 8.103222579670618e-06, + "loss": 0.3902, + "step": 2198 + }, + { + "epoch": 0.3084151472650771, + "grad_norm": 2.128621437549606, + "learning_rate": 8.101441390634736e-06, + "loss": 0.3842, + "step": 2199 + }, + { + "epoch": 0.3085553997194951, + "grad_norm": 1.5484493220519866, + "learning_rate": 8.099659561638092e-06, + "loss": 0.3689, + "step": 2200 + }, + { + "epoch": 0.30869565217391304, + "grad_norm": 2.0247993746614488, + "learning_rate": 8.097877093048354e-06, + "loss": 0.4129, + "step": 2201 + }, + { + "epoch": 0.308835904628331, + "grad_norm": 2.205177595464606, + "learning_rate": 8.096093985233323e-06, + "loss": 0.3824, + "step": 2202 + }, + { + "epoch": 0.30897615708274895, + "grad_norm": 2.5801998228123475, + "learning_rate": 8.094310238560926e-06, + "loss": 0.4089, + "step": 2203 + }, + { + "epoch": 0.3091164095371669, + "grad_norm": 3.9177762076473956, + "learning_rate": 8.092525853399231e-06, + "loss": 0.3898, + "step": 2204 + }, + { + "epoch": 0.30925666199158486, + "grad_norm": 3.272130591493574, + "learning_rate": 8.090740830116432e-06, + "loss": 0.4042, + "step": 2205 + }, + { + "epoch": 0.3093969144460028, + "grad_norm": 2.539198182222068, + "learning_rate": 8.088955169080856e-06, + "loss": 0.3859, + "step": 2206 + }, + { + "epoch": 0.30953716690042077, + "grad_norm": 1.7323134258262423, + "learning_rate": 8.087168870660964e-06, + "loss": 0.4205, + "step": 2207 + }, + { + "epoch": 0.3096774193548387, + "grad_norm": 2.1988005706367217, + "learning_rate": 8.085381935225342e-06, + "loss": 0.3865, + "step": 2208 + }, + { + "epoch": 0.3098176718092567, + "grad_norm": 2.3541928405206924, + "learning_rate": 8.083594363142717e-06, + "loss": 0.3139, + "step": 2209 + }, + { + "epoch": 0.30995792426367463, + "grad_norm": 2.485741197662744, + "learning_rate": 8.081806154781936e-06, + "loss": 0.4228, + "step": 2210 + }, + { + "epoch": 0.3100981767180926, + "grad_norm": 2.208808008197552, + "learning_rate": 8.080017310511987e-06, + "loss": 0.4026, + "step": 2211 + }, + { + "epoch": 0.31023842917251054, + "grad_norm": 1.7956333680427607, + "learning_rate": 8.078227830701985e-06, + "loss": 0.3569, + "step": 2212 + }, + { + "epoch": 0.3103786816269285, + "grad_norm": 2.288421541122139, + "learning_rate": 8.076437715721174e-06, + "loss": 0.4007, + "step": 2213 + }, + { + "epoch": 0.3105189340813464, + "grad_norm": 2.7860932773548823, + "learning_rate": 8.074646965938937e-06, + "loss": 0.4589, + "step": 2214 + }, + { + "epoch": 0.31065918653576435, + "grad_norm": 2.253996372900332, + "learning_rate": 8.072855581724778e-06, + "loss": 0.4302, + "step": 2215 + }, + { + "epoch": 0.3107994389901823, + "grad_norm": 1.9691610966734285, + "learning_rate": 8.071063563448341e-06, + "loss": 0.357, + "step": 2216 + }, + { + "epoch": 0.31093969144460026, + "grad_norm": 2.737957770685815, + "learning_rate": 8.06927091147939e-06, + "loss": 0.3727, + "step": 2217 + }, + { + "epoch": 0.3110799438990182, + "grad_norm": 1.9685142012576333, + "learning_rate": 8.067477626187831e-06, + "loss": 0.3663, + "step": 2218 + }, + { + "epoch": 0.3112201963534362, + "grad_norm": 2.0052910197330513, + "learning_rate": 8.065683707943696e-06, + "loss": 0.3606, + "step": 2219 + }, + { + "epoch": 0.31136044880785413, + "grad_norm": 2.5818417662838926, + "learning_rate": 8.063889157117148e-06, + "loss": 0.4069, + "step": 2220 + }, + { + "epoch": 0.3115007012622721, + "grad_norm": 1.9977260417778602, + "learning_rate": 8.062093974078478e-06, + "loss": 0.3314, + "step": 2221 + }, + { + "epoch": 0.31164095371669004, + "grad_norm": 1.8538974895181894, + "learning_rate": 8.060298159198107e-06, + "loss": 0.4012, + "step": 2222 + }, + { + "epoch": 0.311781206171108, + "grad_norm": 2.506972317762278, + "learning_rate": 8.058501712846594e-06, + "loss": 0.4327, + "step": 2223 + }, + { + "epoch": 0.31192145862552595, + "grad_norm": 2.1493888829241308, + "learning_rate": 8.056704635394621e-06, + "loss": 0.3717, + "step": 2224 + }, + { + "epoch": 0.3120617110799439, + "grad_norm": 2.0530731473323396, + "learning_rate": 8.054906927213e-06, + "loss": 0.3568, + "step": 2225 + }, + { + "epoch": 0.31220196353436186, + "grad_norm": 2.065786344427999, + "learning_rate": 8.05310858867268e-06, + "loss": 0.4181, + "step": 2226 + }, + { + "epoch": 0.3123422159887798, + "grad_norm": 2.300136922824164, + "learning_rate": 8.051309620144733e-06, + "loss": 0.3884, + "step": 2227 + }, + { + "epoch": 0.31248246844319777, + "grad_norm": 3.0152160683511178, + "learning_rate": 8.049510022000365e-06, + "loss": 0.4056, + "step": 2228 + }, + { + "epoch": 0.3126227208976157, + "grad_norm": 2.389163299689456, + "learning_rate": 8.047709794610907e-06, + "loss": 0.3686, + "step": 2229 + }, + { + "epoch": 0.3127629733520337, + "grad_norm": 3.321035364786867, + "learning_rate": 8.045908938347828e-06, + "loss": 0.3865, + "step": 2230 + }, + { + "epoch": 0.31290322580645163, + "grad_norm": 2.31740750421422, + "learning_rate": 8.04410745358272e-06, + "loss": 0.3923, + "step": 2231 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 2.8486348211982975, + "learning_rate": 8.042305340687307e-06, + "loss": 0.4217, + "step": 2232 + }, + { + "epoch": 0.31318373071528754, + "grad_norm": 2.1042025975347927, + "learning_rate": 8.040502600033441e-06, + "loss": 0.3795, + "step": 2233 + }, + { + "epoch": 0.31332398316970544, + "grad_norm": 2.285068503757385, + "learning_rate": 8.038699231993106e-06, + "loss": 0.3854, + "step": 2234 + }, + { + "epoch": 0.3134642356241234, + "grad_norm": 2.1754397769799563, + "learning_rate": 8.036895236938416e-06, + "loss": 0.3727, + "step": 2235 + }, + { + "epoch": 0.31360448807854135, + "grad_norm": 2.4114890594392433, + "learning_rate": 8.03509061524161e-06, + "loss": 0.3539, + "step": 2236 + }, + { + "epoch": 0.3137447405329593, + "grad_norm": 2.4342818902245975, + "learning_rate": 8.03328536727506e-06, + "loss": 0.4188, + "step": 2237 + }, + { + "epoch": 0.31388499298737726, + "grad_norm": 2.581267135600186, + "learning_rate": 8.031479493411268e-06, + "loss": 0.4197, + "step": 2238 + }, + { + "epoch": 0.3140252454417952, + "grad_norm": 2.1154362378790768, + "learning_rate": 8.029672994022861e-06, + "loss": 0.3966, + "step": 2239 + }, + { + "epoch": 0.3141654978962132, + "grad_norm": 2.1405316631824034, + "learning_rate": 8.027865869482599e-06, + "loss": 0.3859, + "step": 2240 + }, + { + "epoch": 0.31430575035063113, + "grad_norm": 2.423109072400933, + "learning_rate": 8.02605812016337e-06, + "loss": 0.3493, + "step": 2241 + }, + { + "epoch": 0.3144460028050491, + "grad_norm": 1.7392404031712216, + "learning_rate": 8.024249746438189e-06, + "loss": 0.3639, + "step": 2242 + }, + { + "epoch": 0.31458625525946704, + "grad_norm": 2.1875422882352202, + "learning_rate": 8.022440748680202e-06, + "loss": 0.3669, + "step": 2243 + }, + { + "epoch": 0.314726507713885, + "grad_norm": 1.991900793650069, + "learning_rate": 8.020631127262681e-06, + "loss": 0.3711, + "step": 2244 + }, + { + "epoch": 0.31486676016830295, + "grad_norm": 1.7238689833552583, + "learning_rate": 8.018820882559034e-06, + "loss": 0.3963, + "step": 2245 + }, + { + "epoch": 0.3150070126227209, + "grad_norm": 1.7066443941158096, + "learning_rate": 8.017010014942788e-06, + "loss": 0.3787, + "step": 2246 + }, + { + "epoch": 0.31514726507713886, + "grad_norm": 2.401744935372743, + "learning_rate": 8.015198524787603e-06, + "loss": 0.382, + "step": 2247 + }, + { + "epoch": 0.3152875175315568, + "grad_norm": 5.288497556550409, + "learning_rate": 8.013386412467268e-06, + "loss": 0.3587, + "step": 2248 + }, + { + "epoch": 0.31542776998597477, + "grad_norm": 1.9781105703879194, + "learning_rate": 8.0115736783557e-06, + "loss": 0.3705, + "step": 2249 + }, + { + "epoch": 0.3155680224403927, + "grad_norm": 2.7045237720281876, + "learning_rate": 8.009760322826945e-06, + "loss": 0.3735, + "step": 2250 + }, + { + "epoch": 0.3157082748948107, + "grad_norm": 2.23414664001861, + "learning_rate": 8.007946346255176e-06, + "loss": 0.4121, + "step": 2251 + }, + { + "epoch": 0.31584852734922864, + "grad_norm": 1.8496678311900048, + "learning_rate": 8.006131749014692e-06, + "loss": 0.4051, + "step": 2252 + }, + { + "epoch": 0.3159887798036466, + "grad_norm": 3.107255989984166, + "learning_rate": 8.004316531479924e-06, + "loss": 0.412, + "step": 2253 + }, + { + "epoch": 0.3161290322580645, + "grad_norm": 2.0664564389802367, + "learning_rate": 8.00250069402543e-06, + "loss": 0.3983, + "step": 2254 + }, + { + "epoch": 0.31626928471248245, + "grad_norm": 1.5957451094633568, + "learning_rate": 8.000684237025894e-06, + "loss": 0.3737, + "step": 2255 + }, + { + "epoch": 0.3164095371669004, + "grad_norm": 2.449441172646948, + "learning_rate": 7.998867160856133e-06, + "loss": 0.4479, + "step": 2256 + }, + { + "epoch": 0.31654978962131836, + "grad_norm": 3.034635403397775, + "learning_rate": 7.997049465891083e-06, + "loss": 0.3817, + "step": 2257 + }, + { + "epoch": 0.3166900420757363, + "grad_norm": 2.1371485192505832, + "learning_rate": 7.995231152505815e-06, + "loss": 0.4284, + "step": 2258 + }, + { + "epoch": 0.31683029453015427, + "grad_norm": 2.103018444247045, + "learning_rate": 7.993412221075525e-06, + "loss": 0.3598, + "step": 2259 + }, + { + "epoch": 0.3169705469845722, + "grad_norm": 2.0663157913801635, + "learning_rate": 7.991592671975536e-06, + "loss": 0.4007, + "step": 2260 + }, + { + "epoch": 0.3171107994389902, + "grad_norm": 4.100768152246181, + "learning_rate": 7.9897725055813e-06, + "loss": 0.3542, + "step": 2261 + }, + { + "epoch": 0.31725105189340813, + "grad_norm": 2.007098950602527, + "learning_rate": 7.987951722268399e-06, + "loss": 0.3738, + "step": 2262 + }, + { + "epoch": 0.3173913043478261, + "grad_norm": 2.696578195800087, + "learning_rate": 7.986130322412532e-06, + "loss": 0.3687, + "step": 2263 + }, + { + "epoch": 0.31753155680224404, + "grad_norm": 2.009144856753719, + "learning_rate": 7.984308306389536e-06, + "loss": 0.364, + "step": 2264 + }, + { + "epoch": 0.317671809256662, + "grad_norm": 4.901461262701729, + "learning_rate": 7.982485674575373e-06, + "loss": 0.373, + "step": 2265 + }, + { + "epoch": 0.31781206171107995, + "grad_norm": 1.7093962952230575, + "learning_rate": 7.980662427346127e-06, + "loss": 0.3988, + "step": 2266 + }, + { + "epoch": 0.3179523141654979, + "grad_norm": 2.874669760935338, + "learning_rate": 7.978838565078015e-06, + "loss": 0.3611, + "step": 2267 + }, + { + "epoch": 0.31809256661991586, + "grad_norm": 2.7640808449886296, + "learning_rate": 7.977014088147375e-06, + "loss": 0.4062, + "step": 2268 + }, + { + "epoch": 0.3182328190743338, + "grad_norm": 2.2714153092332197, + "learning_rate": 7.975188996930679e-06, + "loss": 0.3719, + "step": 2269 + }, + { + "epoch": 0.31837307152875177, + "grad_norm": 2.7586359194220056, + "learning_rate": 7.973363291804518e-06, + "loss": 0.4288, + "step": 2270 + }, + { + "epoch": 0.3185133239831697, + "grad_norm": 2.0566691426530688, + "learning_rate": 7.971536973145614e-06, + "loss": 0.4049, + "step": 2271 + }, + { + "epoch": 0.3186535764375877, + "grad_norm": 2.645378754580899, + "learning_rate": 7.96971004133082e-06, + "loss": 0.3915, + "step": 2272 + }, + { + "epoch": 0.31879382889200564, + "grad_norm": 2.3592871845399666, + "learning_rate": 7.967882496737106e-06, + "loss": 0.3707, + "step": 2273 + }, + { + "epoch": 0.31893408134642354, + "grad_norm": 2.4749599530611786, + "learning_rate": 7.966054339741573e-06, + "loss": 0.4039, + "step": 2274 + }, + { + "epoch": 0.3190743338008415, + "grad_norm": 1.8905120689775639, + "learning_rate": 7.96422557072145e-06, + "loss": 0.3938, + "step": 2275 + }, + { + "epoch": 0.31921458625525945, + "grad_norm": 2.676230466683171, + "learning_rate": 7.962396190054089e-06, + "loss": 0.3574, + "step": 2276 + }, + { + "epoch": 0.3193548387096774, + "grad_norm": 2.148687951488609, + "learning_rate": 7.960566198116973e-06, + "loss": 0.425, + "step": 2277 + }, + { + "epoch": 0.31949509116409536, + "grad_norm": 3.1232190970337275, + "learning_rate": 7.958735595287706e-06, + "loss": 0.345, + "step": 2278 + }, + { + "epoch": 0.3196353436185133, + "grad_norm": 2.2374484391904743, + "learning_rate": 7.95690438194402e-06, + "loss": 0.3673, + "step": 2279 + }, + { + "epoch": 0.31977559607293127, + "grad_norm": 2.1124972371641975, + "learning_rate": 7.955072558463772e-06, + "loss": 0.3808, + "step": 2280 + }, + { + "epoch": 0.3199158485273492, + "grad_norm": 2.0835221268623476, + "learning_rate": 7.953240125224948e-06, + "loss": 0.4209, + "step": 2281 + }, + { + "epoch": 0.3200561009817672, + "grad_norm": 2.007836961411668, + "learning_rate": 7.951407082605657e-06, + "loss": 0.365, + "step": 2282 + }, + { + "epoch": 0.32019635343618513, + "grad_norm": 2.4335629873030586, + "learning_rate": 7.949573430984137e-06, + "loss": 0.4528, + "step": 2283 + }, + { + "epoch": 0.3203366058906031, + "grad_norm": 2.7069682764872676, + "learning_rate": 7.947739170738744e-06, + "loss": 0.385, + "step": 2284 + }, + { + "epoch": 0.32047685834502104, + "grad_norm": 3.632064118773018, + "learning_rate": 7.945904302247968e-06, + "loss": 0.4091, + "step": 2285 + }, + { + "epoch": 0.320617110799439, + "grad_norm": 2.464407663111946, + "learning_rate": 7.944068825890424e-06, + "loss": 0.343, + "step": 2286 + }, + { + "epoch": 0.32075736325385695, + "grad_norm": 1.7060924890497633, + "learning_rate": 7.942232742044842e-06, + "loss": 0.3816, + "step": 2287 + }, + { + "epoch": 0.3208976157082749, + "grad_norm": 1.9806092342231936, + "learning_rate": 7.940396051090093e-06, + "loss": 0.3719, + "step": 2288 + }, + { + "epoch": 0.32103786816269286, + "grad_norm": 1.7567527458676493, + "learning_rate": 7.938558753405162e-06, + "loss": 0.4074, + "step": 2289 + }, + { + "epoch": 0.3211781206171108, + "grad_norm": 1.9619867321814028, + "learning_rate": 7.93672084936916e-06, + "loss": 0.4201, + "step": 2290 + }, + { + "epoch": 0.3213183730715288, + "grad_norm": 2.1708211194339726, + "learning_rate": 7.934882339361331e-06, + "loss": 0.3895, + "step": 2291 + }, + { + "epoch": 0.32145862552594673, + "grad_norm": 2.2776281790983575, + "learning_rate": 7.933043223761035e-06, + "loss": 0.3696, + "step": 2292 + }, + { + "epoch": 0.3215988779803647, + "grad_norm": 2.225626645086249, + "learning_rate": 7.931203502947762e-06, + "loss": 0.37, + "step": 2293 + }, + { + "epoch": 0.3217391304347826, + "grad_norm": 2.0233115374267334, + "learning_rate": 7.929363177301124e-06, + "loss": 0.3833, + "step": 2294 + }, + { + "epoch": 0.32187938288920054, + "grad_norm": 4.124759320260182, + "learning_rate": 7.927522247200864e-06, + "loss": 0.4233, + "step": 2295 + }, + { + "epoch": 0.3220196353436185, + "grad_norm": 1.6645099593483654, + "learning_rate": 7.925680713026837e-06, + "loss": 0.3589, + "step": 2296 + }, + { + "epoch": 0.32215988779803645, + "grad_norm": 2.6125153244518216, + "learning_rate": 7.923838575159038e-06, + "loss": 0.4116, + "step": 2297 + }, + { + "epoch": 0.3223001402524544, + "grad_norm": 2.5757111317396366, + "learning_rate": 7.921995833977575e-06, + "loss": 0.3671, + "step": 2298 + }, + { + "epoch": 0.32244039270687236, + "grad_norm": 2.8395032088202314, + "learning_rate": 7.920152489862687e-06, + "loss": 0.4082, + "step": 2299 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 2.3124470060307725, + "learning_rate": 7.918308543194735e-06, + "loss": 0.3825, + "step": 2300 + }, + { + "epoch": 0.32272089761570827, + "grad_norm": 6.590749916857166, + "learning_rate": 7.916463994354203e-06, + "loss": 0.3883, + "step": 2301 + }, + { + "epoch": 0.3228611500701262, + "grad_norm": 2.096803857815951, + "learning_rate": 7.914618843721704e-06, + "loss": 0.3728, + "step": 2302 + }, + { + "epoch": 0.3230014025245442, + "grad_norm": 2.4404516688498346, + "learning_rate": 7.912773091677968e-06, + "loss": 0.4176, + "step": 2303 + }, + { + "epoch": 0.32314165497896213, + "grad_norm": 2.3188365005028824, + "learning_rate": 7.910926738603855e-06, + "loss": 0.3491, + "step": 2304 + }, + { + "epoch": 0.3232819074333801, + "grad_norm": 2.428806059185959, + "learning_rate": 7.909079784880347e-06, + "loss": 0.3704, + "step": 2305 + }, + { + "epoch": 0.32342215988779804, + "grad_norm": 2.3858480160663675, + "learning_rate": 7.907232230888549e-06, + "loss": 0.3682, + "step": 2306 + }, + { + "epoch": 0.323562412342216, + "grad_norm": 2.6305744382168696, + "learning_rate": 7.905384077009693e-06, + "loss": 0.3828, + "step": 2307 + }, + { + "epoch": 0.32370266479663395, + "grad_norm": 1.9328989957854303, + "learning_rate": 7.90353532362513e-06, + "loss": 0.4115, + "step": 2308 + }, + { + "epoch": 0.3238429172510519, + "grad_norm": 2.29793965065034, + "learning_rate": 7.90168597111634e-06, + "loss": 0.3635, + "step": 2309 + }, + { + "epoch": 0.32398316970546986, + "grad_norm": 1.8021945000319568, + "learning_rate": 7.899836019864922e-06, + "loss": 0.3607, + "step": 2310 + }, + { + "epoch": 0.3241234221598878, + "grad_norm": 2.1721283469873813, + "learning_rate": 7.897985470252601e-06, + "loss": 0.3892, + "step": 2311 + }, + { + "epoch": 0.3242636746143058, + "grad_norm": 1.7412101159385331, + "learning_rate": 7.896134322661225e-06, + "loss": 0.3661, + "step": 2312 + }, + { + "epoch": 0.32440392706872373, + "grad_norm": 1.7566352123120275, + "learning_rate": 7.894282577472764e-06, + "loss": 0.3627, + "step": 2313 + }, + { + "epoch": 0.32454417952314163, + "grad_norm": 1.7557127894570657, + "learning_rate": 7.892430235069317e-06, + "loss": 0.3883, + "step": 2314 + }, + { + "epoch": 0.3246844319775596, + "grad_norm": 1.9306366474248546, + "learning_rate": 7.8905772958331e-06, + "loss": 0.3428, + "step": 2315 + }, + { + "epoch": 0.32482468443197754, + "grad_norm": 2.1882779509176644, + "learning_rate": 7.888723760146451e-06, + "loss": 0.359, + "step": 2316 + }, + { + "epoch": 0.3249649368863955, + "grad_norm": 1.8946200949689767, + "learning_rate": 7.886869628391835e-06, + "loss": 0.3839, + "step": 2317 + }, + { + "epoch": 0.32510518934081345, + "grad_norm": 1.9532042220261234, + "learning_rate": 7.885014900951842e-06, + "loss": 0.393, + "step": 2318 + }, + { + "epoch": 0.3252454417952314, + "grad_norm": 1.78474271229, + "learning_rate": 7.883159578209181e-06, + "loss": 0.3488, + "step": 2319 + }, + { + "epoch": 0.32538569424964936, + "grad_norm": 1.8161068249090417, + "learning_rate": 7.881303660546684e-06, + "loss": 0.3442, + "step": 2320 + }, + { + "epoch": 0.3255259467040673, + "grad_norm": 2.2887154273729875, + "learning_rate": 7.879447148347307e-06, + "loss": 0.3764, + "step": 2321 + }, + { + "epoch": 0.32566619915848527, + "grad_norm": 2.1621102571701, + "learning_rate": 7.877590041994128e-06, + "loss": 0.3973, + "step": 2322 + }, + { + "epoch": 0.3258064516129032, + "grad_norm": 2.3703114257488194, + "learning_rate": 7.875732341870349e-06, + "loss": 0.4186, + "step": 2323 + }, + { + "epoch": 0.3259467040673212, + "grad_norm": 2.213481719757349, + "learning_rate": 7.873874048359293e-06, + "loss": 0.4075, + "step": 2324 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 2.1171470722688244, + "learning_rate": 7.872015161844404e-06, + "loss": 0.3755, + "step": 2325 + }, + { + "epoch": 0.3262272089761571, + "grad_norm": 2.992194989944526, + "learning_rate": 7.870155682709253e-06, + "loss": 0.3333, + "step": 2326 + }, + { + "epoch": 0.32636746143057505, + "grad_norm": 2.5419015827950955, + "learning_rate": 7.868295611337529e-06, + "loss": 0.3672, + "step": 2327 + }, + { + "epoch": 0.326507713884993, + "grad_norm": 2.0264711946199756, + "learning_rate": 7.866434948113046e-06, + "loss": 0.3837, + "step": 2328 + }, + { + "epoch": 0.32664796633941096, + "grad_norm": 2.392167435256061, + "learning_rate": 7.864573693419736e-06, + "loss": 0.3492, + "step": 2329 + }, + { + "epoch": 0.3267882187938289, + "grad_norm": 2.6356328239804228, + "learning_rate": 7.86271184764166e-06, + "loss": 0.4173, + "step": 2330 + }, + { + "epoch": 0.32692847124824687, + "grad_norm": 2.710258552773532, + "learning_rate": 7.860849411162995e-06, + "loss": 0.3956, + "step": 2331 + }, + { + "epoch": 0.3270687237026648, + "grad_norm": 2.5767390369386733, + "learning_rate": 7.85898638436804e-06, + "loss": 0.3483, + "step": 2332 + }, + { + "epoch": 0.3272089761570828, + "grad_norm": 1.879747864474537, + "learning_rate": 7.857122767641218e-06, + "loss": 0.3355, + "step": 2333 + }, + { + "epoch": 0.3273492286115007, + "grad_norm": 2.936038851041767, + "learning_rate": 7.855258561367077e-06, + "loss": 0.3801, + "step": 2334 + }, + { + "epoch": 0.32748948106591863, + "grad_norm": 3.018427520018249, + "learning_rate": 7.853393765930279e-06, + "loss": 0.3858, + "step": 2335 + }, + { + "epoch": 0.3276297335203366, + "grad_norm": 2.141056657692691, + "learning_rate": 7.851528381715612e-06, + "loss": 0.4352, + "step": 2336 + }, + { + "epoch": 0.32776998597475454, + "grad_norm": 2.1496665074947843, + "learning_rate": 7.849662409107987e-06, + "loss": 0.3276, + "step": 2337 + }, + { + "epoch": 0.3279102384291725, + "grad_norm": 1.960200440998415, + "learning_rate": 7.847795848492432e-06, + "loss": 0.3958, + "step": 2338 + }, + { + "epoch": 0.32805049088359045, + "grad_norm": 2.8645196711084027, + "learning_rate": 7.845928700254101e-06, + "loss": 0.3691, + "step": 2339 + }, + { + "epoch": 0.3281907433380084, + "grad_norm": 2.1281922989341626, + "learning_rate": 7.844060964778264e-06, + "loss": 0.4168, + "step": 2340 + }, + { + "epoch": 0.32833099579242636, + "grad_norm": 1.9750802132457514, + "learning_rate": 7.842192642450319e-06, + "loss": 0.3629, + "step": 2341 + }, + { + "epoch": 0.3284712482468443, + "grad_norm": 2.8224021377019075, + "learning_rate": 7.84032373365578e-06, + "loss": 0.3811, + "step": 2342 + }, + { + "epoch": 0.32861150070126227, + "grad_norm": 2.608970839215009, + "learning_rate": 7.838454238780282e-06, + "loss": 0.3773, + "step": 2343 + }, + { + "epoch": 0.3287517531556802, + "grad_norm": 2.193014333459828, + "learning_rate": 7.836584158209581e-06, + "loss": 0.3571, + "step": 2344 + }, + { + "epoch": 0.3288920056100982, + "grad_norm": 6.386539860357833, + "learning_rate": 7.83471349232956e-06, + "loss": 0.3563, + "step": 2345 + }, + { + "epoch": 0.32903225806451614, + "grad_norm": 5.171146583546573, + "learning_rate": 7.832842241526212e-06, + "loss": 0.381, + "step": 2346 + }, + { + "epoch": 0.3291725105189341, + "grad_norm": 2.0223915018183787, + "learning_rate": 7.83097040618566e-06, + "loss": 0.3569, + "step": 2347 + }, + { + "epoch": 0.32931276297335205, + "grad_norm": 1.9955994951510179, + "learning_rate": 7.829097986694145e-06, + "loss": 0.4056, + "step": 2348 + }, + { + "epoch": 0.32945301542777, + "grad_norm": 2.2628873925471584, + "learning_rate": 7.827224983438024e-06, + "loss": 0.3753, + "step": 2349 + }, + { + "epoch": 0.32959326788218796, + "grad_norm": 2.3359155558190103, + "learning_rate": 7.825351396803783e-06, + "loss": 0.4256, + "step": 2350 + }, + { + "epoch": 0.3297335203366059, + "grad_norm": 2.09880594184664, + "learning_rate": 7.823477227178019e-06, + "loss": 0.4326, + "step": 2351 + }, + { + "epoch": 0.32987377279102387, + "grad_norm": 2.1746188608095944, + "learning_rate": 7.821602474947454e-06, + "loss": 0.3954, + "step": 2352 + }, + { + "epoch": 0.3300140252454418, + "grad_norm": 2.5494184840731022, + "learning_rate": 7.819727140498933e-06, + "loss": 0.3784, + "step": 2353 + }, + { + "epoch": 0.3301542776998597, + "grad_norm": 2.7994321076925286, + "learning_rate": 7.817851224219417e-06, + "loss": 0.4024, + "step": 2354 + }, + { + "epoch": 0.3302945301542777, + "grad_norm": 2.2650453023190544, + "learning_rate": 7.815974726495988e-06, + "loss": 0.3878, + "step": 2355 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 2.250966182527586, + "learning_rate": 7.814097647715848e-06, + "loss": 0.3697, + "step": 2356 + }, + { + "epoch": 0.3305750350631136, + "grad_norm": 2.2789900572402146, + "learning_rate": 7.812219988266318e-06, + "loss": 0.373, + "step": 2357 + }, + { + "epoch": 0.33071528751753154, + "grad_norm": 2.065844050117118, + "learning_rate": 7.810341748534843e-06, + "loss": 0.404, + "step": 2358 + }, + { + "epoch": 0.3308555399719495, + "grad_norm": 2.104844604156731, + "learning_rate": 7.808462928908982e-06, + "loss": 0.4121, + "step": 2359 + }, + { + "epoch": 0.33099579242636745, + "grad_norm": 2.038429600502812, + "learning_rate": 7.806583529776417e-06, + "loss": 0.3584, + "step": 2360 + }, + { + "epoch": 0.3311360448807854, + "grad_norm": 1.806893721714098, + "learning_rate": 7.804703551524948e-06, + "loss": 0.3586, + "step": 2361 + }, + { + "epoch": 0.33127629733520336, + "grad_norm": 2.1756104106024874, + "learning_rate": 7.802822994542498e-06, + "loss": 0.4208, + "step": 2362 + }, + { + "epoch": 0.3314165497896213, + "grad_norm": 1.8956128365981368, + "learning_rate": 7.800941859217103e-06, + "loss": 0.3645, + "step": 2363 + }, + { + "epoch": 0.3315568022440393, + "grad_norm": 1.9215483349079334, + "learning_rate": 7.799060145936928e-06, + "loss": 0.3531, + "step": 2364 + }, + { + "epoch": 0.3316970546984572, + "grad_norm": 2.000847279368834, + "learning_rate": 7.797177855090246e-06, + "loss": 0.3398, + "step": 2365 + }, + { + "epoch": 0.3318373071528752, + "grad_norm": 1.7520650532328665, + "learning_rate": 7.795294987065456e-06, + "loss": 0.3294, + "step": 2366 + }, + { + "epoch": 0.33197755960729314, + "grad_norm": 2.994069595639065, + "learning_rate": 7.793411542251074e-06, + "loss": 0.3965, + "step": 2367 + }, + { + "epoch": 0.3321178120617111, + "grad_norm": 2.7628920678213573, + "learning_rate": 7.791527521035736e-06, + "loss": 0.3472, + "step": 2368 + }, + { + "epoch": 0.33225806451612905, + "grad_norm": 2.155078625052337, + "learning_rate": 7.789642923808199e-06, + "loss": 0.3909, + "step": 2369 + }, + { + "epoch": 0.332398316970547, + "grad_norm": 2.462173296948808, + "learning_rate": 7.787757750957335e-06, + "loss": 0.367, + "step": 2370 + }, + { + "epoch": 0.33253856942496496, + "grad_norm": 5.262808660665833, + "learning_rate": 7.785872002872134e-06, + "loss": 0.3641, + "step": 2371 + }, + { + "epoch": 0.3326788218793829, + "grad_norm": 2.6289115100824976, + "learning_rate": 7.78398567994171e-06, + "loss": 0.3994, + "step": 2372 + }, + { + "epoch": 0.33281907433380087, + "grad_norm": 5.153026336040004, + "learning_rate": 7.78209878255529e-06, + "loss": 0.3913, + "step": 2373 + }, + { + "epoch": 0.33295932678821877, + "grad_norm": 1.9966613201500707, + "learning_rate": 7.780211311102226e-06, + "loss": 0.3873, + "step": 2374 + }, + { + "epoch": 0.3330995792426367, + "grad_norm": 2.1044308965480303, + "learning_rate": 7.77832326597198e-06, + "loss": 0.3905, + "step": 2375 + }, + { + "epoch": 0.3332398316970547, + "grad_norm": 2.6982361789654417, + "learning_rate": 7.77643464755414e-06, + "loss": 0.3305, + "step": 2376 + }, + { + "epoch": 0.33338008415147263, + "grad_norm": 2.3561274934284238, + "learning_rate": 7.77454545623841e-06, + "loss": 0.3842, + "step": 2377 + }, + { + "epoch": 0.3335203366058906, + "grad_norm": 2.310475674819604, + "learning_rate": 7.772655692414606e-06, + "loss": 0.4197, + "step": 2378 + }, + { + "epoch": 0.33366058906030854, + "grad_norm": 2.2470347680600873, + "learning_rate": 7.770765356472672e-06, + "loss": 0.373, + "step": 2379 + }, + { + "epoch": 0.3338008415147265, + "grad_norm": 2.0585568042892564, + "learning_rate": 7.768874448802665e-06, + "loss": 0.3725, + "step": 2380 + }, + { + "epoch": 0.33394109396914445, + "grad_norm": 2.0365224634660084, + "learning_rate": 7.766982969794762e-06, + "loss": 0.4236, + "step": 2381 + }, + { + "epoch": 0.3340813464235624, + "grad_norm": 2.223542852847889, + "learning_rate": 7.765090919839253e-06, + "loss": 0.3733, + "step": 2382 + }, + { + "epoch": 0.33422159887798036, + "grad_norm": 2.993459565602722, + "learning_rate": 7.763198299326553e-06, + "loss": 0.3895, + "step": 2383 + }, + { + "epoch": 0.3343618513323983, + "grad_norm": 1.876552701824441, + "learning_rate": 7.761305108647188e-06, + "loss": 0.4387, + "step": 2384 + }, + { + "epoch": 0.3345021037868163, + "grad_norm": 1.9082531644259433, + "learning_rate": 7.759411348191806e-06, + "loss": 0.4458, + "step": 2385 + }, + { + "epoch": 0.33464235624123423, + "grad_norm": 1.869581403749016, + "learning_rate": 7.75751701835117e-06, + "loss": 0.366, + "step": 2386 + }, + { + "epoch": 0.3347826086956522, + "grad_norm": 3.364589458889104, + "learning_rate": 7.755622119516163e-06, + "loss": 0.3282, + "step": 2387 + }, + { + "epoch": 0.33492286115007014, + "grad_norm": 3.0999076521869813, + "learning_rate": 7.753726652077787e-06, + "loss": 0.3788, + "step": 2388 + }, + { + "epoch": 0.3350631136044881, + "grad_norm": 1.8586692432338272, + "learning_rate": 7.751830616427151e-06, + "loss": 0.334, + "step": 2389 + }, + { + "epoch": 0.33520336605890605, + "grad_norm": 3.339386990193841, + "learning_rate": 7.749934012955497e-06, + "loss": 0.3783, + "step": 2390 + }, + { + "epoch": 0.335343618513324, + "grad_norm": 2.4828943602773106, + "learning_rate": 7.74803684205417e-06, + "loss": 0.4059, + "step": 2391 + }, + { + "epoch": 0.33548387096774196, + "grad_norm": 2.226278400700491, + "learning_rate": 7.74613910411464e-06, + "loss": 0.314, + "step": 2392 + }, + { + "epoch": 0.3356241234221599, + "grad_norm": 2.8409002088928093, + "learning_rate": 7.744240799528492e-06, + "loss": 0.3928, + "step": 2393 + }, + { + "epoch": 0.3357643758765778, + "grad_norm": 2.353856688040037, + "learning_rate": 7.742341928687427e-06, + "loss": 0.425, + "step": 2394 + }, + { + "epoch": 0.33590462833099577, + "grad_norm": 2.5053773269174315, + "learning_rate": 7.740442491983266e-06, + "loss": 0.3987, + "step": 2395 + }, + { + "epoch": 0.3360448807854137, + "grad_norm": 2.6549699279461416, + "learning_rate": 7.738542489807942e-06, + "loss": 0.3777, + "step": 2396 + }, + { + "epoch": 0.3361851332398317, + "grad_norm": 2.4223000430322665, + "learning_rate": 7.736641922553509e-06, + "loss": 0.4271, + "step": 2397 + }, + { + "epoch": 0.33632538569424963, + "grad_norm": 2.9602781180583158, + "learning_rate": 7.734740790612137e-06, + "loss": 0.3811, + "step": 2398 + }, + { + "epoch": 0.3364656381486676, + "grad_norm": 5.159054634965838, + "learning_rate": 7.732839094376106e-06, + "loss": 0.3788, + "step": 2399 + }, + { + "epoch": 0.33660589060308554, + "grad_norm": 4.063659737182079, + "learning_rate": 7.730936834237821e-06, + "loss": 0.3857, + "step": 2400 + }, + { + "epoch": 0.3367461430575035, + "grad_norm": 2.183506835954752, + "learning_rate": 7.7290340105898e-06, + "loss": 0.3857, + "step": 2401 + }, + { + "epoch": 0.33688639551192145, + "grad_norm": 2.299229459643657, + "learning_rate": 7.72713062382468e-06, + "loss": 0.4217, + "step": 2402 + }, + { + "epoch": 0.3370266479663394, + "grad_norm": 1.8967634441622792, + "learning_rate": 7.725226674335208e-06, + "loss": 0.3645, + "step": 2403 + }, + { + "epoch": 0.33716690042075736, + "grad_norm": 1.9791544328278008, + "learning_rate": 7.72332216251425e-06, + "loss": 0.4066, + "step": 2404 + }, + { + "epoch": 0.3373071528751753, + "grad_norm": 1.7195509691507262, + "learning_rate": 7.72141708875479e-06, + "loss": 0.3532, + "step": 2405 + }, + { + "epoch": 0.3374474053295933, + "grad_norm": 2.0027431510910088, + "learning_rate": 7.71951145344993e-06, + "loss": 0.3883, + "step": 2406 + }, + { + "epoch": 0.33758765778401123, + "grad_norm": 4.2355480188109444, + "learning_rate": 7.71760525699288e-06, + "loss": 0.3466, + "step": 2407 + }, + { + "epoch": 0.3377279102384292, + "grad_norm": 2.4843475532215202, + "learning_rate": 7.715698499776973e-06, + "loss": 0.3571, + "step": 2408 + }, + { + "epoch": 0.33786816269284714, + "grad_norm": 2.0882875156272913, + "learning_rate": 7.713791182195653e-06, + "loss": 0.3594, + "step": 2409 + }, + { + "epoch": 0.3380084151472651, + "grad_norm": 3.5033987436863625, + "learning_rate": 7.711883304642482e-06, + "loss": 0.3713, + "step": 2410 + }, + { + "epoch": 0.33814866760168305, + "grad_norm": 2.610094713560664, + "learning_rate": 7.709974867511139e-06, + "loss": 0.3516, + "step": 2411 + }, + { + "epoch": 0.338288920056101, + "grad_norm": 2.9864461213097147, + "learning_rate": 7.708065871195413e-06, + "loss": 0.3503, + "step": 2412 + }, + { + "epoch": 0.33842917251051896, + "grad_norm": 3.567500436484745, + "learning_rate": 7.706156316089218e-06, + "loss": 0.42, + "step": 2413 + }, + { + "epoch": 0.33856942496493686, + "grad_norm": 2.7769600003541455, + "learning_rate": 7.704246202586572e-06, + "loss": 0.3487, + "step": 2414 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 2.391267577932771, + "learning_rate": 7.702335531081616e-06, + "loss": 0.3733, + "step": 2415 + }, + { + "epoch": 0.33884992987377277, + "grad_norm": 3.2589702058584917, + "learning_rate": 7.700424301968603e-06, + "loss": 0.4378, + "step": 2416 + }, + { + "epoch": 0.3389901823281907, + "grad_norm": 2.8489535167218674, + "learning_rate": 7.698512515641903e-06, + "loss": 0.4128, + "step": 2417 + }, + { + "epoch": 0.3391304347826087, + "grad_norm": 3.465459826895792, + "learning_rate": 7.696600172495997e-06, + "loss": 0.3582, + "step": 2418 + }, + { + "epoch": 0.33927068723702664, + "grad_norm": 4.119207534499843, + "learning_rate": 7.694687272925487e-06, + "loss": 0.3811, + "step": 2419 + }, + { + "epoch": 0.3394109396914446, + "grad_norm": 2.8037132334923984, + "learning_rate": 7.692773817325082e-06, + "loss": 0.3816, + "step": 2420 + }, + { + "epoch": 0.33955119214586255, + "grad_norm": 2.214389007549896, + "learning_rate": 7.690859806089615e-06, + "loss": 0.4281, + "step": 2421 + }, + { + "epoch": 0.3396914446002805, + "grad_norm": 3.14416744425953, + "learning_rate": 7.688945239614027e-06, + "loss": 0.3599, + "step": 2422 + }, + { + "epoch": 0.33983169705469846, + "grad_norm": 3.3112959202843673, + "learning_rate": 7.687030118293375e-06, + "loss": 0.4218, + "step": 2423 + }, + { + "epoch": 0.3399719495091164, + "grad_norm": 1.9679367565599826, + "learning_rate": 7.685114442522831e-06, + "loss": 0.3866, + "step": 2424 + }, + { + "epoch": 0.34011220196353437, + "grad_norm": 2.8687340829445067, + "learning_rate": 7.683198212697682e-06, + "loss": 0.3192, + "step": 2425 + }, + { + "epoch": 0.3402524544179523, + "grad_norm": 2.141744947177933, + "learning_rate": 7.681281429213328e-06, + "loss": 0.3446, + "step": 2426 + }, + { + "epoch": 0.3403927068723703, + "grad_norm": 3.884461449960031, + "learning_rate": 7.679364092465282e-06, + "loss": 0.3409, + "step": 2427 + }, + { + "epoch": 0.34053295932678823, + "grad_norm": 3.252291678369861, + "learning_rate": 7.677446202849178e-06, + "loss": 0.4001, + "step": 2428 + }, + { + "epoch": 0.3406732117812062, + "grad_norm": 2.5693135740843074, + "learning_rate": 7.675527760760755e-06, + "loss": 0.3625, + "step": 2429 + }, + { + "epoch": 0.34081346423562414, + "grad_norm": 1.94779264765855, + "learning_rate": 7.67360876659587e-06, + "loss": 0.3817, + "step": 2430 + }, + { + "epoch": 0.3409537166900421, + "grad_norm": 2.9129512191758122, + "learning_rate": 7.671689220750497e-06, + "loss": 0.4316, + "step": 2431 + }, + { + "epoch": 0.34109396914446005, + "grad_norm": 2.414421277097682, + "learning_rate": 7.669769123620719e-06, + "loss": 0.3527, + "step": 2432 + }, + { + "epoch": 0.341234221598878, + "grad_norm": 2.5488487038201817, + "learning_rate": 7.667848475602735e-06, + "loss": 0.3615, + "step": 2433 + }, + { + "epoch": 0.3413744740532959, + "grad_norm": 2.1510068753610114, + "learning_rate": 7.665927277092855e-06, + "loss": 0.3252, + "step": 2434 + }, + { + "epoch": 0.34151472650771386, + "grad_norm": 4.5114987010055545, + "learning_rate": 7.664005528487508e-06, + "loss": 0.3551, + "step": 2435 + }, + { + "epoch": 0.3416549789621318, + "grad_norm": 2.224840428491665, + "learning_rate": 7.662083230183234e-06, + "loss": 0.4257, + "step": 2436 + }, + { + "epoch": 0.34179523141654977, + "grad_norm": 2.1687518380896615, + "learning_rate": 7.660160382576683e-06, + "loss": 0.3622, + "step": 2437 + }, + { + "epoch": 0.3419354838709677, + "grad_norm": 2.0700605131140737, + "learning_rate": 7.658236986064624e-06, + "loss": 0.4092, + "step": 2438 + }, + { + "epoch": 0.3420757363253857, + "grad_norm": 2.952289281394626, + "learning_rate": 7.656313041043934e-06, + "loss": 0.3376, + "step": 2439 + }, + { + "epoch": 0.34221598877980364, + "grad_norm": 2.1084519238598425, + "learning_rate": 7.654388547911605e-06, + "loss": 0.3996, + "step": 2440 + }, + { + "epoch": 0.3423562412342216, + "grad_norm": 2.488196546708823, + "learning_rate": 7.652463507064745e-06, + "loss": 0.3486, + "step": 2441 + }, + { + "epoch": 0.34249649368863955, + "grad_norm": 2.6391789014858107, + "learning_rate": 7.650537918900573e-06, + "loss": 0.3781, + "step": 2442 + }, + { + "epoch": 0.3426367461430575, + "grad_norm": 2.290251647582189, + "learning_rate": 7.648611783816417e-06, + "loss": 0.3674, + "step": 2443 + }, + { + "epoch": 0.34277699859747546, + "grad_norm": 2.127596222867731, + "learning_rate": 7.646685102209726e-06, + "loss": 0.4083, + "step": 2444 + }, + { + "epoch": 0.3429172510518934, + "grad_norm": 3.774445519556818, + "learning_rate": 7.644757874478056e-06, + "loss": 0.3805, + "step": 2445 + }, + { + "epoch": 0.34305750350631137, + "grad_norm": 2.7730142619677265, + "learning_rate": 7.642830101019075e-06, + "loss": 0.4198, + "step": 2446 + }, + { + "epoch": 0.3431977559607293, + "grad_norm": 2.4049010025237414, + "learning_rate": 7.640901782230567e-06, + "loss": 0.3435, + "step": 2447 + }, + { + "epoch": 0.3433380084151473, + "grad_norm": 3.2781681665245817, + "learning_rate": 7.638972918510428e-06, + "loss": 0.3723, + "step": 2448 + }, + { + "epoch": 0.34347826086956523, + "grad_norm": 2.103059823782007, + "learning_rate": 7.637043510256663e-06, + "loss": 0.3603, + "step": 2449 + }, + { + "epoch": 0.3436185133239832, + "grad_norm": 2.5055351969344577, + "learning_rate": 7.635113557867395e-06, + "loss": 0.36, + "step": 2450 + }, + { + "epoch": 0.34375876577840114, + "grad_norm": 2.0768178399640798, + "learning_rate": 7.633183061740853e-06, + "loss": 0.4151, + "step": 2451 + }, + { + "epoch": 0.3438990182328191, + "grad_norm": 3.3709916224210077, + "learning_rate": 7.631252022275386e-06, + "loss": 0.4023, + "step": 2452 + }, + { + "epoch": 0.34403927068723705, + "grad_norm": 4.505497647148638, + "learning_rate": 7.6293204398694455e-06, + "loss": 0.3903, + "step": 2453 + }, + { + "epoch": 0.34417952314165495, + "grad_norm": 2.319049427204839, + "learning_rate": 7.627388314921602e-06, + "loss": 0.3493, + "step": 2454 + }, + { + "epoch": 0.3443197755960729, + "grad_norm": 1.861841313165259, + "learning_rate": 7.625455647830537e-06, + "loss": 0.3726, + "step": 2455 + }, + { + "epoch": 0.34446002805049086, + "grad_norm": 2.9880319927703813, + "learning_rate": 7.62352243899504e-06, + "loss": 0.3716, + "step": 2456 + }, + { + "epoch": 0.3446002805049088, + "grad_norm": 2.859842230973515, + "learning_rate": 7.621588688814019e-06, + "loss": 0.4044, + "step": 2457 + }, + { + "epoch": 0.3447405329593268, + "grad_norm": 3.1532608834694837, + "learning_rate": 7.619654397686488e-06, + "loss": 0.3611, + "step": 2458 + }, + { + "epoch": 0.34488078541374473, + "grad_norm": 2.467739429549011, + "learning_rate": 7.617719566011575e-06, + "loss": 0.378, + "step": 2459 + }, + { + "epoch": 0.3450210378681627, + "grad_norm": 2.0329663581297535, + "learning_rate": 7.615784194188516e-06, + "loss": 0.3856, + "step": 2460 + }, + { + "epoch": 0.34516129032258064, + "grad_norm": 1.9632030636952378, + "learning_rate": 7.613848282616665e-06, + "loss": 0.3656, + "step": 2461 + }, + { + "epoch": 0.3453015427769986, + "grad_norm": 2.4892160525135836, + "learning_rate": 7.611911831695482e-06, + "loss": 0.3964, + "step": 2462 + }, + { + "epoch": 0.34544179523141655, + "grad_norm": 2.1418058382327154, + "learning_rate": 7.609974841824543e-06, + "loss": 0.3911, + "step": 2463 + }, + { + "epoch": 0.3455820476858345, + "grad_norm": 2.8403491130680476, + "learning_rate": 7.608037313403529e-06, + "loss": 0.3763, + "step": 2464 + }, + { + "epoch": 0.34572230014025246, + "grad_norm": 3.4438748982621843, + "learning_rate": 7.606099246832234e-06, + "loss": 0.3976, + "step": 2465 + }, + { + "epoch": 0.3458625525946704, + "grad_norm": 2.3548973997407328, + "learning_rate": 7.60416064251057e-06, + "loss": 0.3882, + "step": 2466 + }, + { + "epoch": 0.34600280504908837, + "grad_norm": 1.840302428394339, + "learning_rate": 7.602221500838553e-06, + "loss": 0.4043, + "step": 2467 + }, + { + "epoch": 0.3461430575035063, + "grad_norm": 2.402767286724887, + "learning_rate": 7.600281822216307e-06, + "loss": 0.3924, + "step": 2468 + }, + { + "epoch": 0.3462833099579243, + "grad_norm": 2.2058958120496452, + "learning_rate": 7.598341607044075e-06, + "loss": 0.3676, + "step": 2469 + }, + { + "epoch": 0.34642356241234223, + "grad_norm": 3.0197888026377973, + "learning_rate": 7.596400855722206e-06, + "loss": 0.4034, + "step": 2470 + }, + { + "epoch": 0.3465638148667602, + "grad_norm": 3.4526554668760707, + "learning_rate": 7.594459568651159e-06, + "loss": 0.4048, + "step": 2471 + }, + { + "epoch": 0.34670406732117814, + "grad_norm": 2.188374371847153, + "learning_rate": 7.592517746231507e-06, + "loss": 0.3253, + "step": 2472 + }, + { + "epoch": 0.3468443197755961, + "grad_norm": 1.8931366182048215, + "learning_rate": 7.590575388863932e-06, + "loss": 0.3797, + "step": 2473 + }, + { + "epoch": 0.346984572230014, + "grad_norm": 2.881291678369242, + "learning_rate": 7.588632496949223e-06, + "loss": 0.4079, + "step": 2474 + }, + { + "epoch": 0.34712482468443195, + "grad_norm": 2.4911064266592207, + "learning_rate": 7.586689070888284e-06, + "loss": 0.3704, + "step": 2475 + }, + { + "epoch": 0.3472650771388499, + "grad_norm": 2.190078741251853, + "learning_rate": 7.584745111082128e-06, + "loss": 0.4035, + "step": 2476 + }, + { + "epoch": 0.34740532959326786, + "grad_norm": 1.7503280332203313, + "learning_rate": 7.582800617931876e-06, + "loss": 0.3706, + "step": 2477 + }, + { + "epoch": 0.3475455820476858, + "grad_norm": 2.190366882702773, + "learning_rate": 7.580855591838763e-06, + "loss": 0.4068, + "step": 2478 + }, + { + "epoch": 0.3476858345021038, + "grad_norm": 2.887774902733068, + "learning_rate": 7.578910033204129e-06, + "loss": 0.3917, + "step": 2479 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 2.0859305278075775, + "learning_rate": 7.576963942429427e-06, + "loss": 0.4256, + "step": 2480 + }, + { + "epoch": 0.3479663394109397, + "grad_norm": 2.963583214618868, + "learning_rate": 7.5750173199162204e-06, + "loss": 0.4112, + "step": 2481 + }, + { + "epoch": 0.34810659186535764, + "grad_norm": 2.2355417259859562, + "learning_rate": 7.5730701660661795e-06, + "loss": 0.4027, + "step": 2482 + }, + { + "epoch": 0.3482468443197756, + "grad_norm": 2.409134857045145, + "learning_rate": 7.571122481281088e-06, + "loss": 0.3822, + "step": 2483 + }, + { + "epoch": 0.34838709677419355, + "grad_norm": 2.283823834474823, + "learning_rate": 7.569174265962834e-06, + "loss": 0.3528, + "step": 2484 + }, + { + "epoch": 0.3485273492286115, + "grad_norm": 2.3789784086962555, + "learning_rate": 7.567225520513422e-06, + "loss": 0.3977, + "step": 2485 + }, + { + "epoch": 0.34866760168302946, + "grad_norm": 2.490071034777873, + "learning_rate": 7.565276245334957e-06, + "loss": 0.3935, + "step": 2486 + }, + { + "epoch": 0.3488078541374474, + "grad_norm": 1.9318882674493174, + "learning_rate": 7.563326440829662e-06, + "loss": 0.3977, + "step": 2487 + }, + { + "epoch": 0.34894810659186537, + "grad_norm": 2.3107523113380735, + "learning_rate": 7.561376107399867e-06, + "loss": 0.4, + "step": 2488 + }, + { + "epoch": 0.3490883590462833, + "grad_norm": 2.1177477781491043, + "learning_rate": 7.559425245448006e-06, + "loss": 0.3658, + "step": 2489 + }, + { + "epoch": 0.3492286115007013, + "grad_norm": 2.581725409742212, + "learning_rate": 7.557473855376627e-06, + "loss": 0.379, + "step": 2490 + }, + { + "epoch": 0.34936886395511924, + "grad_norm": 2.2414967606074563, + "learning_rate": 7.555521937588386e-06, + "loss": 0.3653, + "step": 2491 + }, + { + "epoch": 0.3495091164095372, + "grad_norm": 2.2774972225972063, + "learning_rate": 7.553569492486048e-06, + "loss": 0.3768, + "step": 2492 + }, + { + "epoch": 0.34964936886395515, + "grad_norm": 3.2972873231038076, + "learning_rate": 7.551616520472485e-06, + "loss": 0.3989, + "step": 2493 + }, + { + "epoch": 0.34978962131837305, + "grad_norm": 1.9989039875042462, + "learning_rate": 7.5496630219506805e-06, + "loss": 0.4047, + "step": 2494 + }, + { + "epoch": 0.349929873772791, + "grad_norm": 2.494488362541893, + "learning_rate": 7.547708997323724e-06, + "loss": 0.3927, + "step": 2495 + }, + { + "epoch": 0.35007012622720896, + "grad_norm": 2.2986865522663305, + "learning_rate": 7.5457544469948164e-06, + "loss": 0.3916, + "step": 2496 + }, + { + "epoch": 0.3502103786816269, + "grad_norm": 2.8952696729643357, + "learning_rate": 7.543799371367264e-06, + "loss": 0.3776, + "step": 2497 + }, + { + "epoch": 0.35035063113604487, + "grad_norm": 3.6559564195452015, + "learning_rate": 7.541843770844486e-06, + "loss": 0.3756, + "step": 2498 + }, + { + "epoch": 0.3504908835904628, + "grad_norm": 3.529219862451816, + "learning_rate": 7.539887645830002e-06, + "loss": 0.3758, + "step": 2499 + }, + { + "epoch": 0.3506311360448808, + "grad_norm": 1.9723541330570262, + "learning_rate": 7.537930996727448e-06, + "loss": 0.3996, + "step": 2500 + }, + { + "epoch": 0.35077138849929873, + "grad_norm": 2.571584647603979, + "learning_rate": 7.535973823940566e-06, + "loss": 0.3423, + "step": 2501 + }, + { + "epoch": 0.3509116409537167, + "grad_norm": 2.3237401645856504, + "learning_rate": 7.5340161278732e-06, + "loss": 0.4229, + "step": 2502 + }, + { + "epoch": 0.35105189340813464, + "grad_norm": 2.8951128788421694, + "learning_rate": 7.532057908929311e-06, + "loss": 0.3902, + "step": 2503 + }, + { + "epoch": 0.3511921458625526, + "grad_norm": 2.360675983600532, + "learning_rate": 7.530099167512965e-06, + "loss": 0.4054, + "step": 2504 + }, + { + "epoch": 0.35133239831697055, + "grad_norm": 2.5494885286984696, + "learning_rate": 7.528139904028331e-06, + "loss": 0.4239, + "step": 2505 + }, + { + "epoch": 0.3514726507713885, + "grad_norm": 2.5976076049839043, + "learning_rate": 7.5261801188796904e-06, + "loss": 0.394, + "step": 2506 + }, + { + "epoch": 0.35161290322580646, + "grad_norm": 3.608933458940407, + "learning_rate": 7.524219812471432e-06, + "loss": 0.3978, + "step": 2507 + }, + { + "epoch": 0.3517531556802244, + "grad_norm": 2.094553551275927, + "learning_rate": 7.5222589852080505e-06, + "loss": 0.4001, + "step": 2508 + }, + { + "epoch": 0.35189340813464237, + "grad_norm": 2.2619169211001062, + "learning_rate": 7.520297637494149e-06, + "loss": 0.4428, + "step": 2509 + }, + { + "epoch": 0.3520336605890603, + "grad_norm": 1.9980056475043073, + "learning_rate": 7.5183357697344395e-06, + "loss": 0.3501, + "step": 2510 + }, + { + "epoch": 0.3521739130434783, + "grad_norm": 2.8890903007362208, + "learning_rate": 7.516373382333737e-06, + "loss": 0.3937, + "step": 2511 + }, + { + "epoch": 0.35231416549789624, + "grad_norm": 5.590624228957727, + "learning_rate": 7.51441047569697e-06, + "loss": 0.3872, + "step": 2512 + }, + { + "epoch": 0.3524544179523142, + "grad_norm": 2.074817520758344, + "learning_rate": 7.512447050229166e-06, + "loss": 0.4052, + "step": 2513 + }, + { + "epoch": 0.3525946704067321, + "grad_norm": 2.200763708551853, + "learning_rate": 7.510483106335468e-06, + "loss": 0.3893, + "step": 2514 + }, + { + "epoch": 0.35273492286115005, + "grad_norm": 2.684573509683436, + "learning_rate": 7.508518644421119e-06, + "loss": 0.3639, + "step": 2515 + }, + { + "epoch": 0.352875175315568, + "grad_norm": 2.3800293725068524, + "learning_rate": 7.506553664891475e-06, + "loss": 0.3835, + "step": 2516 + }, + { + "epoch": 0.35301542776998596, + "grad_norm": 2.1144641660990136, + "learning_rate": 7.504588168151994e-06, + "loss": 0.3707, + "step": 2517 + }, + { + "epoch": 0.3531556802244039, + "grad_norm": 2.574608734181712, + "learning_rate": 7.502622154608243e-06, + "loss": 0.3698, + "step": 2518 + }, + { + "epoch": 0.35329593267882187, + "grad_norm": 2.7953633746767834, + "learning_rate": 7.500655624665896e-06, + "loss": 0.389, + "step": 2519 + }, + { + "epoch": 0.3534361851332398, + "grad_norm": 2.5719559352001125, + "learning_rate": 7.498688578730731e-06, + "loss": 0.3512, + "step": 2520 + }, + { + "epoch": 0.3535764375876578, + "grad_norm": 3.540543606286348, + "learning_rate": 7.496721017208634e-06, + "loss": 0.3497, + "step": 2521 + }, + { + "epoch": 0.35371669004207573, + "grad_norm": 2.952888600409061, + "learning_rate": 7.4947529405056005e-06, + "loss": 0.3782, + "step": 2522 + }, + { + "epoch": 0.3538569424964937, + "grad_norm": 2.6493845458869996, + "learning_rate": 7.492784349027726e-06, + "loss": 0.4001, + "step": 2523 + }, + { + "epoch": 0.35399719495091164, + "grad_norm": 2.7959957339045514, + "learning_rate": 7.4908152431812175e-06, + "loss": 0.4351, + "step": 2524 + }, + { + "epoch": 0.3541374474053296, + "grad_norm": 2.281654740054621, + "learning_rate": 7.488845623372386e-06, + "loss": 0.4025, + "step": 2525 + }, + { + "epoch": 0.35427769985974755, + "grad_norm": 3.563813486058933, + "learning_rate": 7.486875490007648e-06, + "loss": 0.3807, + "step": 2526 + }, + { + "epoch": 0.3544179523141655, + "grad_norm": 3.1964825490913458, + "learning_rate": 7.484904843493528e-06, + "loss": 0.36, + "step": 2527 + }, + { + "epoch": 0.35455820476858346, + "grad_norm": 4.18792380850054, + "learning_rate": 7.482933684236654e-06, + "loss": 0.3721, + "step": 2528 + }, + { + "epoch": 0.3546984572230014, + "grad_norm": 4.316806577872319, + "learning_rate": 7.480962012643762e-06, + "loss": 0.4127, + "step": 2529 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 2.639916138617495, + "learning_rate": 7.478989829121691e-06, + "loss": 0.4203, + "step": 2530 + }, + { + "epoch": 0.35497896213183733, + "grad_norm": 4.334808807556717, + "learning_rate": 7.477017134077389e-06, + "loss": 0.3761, + "step": 2531 + }, + { + "epoch": 0.3551192145862553, + "grad_norm": 2.9008196889075526, + "learning_rate": 7.475043927917908e-06, + "loss": 0.3883, + "step": 2532 + }, + { + "epoch": 0.35525946704067324, + "grad_norm": 2.070973071392494, + "learning_rate": 7.473070211050404e-06, + "loss": 0.3827, + "step": 2533 + }, + { + "epoch": 0.35539971949509114, + "grad_norm": 2.356417286244183, + "learning_rate": 7.47109598388214e-06, + "loss": 0.4041, + "step": 2534 + }, + { + "epoch": 0.3555399719495091, + "grad_norm": 2.879972718427506, + "learning_rate": 7.469121246820483e-06, + "loss": 0.363, + "step": 2535 + }, + { + "epoch": 0.35568022440392705, + "grad_norm": 2.394098189804574, + "learning_rate": 7.467146000272909e-06, + "loss": 0.3474, + "step": 2536 + }, + { + "epoch": 0.355820476858345, + "grad_norm": 4.893085092558933, + "learning_rate": 7.4651702446469944e-06, + "loss": 0.371, + "step": 2537 + }, + { + "epoch": 0.35596072931276296, + "grad_norm": 2.3821635378758743, + "learning_rate": 7.4631939803504215e-06, + "loss": 0.3941, + "step": 2538 + }, + { + "epoch": 0.3561009817671809, + "grad_norm": 2.992127420404914, + "learning_rate": 7.4612172077909815e-06, + "loss": 0.3235, + "step": 2539 + }, + { + "epoch": 0.35624123422159887, + "grad_norm": 2.1565393447240924, + "learning_rate": 7.459239927376566e-06, + "loss": 0.4045, + "step": 2540 + }, + { + "epoch": 0.3563814866760168, + "grad_norm": 3.4030323755078915, + "learning_rate": 7.457262139515172e-06, + "loss": 0.4094, + "step": 2541 + }, + { + "epoch": 0.3565217391304348, + "grad_norm": 2.5291703711728193, + "learning_rate": 7.455283844614906e-06, + "loss": 0.4277, + "step": 2542 + }, + { + "epoch": 0.35666199158485273, + "grad_norm": 2.4378799729190153, + "learning_rate": 7.453305043083969e-06, + "loss": 0.3948, + "step": 2543 + }, + { + "epoch": 0.3568022440392707, + "grad_norm": 2.3756464028341484, + "learning_rate": 7.451325735330679e-06, + "loss": 0.3698, + "step": 2544 + }, + { + "epoch": 0.35694249649368864, + "grad_norm": 2.795381797413467, + "learning_rate": 7.449345921763449e-06, + "loss": 0.3866, + "step": 2545 + }, + { + "epoch": 0.3570827489481066, + "grad_norm": 2.75375785698378, + "learning_rate": 7.4473656027908005e-06, + "loss": 0.4009, + "step": 2546 + }, + { + "epoch": 0.35722300140252455, + "grad_norm": 2.2805925231778916, + "learning_rate": 7.445384778821358e-06, + "loss": 0.3339, + "step": 2547 + }, + { + "epoch": 0.3573632538569425, + "grad_norm": 1.807017881852776, + "learning_rate": 7.443403450263852e-06, + "loss": 0.3592, + "step": 2548 + }, + { + "epoch": 0.35750350631136046, + "grad_norm": 2.623471950647878, + "learning_rate": 7.441421617527116e-06, + "loss": 0.3805, + "step": 2549 + }, + { + "epoch": 0.3576437587657784, + "grad_norm": 4.469224811218161, + "learning_rate": 7.439439281020085e-06, + "loss": 0.3692, + "step": 2550 + }, + { + "epoch": 0.3577840112201964, + "grad_norm": 2.238702278416659, + "learning_rate": 7.4374564411518e-06, + "loss": 0.383, + "step": 2551 + }, + { + "epoch": 0.35792426367461433, + "grad_norm": 2.512817901692813, + "learning_rate": 7.435473098331411e-06, + "loss": 0.3578, + "step": 2552 + }, + { + "epoch": 0.3580645161290323, + "grad_norm": 2.3466446358376287, + "learning_rate": 7.4334892529681625e-06, + "loss": 0.3449, + "step": 2553 + }, + { + "epoch": 0.3582047685834502, + "grad_norm": 2.1224095880417666, + "learning_rate": 7.431504905471407e-06, + "loss": 0.3974, + "step": 2554 + }, + { + "epoch": 0.35834502103786814, + "grad_norm": 3.0151948770327226, + "learning_rate": 7.4295200562506045e-06, + "loss": 0.4193, + "step": 2555 + }, + { + "epoch": 0.3584852734922861, + "grad_norm": 2.840051278739372, + "learning_rate": 7.427534705715311e-06, + "loss": 0.3618, + "step": 2556 + }, + { + "epoch": 0.35862552594670405, + "grad_norm": 1.8249205193130171, + "learning_rate": 7.425548854275191e-06, + "loss": 0.3699, + "step": 2557 + }, + { + "epoch": 0.358765778401122, + "grad_norm": 2.06784301882875, + "learning_rate": 7.42356250234001e-06, + "loss": 0.3345, + "step": 2558 + }, + { + "epoch": 0.35890603085553996, + "grad_norm": 2.629152283626657, + "learning_rate": 7.421575650319641e-06, + "loss": 0.3969, + "step": 2559 + }, + { + "epoch": 0.3590462833099579, + "grad_norm": 2.129968892438941, + "learning_rate": 7.419588298624054e-06, + "loss": 0.3352, + "step": 2560 + }, + { + "epoch": 0.35918653576437587, + "grad_norm": 2.109145653583698, + "learning_rate": 7.417600447663327e-06, + "loss": 0.3965, + "step": 2561 + }, + { + "epoch": 0.3593267882187938, + "grad_norm": 1.9931339783105941, + "learning_rate": 7.415612097847638e-06, + "loss": 0.3801, + "step": 2562 + }, + { + "epoch": 0.3594670406732118, + "grad_norm": 1.755883476592616, + "learning_rate": 7.4136232495872695e-06, + "loss": 0.3474, + "step": 2563 + }, + { + "epoch": 0.35960729312762973, + "grad_norm": 1.6746028144553087, + "learning_rate": 7.411633903292605e-06, + "loss": 0.3849, + "step": 2564 + }, + { + "epoch": 0.3597475455820477, + "grad_norm": 2.0947921209146725, + "learning_rate": 7.409644059374136e-06, + "loss": 0.424, + "step": 2565 + }, + { + "epoch": 0.35988779803646564, + "grad_norm": 2.283381889922185, + "learning_rate": 7.407653718242449e-06, + "loss": 0.3869, + "step": 2566 + }, + { + "epoch": 0.3600280504908836, + "grad_norm": 2.5098415259376528, + "learning_rate": 7.405662880308239e-06, + "loss": 0.4101, + "step": 2567 + }, + { + "epoch": 0.36016830294530155, + "grad_norm": 3.4601683440075672, + "learning_rate": 7.403671545982299e-06, + "loss": 0.3592, + "step": 2568 + }, + { + "epoch": 0.3603085553997195, + "grad_norm": 2.3954757644742712, + "learning_rate": 7.401679715675531e-06, + "loss": 0.3753, + "step": 2569 + }, + { + "epoch": 0.36044880785413747, + "grad_norm": 1.9883600905100367, + "learning_rate": 7.399687389798933e-06, + "loss": 0.3435, + "step": 2570 + }, + { + "epoch": 0.3605890603085554, + "grad_norm": 2.639416389836353, + "learning_rate": 7.397694568763607e-06, + "loss": 0.3902, + "step": 2571 + }, + { + "epoch": 0.3607293127629734, + "grad_norm": 2.083128906392836, + "learning_rate": 7.395701252980758e-06, + "loss": 0.3524, + "step": 2572 + }, + { + "epoch": 0.36086956521739133, + "grad_norm": 2.540861179311404, + "learning_rate": 7.393707442861693e-06, + "loss": 0.3644, + "step": 2573 + }, + { + "epoch": 0.36100981767180923, + "grad_norm": 2.5971928003812685, + "learning_rate": 7.391713138817822e-06, + "loss": 0.4045, + "step": 2574 + }, + { + "epoch": 0.3611500701262272, + "grad_norm": 2.586675348496697, + "learning_rate": 7.389718341260654e-06, + "loss": 0.388, + "step": 2575 + }, + { + "epoch": 0.36129032258064514, + "grad_norm": 2.08230629083009, + "learning_rate": 7.387723050601804e-06, + "loss": 0.372, + "step": 2576 + }, + { + "epoch": 0.3614305750350631, + "grad_norm": 3.3773648489249584, + "learning_rate": 7.385727267252983e-06, + "loss": 0.4274, + "step": 2577 + }, + { + "epoch": 0.36157082748948105, + "grad_norm": 3.4426538844955963, + "learning_rate": 7.383730991626007e-06, + "loss": 0.3648, + "step": 2578 + }, + { + "epoch": 0.361711079943899, + "grad_norm": 9.793128457762329, + "learning_rate": 7.381734224132796e-06, + "loss": 0.4114, + "step": 2579 + }, + { + "epoch": 0.36185133239831696, + "grad_norm": 2.4929754751432127, + "learning_rate": 7.379736965185369e-06, + "loss": 0.3874, + "step": 2580 + }, + { + "epoch": 0.3619915848527349, + "grad_norm": 2.882129288430603, + "learning_rate": 7.3777392151958435e-06, + "loss": 0.4169, + "step": 2581 + }, + { + "epoch": 0.36213183730715287, + "grad_norm": 2.492471517325515, + "learning_rate": 7.375740974576444e-06, + "loss": 0.4064, + "step": 2582 + }, + { + "epoch": 0.3622720897615708, + "grad_norm": 2.3357046770403977, + "learning_rate": 7.373742243739493e-06, + "loss": 0.385, + "step": 2583 + }, + { + "epoch": 0.3624123422159888, + "grad_norm": 2.216743744200852, + "learning_rate": 7.3717430230974155e-06, + "loss": 0.3975, + "step": 2584 + }, + { + "epoch": 0.36255259467040674, + "grad_norm": 2.695912346917678, + "learning_rate": 7.369743313062734e-06, + "loss": 0.4067, + "step": 2585 + }, + { + "epoch": 0.3626928471248247, + "grad_norm": 2.655723699152729, + "learning_rate": 7.367743114048076e-06, + "loss": 0.4192, + "step": 2586 + }, + { + "epoch": 0.36283309957924265, + "grad_norm": 3.0299871901439657, + "learning_rate": 7.365742426466169e-06, + "loss": 0.3713, + "step": 2587 + }, + { + "epoch": 0.3629733520336606, + "grad_norm": 2.450428932944969, + "learning_rate": 7.3637412507298415e-06, + "loss": 0.4427, + "step": 2588 + }, + { + "epoch": 0.36311360448807856, + "grad_norm": 1.8514180187398916, + "learning_rate": 7.361739587252019e-06, + "loss": 0.3735, + "step": 2589 + }, + { + "epoch": 0.3632538569424965, + "grad_norm": 2.3778943946302915, + "learning_rate": 7.359737436445735e-06, + "loss": 0.395, + "step": 2590 + }, + { + "epoch": 0.36339410939691447, + "grad_norm": 4.216947495428324, + "learning_rate": 7.3577347987241176e-06, + "loss": 0.3816, + "step": 2591 + }, + { + "epoch": 0.3635343618513324, + "grad_norm": 2.2805235241607327, + "learning_rate": 7.355731674500396e-06, + "loss": 0.3505, + "step": 2592 + }, + { + "epoch": 0.3636746143057504, + "grad_norm": 2.199853077230697, + "learning_rate": 7.353728064187901e-06, + "loss": 0.3395, + "step": 2593 + }, + { + "epoch": 0.3638148667601683, + "grad_norm": 1.9726486961295655, + "learning_rate": 7.3517239682000675e-06, + "loss": 0.3528, + "step": 2594 + }, + { + "epoch": 0.36395511921458623, + "grad_norm": 3.307952350296336, + "learning_rate": 7.349719386950422e-06, + "loss": 0.3582, + "step": 2595 + }, + { + "epoch": 0.3640953716690042, + "grad_norm": 2.3407667374318577, + "learning_rate": 7.347714320852597e-06, + "loss": 0.4029, + "step": 2596 + }, + { + "epoch": 0.36423562412342214, + "grad_norm": 2.985186401630228, + "learning_rate": 7.345708770320324e-06, + "loss": 0.3491, + "step": 2597 + }, + { + "epoch": 0.3643758765778401, + "grad_norm": 3.4139351199063066, + "learning_rate": 7.343702735767435e-06, + "loss": 0.3948, + "step": 2598 + }, + { + "epoch": 0.36451612903225805, + "grad_norm": 3.1926437462667474, + "learning_rate": 7.341696217607861e-06, + "loss": 0.3456, + "step": 2599 + }, + { + "epoch": 0.364656381486676, + "grad_norm": 3.9351693173423103, + "learning_rate": 7.339689216255632e-06, + "loss": 0.3537, + "step": 2600 + }, + { + "epoch": 0.36479663394109396, + "grad_norm": 2.074336219234299, + "learning_rate": 7.337681732124882e-06, + "loss": 0.3581, + "step": 2601 + }, + { + "epoch": 0.3649368863955119, + "grad_norm": 2.4942581041549854, + "learning_rate": 7.335673765629837e-06, + "loss": 0.3644, + "step": 2602 + }, + { + "epoch": 0.36507713884992987, + "grad_norm": 2.3353223724340326, + "learning_rate": 7.333665317184829e-06, + "loss": 0.4102, + "step": 2603 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 3.3822074507620368, + "learning_rate": 7.3316563872042865e-06, + "loss": 0.3734, + "step": 2604 + }, + { + "epoch": 0.3653576437587658, + "grad_norm": 2.590510171846359, + "learning_rate": 7.329646976102741e-06, + "loss": 0.3773, + "step": 2605 + }, + { + "epoch": 0.36549789621318374, + "grad_norm": 3.123350547354971, + "learning_rate": 7.327637084294818e-06, + "loss": 0.3985, + "step": 2606 + }, + { + "epoch": 0.3656381486676017, + "grad_norm": 2.710728354177903, + "learning_rate": 7.325626712195242e-06, + "loss": 0.4421, + "step": 2607 + }, + { + "epoch": 0.36577840112201965, + "grad_norm": 2.6896713711261757, + "learning_rate": 7.323615860218844e-06, + "loss": 0.3636, + "step": 2608 + }, + { + "epoch": 0.3659186535764376, + "grad_norm": 2.325462713650891, + "learning_rate": 7.321604528780546e-06, + "loss": 0.3891, + "step": 2609 + }, + { + "epoch": 0.36605890603085556, + "grad_norm": 2.5894860758564513, + "learning_rate": 7.319592718295374e-06, + "loss": 0.3386, + "step": 2610 + }, + { + "epoch": 0.3661991584852735, + "grad_norm": 2.0062626300544935, + "learning_rate": 7.317580429178452e-06, + "loss": 0.4057, + "step": 2611 + }, + { + "epoch": 0.36633941093969147, + "grad_norm": 2.5548971014335535, + "learning_rate": 7.315567661844999e-06, + "loss": 0.3867, + "step": 2612 + }, + { + "epoch": 0.3664796633941094, + "grad_norm": 2.2378452269120483, + "learning_rate": 7.313554416710337e-06, + "loss": 0.3825, + "step": 2613 + }, + { + "epoch": 0.3666199158485273, + "grad_norm": 3.1128247930358484, + "learning_rate": 7.311540694189885e-06, + "loss": 0.3922, + "step": 2614 + }, + { + "epoch": 0.3667601683029453, + "grad_norm": 2.6382339757965676, + "learning_rate": 7.30952649469916e-06, + "loss": 0.3823, + "step": 2615 + }, + { + "epoch": 0.36690042075736323, + "grad_norm": 1.9376780520426813, + "learning_rate": 7.307511818653778e-06, + "loss": 0.4007, + "step": 2616 + }, + { + "epoch": 0.3670406732117812, + "grad_norm": 2.0710307476831726, + "learning_rate": 7.305496666469456e-06, + "loss": 0.3688, + "step": 2617 + }, + { + "epoch": 0.36718092566619914, + "grad_norm": 2.6979782411674917, + "learning_rate": 7.3034810385620035e-06, + "loss": 0.4235, + "step": 2618 + }, + { + "epoch": 0.3673211781206171, + "grad_norm": 2.3795665206764998, + "learning_rate": 7.301464935347331e-06, + "loss": 0.359, + "step": 2619 + }, + { + "epoch": 0.36746143057503505, + "grad_norm": 2.0266195484799328, + "learning_rate": 7.299448357241448e-06, + "loss": 0.3995, + "step": 2620 + }, + { + "epoch": 0.367601683029453, + "grad_norm": 2.2305254018774163, + "learning_rate": 7.297431304660464e-06, + "loss": 0.3484, + "step": 2621 + }, + { + "epoch": 0.36774193548387096, + "grad_norm": 2.4227204846647012, + "learning_rate": 7.295413778020579e-06, + "loss": 0.4161, + "step": 2622 + }, + { + "epoch": 0.3678821879382889, + "grad_norm": 2.91374405253559, + "learning_rate": 7.293395777738099e-06, + "loss": 0.3287, + "step": 2623 + }, + { + "epoch": 0.3680224403927069, + "grad_norm": 2.9312154218834388, + "learning_rate": 7.291377304229423e-06, + "loss": 0.4088, + "step": 2624 + }, + { + "epoch": 0.36816269284712483, + "grad_norm": 2.932389216497157, + "learning_rate": 7.28935835791105e-06, + "loss": 0.3809, + "step": 2625 + }, + { + "epoch": 0.3683029453015428, + "grad_norm": 2.310382388943893, + "learning_rate": 7.287338939199574e-06, + "loss": 0.3857, + "step": 2626 + }, + { + "epoch": 0.36844319775596074, + "grad_norm": 2.1110186919121627, + "learning_rate": 7.28531904851169e-06, + "loss": 0.3772, + "step": 2627 + }, + { + "epoch": 0.3685834502103787, + "grad_norm": 2.818313947220294, + "learning_rate": 7.283298686264184e-06, + "loss": 0.405, + "step": 2628 + }, + { + "epoch": 0.36872370266479665, + "grad_norm": 2.3937089762147945, + "learning_rate": 7.281277852873947e-06, + "loss": 0.4092, + "step": 2629 + }, + { + "epoch": 0.3688639551192146, + "grad_norm": 2.0548031850527626, + "learning_rate": 7.279256548757964e-06, + "loss": 0.3966, + "step": 2630 + }, + { + "epoch": 0.36900420757363256, + "grad_norm": 2.1457592726090104, + "learning_rate": 7.277234774333317e-06, + "loss": 0.3443, + "step": 2631 + }, + { + "epoch": 0.3691444600280505, + "grad_norm": 1.9191386894652678, + "learning_rate": 7.2752125300171835e-06, + "loss": 0.3674, + "step": 2632 + }, + { + "epoch": 0.36928471248246847, + "grad_norm": 2.949105731743844, + "learning_rate": 7.27318981622684e-06, + "loss": 0.3433, + "step": 2633 + }, + { + "epoch": 0.36942496493688637, + "grad_norm": 3.0184232546806573, + "learning_rate": 7.271166633379661e-06, + "loss": 0.3645, + "step": 2634 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 2.2647388698457243, + "learning_rate": 7.269142981893114e-06, + "loss": 0.4023, + "step": 2635 + }, + { + "epoch": 0.3697054698457223, + "grad_norm": 1.8647964682496092, + "learning_rate": 7.267118862184767e-06, + "loss": 0.3317, + "step": 2636 + }, + { + "epoch": 0.36984572230014023, + "grad_norm": 2.357368474954432, + "learning_rate": 7.265094274672282e-06, + "loss": 0.389, + "step": 2637 + }, + { + "epoch": 0.3699859747545582, + "grad_norm": 6.685407649916095, + "learning_rate": 7.263069219773417e-06, + "loss": 0.3664, + "step": 2638 + }, + { + "epoch": 0.37012622720897614, + "grad_norm": 2.0653320289412322, + "learning_rate": 7.26104369790603e-06, + "loss": 0.3368, + "step": 2639 + }, + { + "epoch": 0.3702664796633941, + "grad_norm": 1.951410991080459, + "learning_rate": 7.259017709488073e-06, + "loss": 0.3912, + "step": 2640 + }, + { + "epoch": 0.37040673211781205, + "grad_norm": 2.4032127731425708, + "learning_rate": 7.256991254937595e-06, + "loss": 0.4393, + "step": 2641 + }, + { + "epoch": 0.37054698457223, + "grad_norm": 2.773227307588479, + "learning_rate": 7.25496433467274e-06, + "loss": 0.3652, + "step": 2642 + }, + { + "epoch": 0.37068723702664796, + "grad_norm": 2.0992767496334337, + "learning_rate": 7.252936949111749e-06, + "loss": 0.3895, + "step": 2643 + }, + { + "epoch": 0.3708274894810659, + "grad_norm": 1.8346186019599804, + "learning_rate": 7.250909098672958e-06, + "loss": 0.3879, + "step": 2644 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 2.9875457057888246, + "learning_rate": 7.248880783774801e-06, + "loss": 0.3751, + "step": 2645 + }, + { + "epoch": 0.37110799438990183, + "grad_norm": 2.4846658596381754, + "learning_rate": 7.246852004835807e-06, + "loss": 0.4625, + "step": 2646 + }, + { + "epoch": 0.3712482468443198, + "grad_norm": 2.5329693265037356, + "learning_rate": 7.2448227622746e-06, + "loss": 0.3595, + "step": 2647 + }, + { + "epoch": 0.37138849929873774, + "grad_norm": 1.9896371710984335, + "learning_rate": 7.242793056509898e-06, + "loss": 0.3655, + "step": 2648 + }, + { + "epoch": 0.3715287517531557, + "grad_norm": 2.65739981504629, + "learning_rate": 7.240762887960518e-06, + "loss": 0.3679, + "step": 2649 + }, + { + "epoch": 0.37166900420757365, + "grad_norm": 2.3821040867084986, + "learning_rate": 7.2387322570453724e-06, + "loss": 0.3723, + "step": 2650 + }, + { + "epoch": 0.3718092566619916, + "grad_norm": 1.8763319370948819, + "learning_rate": 7.236701164183466e-06, + "loss": 0.394, + "step": 2651 + }, + { + "epoch": 0.37194950911640956, + "grad_norm": 3.2589482584916687, + "learning_rate": 7.2346696097939025e-06, + "loss": 0.3896, + "step": 2652 + }, + { + "epoch": 0.3720897615708275, + "grad_norm": 2.9309941747497543, + "learning_rate": 7.232637594295876e-06, + "loss": 0.3974, + "step": 2653 + }, + { + "epoch": 0.3722300140252454, + "grad_norm": 1.9401587425253404, + "learning_rate": 7.23060511810868e-06, + "loss": 0.3489, + "step": 2654 + }, + { + "epoch": 0.37237026647966337, + "grad_norm": 3.813944558551307, + "learning_rate": 7.228572181651703e-06, + "loss": 0.4014, + "step": 2655 + }, + { + "epoch": 0.3725105189340813, + "grad_norm": 2.1585601373265186, + "learning_rate": 7.226538785344427e-06, + "loss": 0.4384, + "step": 2656 + }, + { + "epoch": 0.3726507713884993, + "grad_norm": 1.8833090099800784, + "learning_rate": 7.224504929606429e-06, + "loss": 0.3416, + "step": 2657 + }, + { + "epoch": 0.37279102384291724, + "grad_norm": 2.566109232212122, + "learning_rate": 7.22247061485738e-06, + "loss": 0.4173, + "step": 2658 + }, + { + "epoch": 0.3729312762973352, + "grad_norm": 2.4010808338824003, + "learning_rate": 7.220435841517045e-06, + "loss": 0.3552, + "step": 2659 + }, + { + "epoch": 0.37307152875175315, + "grad_norm": 2.3016393126847223, + "learning_rate": 7.2184006100052885e-06, + "loss": 0.3524, + "step": 2660 + }, + { + "epoch": 0.3732117812061711, + "grad_norm": 1.9403418342042806, + "learning_rate": 7.216364920742065e-06, + "loss": 0.3399, + "step": 2661 + }, + { + "epoch": 0.37335203366058906, + "grad_norm": 2.297717089270709, + "learning_rate": 7.214328774147425e-06, + "loss": 0.4446, + "step": 2662 + }, + { + "epoch": 0.373492286115007, + "grad_norm": 2.0321738856324605, + "learning_rate": 7.212292170641514e-06, + "loss": 0.3556, + "step": 2663 + }, + { + "epoch": 0.37363253856942497, + "grad_norm": 2.1208648839176343, + "learning_rate": 7.210255110644569e-06, + "loss": 0.388, + "step": 2664 + }, + { + "epoch": 0.3737727910238429, + "grad_norm": 2.441592863961637, + "learning_rate": 7.2082175945769226e-06, + "loss": 0.3787, + "step": 2665 + }, + { + "epoch": 0.3739130434782609, + "grad_norm": 3.269145619245797, + "learning_rate": 7.206179622859005e-06, + "loss": 0.4324, + "step": 2666 + }, + { + "epoch": 0.37405329593267883, + "grad_norm": 1.7342886688465817, + "learning_rate": 7.204141195911336e-06, + "loss": 0.3633, + "step": 2667 + }, + { + "epoch": 0.3741935483870968, + "grad_norm": 2.247254604058616, + "learning_rate": 7.202102314154531e-06, + "loss": 0.4224, + "step": 2668 + }, + { + "epoch": 0.37433380084151474, + "grad_norm": 2.998866343878918, + "learning_rate": 7.200062978009297e-06, + "loss": 0.4204, + "step": 2669 + }, + { + "epoch": 0.3744740532959327, + "grad_norm": 1.95165465895523, + "learning_rate": 7.198023187896439e-06, + "loss": 0.3802, + "step": 2670 + }, + { + "epoch": 0.37461430575035065, + "grad_norm": 2.3514281579864673, + "learning_rate": 7.195982944236853e-06, + "loss": 0.389, + "step": 2671 + }, + { + "epoch": 0.3747545582047686, + "grad_norm": 2.1633194842529586, + "learning_rate": 7.193942247451528e-06, + "loss": 0.3704, + "step": 2672 + }, + { + "epoch": 0.37489481065918656, + "grad_norm": 3.6690834634706886, + "learning_rate": 7.191901097961549e-06, + "loss": 0.3073, + "step": 2673 + }, + { + "epoch": 0.37503506311360446, + "grad_norm": 1.7635468262268166, + "learning_rate": 7.189859496188092e-06, + "loss": 0.3648, + "step": 2674 + }, + { + "epoch": 0.3751753155680224, + "grad_norm": 2.009354292553742, + "learning_rate": 7.187817442552427e-06, + "loss": 0.3689, + "step": 2675 + }, + { + "epoch": 0.37531556802244037, + "grad_norm": 2.02493042531996, + "learning_rate": 7.185774937475919e-06, + "loss": 0.3848, + "step": 2676 + }, + { + "epoch": 0.3754558204768583, + "grad_norm": 1.9914280659381172, + "learning_rate": 7.183731981380024e-06, + "loss": 0.3969, + "step": 2677 + }, + { + "epoch": 0.3755960729312763, + "grad_norm": 1.9733611707506664, + "learning_rate": 7.181688574686292e-06, + "loss": 0.3992, + "step": 2678 + }, + { + "epoch": 0.37573632538569424, + "grad_norm": 1.963039107898463, + "learning_rate": 7.179644717816363e-06, + "loss": 0.4142, + "step": 2679 + }, + { + "epoch": 0.3758765778401122, + "grad_norm": 1.9738013239463472, + "learning_rate": 7.177600411191976e-06, + "loss": 0.3233, + "step": 2680 + }, + { + "epoch": 0.37601683029453015, + "grad_norm": 1.6404519852736195, + "learning_rate": 7.175555655234958e-06, + "loss": 0.359, + "step": 2681 + }, + { + "epoch": 0.3761570827489481, + "grad_norm": 2.042675923646879, + "learning_rate": 7.173510450367229e-06, + "loss": 0.414, + "step": 2682 + }, + { + "epoch": 0.37629733520336606, + "grad_norm": 2.8844720462496602, + "learning_rate": 7.1714647970108056e-06, + "loss": 0.365, + "step": 2683 + }, + { + "epoch": 0.376437587657784, + "grad_norm": 1.771170759777421, + "learning_rate": 7.169418695587791e-06, + "loss": 0.3971, + "step": 2684 + }, + { + "epoch": 0.37657784011220197, + "grad_norm": 2.1165920419629356, + "learning_rate": 7.167372146520386e-06, + "loss": 0.3781, + "step": 2685 + }, + { + "epoch": 0.3767180925666199, + "grad_norm": 2.7971823752514746, + "learning_rate": 7.165325150230881e-06, + "loss": 0.3686, + "step": 2686 + }, + { + "epoch": 0.3768583450210379, + "grad_norm": 2.1217444670447616, + "learning_rate": 7.1632777071416606e-06, + "loss": 0.4086, + "step": 2687 + }, + { + "epoch": 0.37699859747545583, + "grad_norm": 2.0857937081028006, + "learning_rate": 7.161229817675198e-06, + "loss": 0.3901, + "step": 2688 + }, + { + "epoch": 0.3771388499298738, + "grad_norm": 2.210010347644177, + "learning_rate": 7.159181482254062e-06, + "loss": 0.3787, + "step": 2689 + }, + { + "epoch": 0.37727910238429174, + "grad_norm": 1.6996225863736485, + "learning_rate": 7.157132701300911e-06, + "loss": 0.3688, + "step": 2690 + }, + { + "epoch": 0.3774193548387097, + "grad_norm": 2.6115570082178237, + "learning_rate": 7.1550834752385e-06, + "loss": 0.3662, + "step": 2691 + }, + { + "epoch": 0.37755960729312765, + "grad_norm": 2.7297591131781216, + "learning_rate": 7.15303380448967e-06, + "loss": 0.4395, + "step": 2692 + }, + { + "epoch": 0.3776998597475456, + "grad_norm": 2.5221248082279373, + "learning_rate": 7.150983689477357e-06, + "loss": 0.3924, + "step": 2693 + }, + { + "epoch": 0.3778401122019635, + "grad_norm": 1.6853880205144463, + "learning_rate": 7.148933130624587e-06, + "loss": 0.3585, + "step": 2694 + }, + { + "epoch": 0.37798036465638146, + "grad_norm": 1.8355528083505668, + "learning_rate": 7.146882128354479e-06, + "loss": 0.3716, + "step": 2695 + }, + { + "epoch": 0.3781206171107994, + "grad_norm": 2.4638459956542564, + "learning_rate": 7.144830683090242e-06, + "loss": 0.3877, + "step": 2696 + }, + { + "epoch": 0.3782608695652174, + "grad_norm": 2.586682353545663, + "learning_rate": 7.14277879525518e-06, + "loss": 0.3457, + "step": 2697 + }, + { + "epoch": 0.3784011220196353, + "grad_norm": 2.0312158435003806, + "learning_rate": 7.140726465272686e-06, + "loss": 0.3851, + "step": 2698 + }, + { + "epoch": 0.3785413744740533, + "grad_norm": 1.9745050510275235, + "learning_rate": 7.138673693566241e-06, + "loss": 0.384, + "step": 2699 + }, + { + "epoch": 0.37868162692847124, + "grad_norm": 2.9102868377265216, + "learning_rate": 7.1366204805594205e-06, + "loss": 0.3597, + "step": 2700 + }, + { + "epoch": 0.3788218793828892, + "grad_norm": 2.523986095402258, + "learning_rate": 7.134566826675892e-06, + "loss": 0.3666, + "step": 2701 + }, + { + "epoch": 0.37896213183730715, + "grad_norm": 2.0368041178287855, + "learning_rate": 7.13251273233941e-06, + "loss": 0.3697, + "step": 2702 + }, + { + "epoch": 0.3791023842917251, + "grad_norm": 2.182700313888236, + "learning_rate": 7.130458197973828e-06, + "loss": 0.4175, + "step": 2703 + }, + { + "epoch": 0.37924263674614306, + "grad_norm": 1.7856352516123042, + "learning_rate": 7.12840322400308e-06, + "loss": 0.3761, + "step": 2704 + }, + { + "epoch": 0.379382889200561, + "grad_norm": 1.8700717372699933, + "learning_rate": 7.1263478108511955e-06, + "loss": 0.3478, + "step": 2705 + }, + { + "epoch": 0.37952314165497897, + "grad_norm": 2.146491039428917, + "learning_rate": 7.1242919589422974e-06, + "loss": 0.3666, + "step": 2706 + }, + { + "epoch": 0.3796633941093969, + "grad_norm": 1.758262813955884, + "learning_rate": 7.122235668700594e-06, + "loss": 0.365, + "step": 2707 + }, + { + "epoch": 0.3798036465638149, + "grad_norm": 1.6914870879345465, + "learning_rate": 7.12017894055039e-06, + "loss": 0.3607, + "step": 2708 + }, + { + "epoch": 0.37994389901823283, + "grad_norm": 1.8810810659604993, + "learning_rate": 7.118121774916074e-06, + "loss": 0.3916, + "step": 2709 + }, + { + "epoch": 0.3800841514726508, + "grad_norm": 2.002667317347584, + "learning_rate": 7.1160641722221255e-06, + "loss": 0.3769, + "step": 2710 + }, + { + "epoch": 0.38022440392706874, + "grad_norm": 2.429506939130892, + "learning_rate": 7.114006132893121e-06, + "loss": 0.3904, + "step": 2711 + }, + { + "epoch": 0.3803646563814867, + "grad_norm": 2.7939574299042547, + "learning_rate": 7.111947657353719e-06, + "loss": 0.3409, + "step": 2712 + }, + { + "epoch": 0.38050490883590465, + "grad_norm": 1.79968267928662, + "learning_rate": 7.1098887460286745e-06, + "loss": 0.3828, + "step": 2713 + }, + { + "epoch": 0.38064516129032255, + "grad_norm": 2.3324422042898596, + "learning_rate": 7.1078293993428285e-06, + "loss": 0.3642, + "step": 2714 + }, + { + "epoch": 0.3807854137447405, + "grad_norm": 1.6515685804069273, + "learning_rate": 7.105769617721111e-06, + "loss": 0.3708, + "step": 2715 + }, + { + "epoch": 0.38092566619915846, + "grad_norm": 1.7811308034534186, + "learning_rate": 7.1037094015885456e-06, + "loss": 0.4029, + "step": 2716 + }, + { + "epoch": 0.3810659186535764, + "grad_norm": 2.375109218294629, + "learning_rate": 7.101648751370243e-06, + "loss": 0.3916, + "step": 2717 + }, + { + "epoch": 0.3812061711079944, + "grad_norm": 2.1253549615931346, + "learning_rate": 7.099587667491404e-06, + "loss": 0.3755, + "step": 2718 + }, + { + "epoch": 0.38134642356241233, + "grad_norm": 2.238059467292633, + "learning_rate": 7.097526150377319e-06, + "loss": 0.4146, + "step": 2719 + }, + { + "epoch": 0.3814866760168303, + "grad_norm": 1.608583199983806, + "learning_rate": 7.095464200453366e-06, + "loss": 0.3084, + "step": 2720 + }, + { + "epoch": 0.38162692847124824, + "grad_norm": 2.698270109337977, + "learning_rate": 7.093401818145016e-06, + "loss": 0.3955, + "step": 2721 + }, + { + "epoch": 0.3817671809256662, + "grad_norm": 2.173661958250716, + "learning_rate": 7.091339003877826e-06, + "loss": 0.3652, + "step": 2722 + }, + { + "epoch": 0.38190743338008415, + "grad_norm": 2.213002564847195, + "learning_rate": 7.0892757580774455e-06, + "loss": 0.3753, + "step": 2723 + }, + { + "epoch": 0.3820476858345021, + "grad_norm": 2.08170314586838, + "learning_rate": 7.087212081169608e-06, + "loss": 0.3946, + "step": 2724 + }, + { + "epoch": 0.38218793828892006, + "grad_norm": 1.8079209858119716, + "learning_rate": 7.08514797358014e-06, + "loss": 0.4261, + "step": 2725 + }, + { + "epoch": 0.382328190743338, + "grad_norm": 2.3784720489675144, + "learning_rate": 7.083083435734955e-06, + "loss": 0.3421, + "step": 2726 + }, + { + "epoch": 0.38246844319775597, + "grad_norm": 2.560923673023371, + "learning_rate": 7.081018468060057e-06, + "loss": 0.4263, + "step": 2727 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 1.9860750860520162, + "learning_rate": 7.078953070981538e-06, + "loss": 0.4225, + "step": 2728 + }, + { + "epoch": 0.3827489481065919, + "grad_norm": 2.022692215394565, + "learning_rate": 7.0768872449255765e-06, + "loss": 0.3675, + "step": 2729 + }, + { + "epoch": 0.38288920056100983, + "grad_norm": 2.4224701180991883, + "learning_rate": 7.074820990318444e-06, + "loss": 0.3681, + "step": 2730 + }, + { + "epoch": 0.3830294530154278, + "grad_norm": 2.240185205773074, + "learning_rate": 7.072754307586495e-06, + "loss": 0.387, + "step": 2731 + }, + { + "epoch": 0.38316970546984574, + "grad_norm": 2.001635717025117, + "learning_rate": 7.070687197156175e-06, + "loss": 0.35, + "step": 2732 + }, + { + "epoch": 0.3833099579242637, + "grad_norm": 9.34241325487182, + "learning_rate": 7.068619659454019e-06, + "loss": 0.3894, + "step": 2733 + }, + { + "epoch": 0.3834502103786816, + "grad_norm": 2.30715344935689, + "learning_rate": 7.066551694906651e-06, + "loss": 0.3435, + "step": 2734 + }, + { + "epoch": 0.38359046283309955, + "grad_norm": 1.8624227213348574, + "learning_rate": 7.064483303940777e-06, + "loss": 0.3816, + "step": 2735 + }, + { + "epoch": 0.3837307152875175, + "grad_norm": 2.114541853574622, + "learning_rate": 7.062414486983197e-06, + "loss": 0.3732, + "step": 2736 + }, + { + "epoch": 0.38387096774193546, + "grad_norm": 1.8986329029131497, + "learning_rate": 7.060345244460797e-06, + "loss": 0.3693, + "step": 2737 + }, + { + "epoch": 0.3840112201963534, + "grad_norm": 2.026134917034492, + "learning_rate": 7.05827557680055e-06, + "loss": 0.3971, + "step": 2738 + }, + { + "epoch": 0.3841514726507714, + "grad_norm": 2.19224534995001, + "learning_rate": 7.056205484429519e-06, + "loss": 0.3927, + "step": 2739 + }, + { + "epoch": 0.38429172510518933, + "grad_norm": 2.1723162525288475, + "learning_rate": 7.0541349677748524e-06, + "loss": 0.4036, + "step": 2740 + }, + { + "epoch": 0.3844319775596073, + "grad_norm": 1.6751322167438003, + "learning_rate": 7.052064027263785e-06, + "loss": 0.3715, + "step": 2741 + }, + { + "epoch": 0.38457223001402524, + "grad_norm": 1.831945356295364, + "learning_rate": 7.049992663323642e-06, + "loss": 0.3584, + "step": 2742 + }, + { + "epoch": 0.3847124824684432, + "grad_norm": 1.8147570927228163, + "learning_rate": 7.047920876381837e-06, + "loss": 0.3601, + "step": 2743 + }, + { + "epoch": 0.38485273492286115, + "grad_norm": 2.0593086752970784, + "learning_rate": 7.045848666865867e-06, + "loss": 0.3974, + "step": 2744 + }, + { + "epoch": 0.3849929873772791, + "grad_norm": 4.138751603025116, + "learning_rate": 7.043776035203318e-06, + "loss": 0.3822, + "step": 2745 + }, + { + "epoch": 0.38513323983169706, + "grad_norm": 5.317272062523193, + "learning_rate": 7.041702981821862e-06, + "loss": 0.4017, + "step": 2746 + }, + { + "epoch": 0.385273492286115, + "grad_norm": 2.031893584190763, + "learning_rate": 7.039629507149261e-06, + "loss": 0.3947, + "step": 2747 + }, + { + "epoch": 0.38541374474053297, + "grad_norm": 1.930081778953072, + "learning_rate": 7.0375556116133605e-06, + "loss": 0.3834, + "step": 2748 + }, + { + "epoch": 0.3855539971949509, + "grad_norm": 1.8266058584635771, + "learning_rate": 7.035481295642096e-06, + "loss": 0.3396, + "step": 2749 + }, + { + "epoch": 0.3856942496493689, + "grad_norm": 2.187581524011011, + "learning_rate": 7.033406559663486e-06, + "loss": 0.3678, + "step": 2750 + }, + { + "epoch": 0.38583450210378684, + "grad_norm": 2.3217121873251054, + "learning_rate": 7.03133140410564e-06, + "loss": 0.3878, + "step": 2751 + }, + { + "epoch": 0.3859747545582048, + "grad_norm": 3.0723880808895685, + "learning_rate": 7.029255829396751e-06, + "loss": 0.376, + "step": 2752 + }, + { + "epoch": 0.38611500701262275, + "grad_norm": 1.9281320828349446, + "learning_rate": 7.027179835965097e-06, + "loss": 0.3896, + "step": 2753 + }, + { + "epoch": 0.38625525946704065, + "grad_norm": 2.8745669163127623, + "learning_rate": 7.025103424239049e-06, + "loss": 0.3623, + "step": 2754 + }, + { + "epoch": 0.3863955119214586, + "grad_norm": 2.3002719801267313, + "learning_rate": 7.023026594647057e-06, + "loss": 0.3546, + "step": 2755 + }, + { + "epoch": 0.38653576437587656, + "grad_norm": 2.0389703605559317, + "learning_rate": 7.02094934761766e-06, + "loss": 0.3723, + "step": 2756 + }, + { + "epoch": 0.3866760168302945, + "grad_norm": 2.9409158204089985, + "learning_rate": 7.018871683579487e-06, + "loss": 0.382, + "step": 2757 + }, + { + "epoch": 0.38681626928471247, + "grad_norm": 2.0436059603547623, + "learning_rate": 7.016793602961245e-06, + "loss": 0.3681, + "step": 2758 + }, + { + "epoch": 0.3869565217391304, + "grad_norm": 1.7090881664910806, + "learning_rate": 7.0147151061917355e-06, + "loss": 0.3647, + "step": 2759 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 2.3634208701009043, + "learning_rate": 7.012636193699838e-06, + "loss": 0.3354, + "step": 2760 + }, + { + "epoch": 0.38723702664796633, + "grad_norm": 2.9577348847037253, + "learning_rate": 7.010556865914525e-06, + "loss": 0.3759, + "step": 2761 + }, + { + "epoch": 0.3873772791023843, + "grad_norm": 2.0398032059835147, + "learning_rate": 7.008477123264849e-06, + "loss": 0.402, + "step": 2762 + }, + { + "epoch": 0.38751753155680224, + "grad_norm": 2.8535680955348677, + "learning_rate": 7.006396966179949e-06, + "loss": 0.3694, + "step": 2763 + }, + { + "epoch": 0.3876577840112202, + "grad_norm": 2.3300804784080213, + "learning_rate": 7.004316395089055e-06, + "loss": 0.4104, + "step": 2764 + }, + { + "epoch": 0.38779803646563815, + "grad_norm": 2.3251774709915036, + "learning_rate": 7.002235410421476e-06, + "loss": 0.3872, + "step": 2765 + }, + { + "epoch": 0.3879382889200561, + "grad_norm": 2.190882982417195, + "learning_rate": 7.000154012606608e-06, + "loss": 0.3669, + "step": 2766 + }, + { + "epoch": 0.38807854137447406, + "grad_norm": 1.7113138420729719, + "learning_rate": 6.998072202073933e-06, + "loss": 0.3904, + "step": 2767 + }, + { + "epoch": 0.388218793828892, + "grad_norm": 2.481627570570841, + "learning_rate": 6.9959899792530195e-06, + "loss": 0.3392, + "step": 2768 + }, + { + "epoch": 0.38835904628330997, + "grad_norm": 2.669436943033956, + "learning_rate": 6.9939073445735205e-06, + "loss": 0.3725, + "step": 2769 + }, + { + "epoch": 0.3884992987377279, + "grad_norm": 1.7911573913443863, + "learning_rate": 6.99182429846517e-06, + "loss": 0.3495, + "step": 2770 + }, + { + "epoch": 0.3886395511921459, + "grad_norm": 1.911090021916523, + "learning_rate": 6.9897408413577905e-06, + "loss": 0.3862, + "step": 2771 + }, + { + "epoch": 0.38877980364656384, + "grad_norm": 2.2005145251542966, + "learning_rate": 6.987656973681291e-06, + "loss": 0.3553, + "step": 2772 + }, + { + "epoch": 0.3889200561009818, + "grad_norm": 2.650891812529021, + "learning_rate": 6.985572695865662e-06, + "loss": 0.3659, + "step": 2773 + }, + { + "epoch": 0.3890603085553997, + "grad_norm": 1.8968112714893508, + "learning_rate": 6.98348800834098e-06, + "loss": 0.3633, + "step": 2774 + }, + { + "epoch": 0.38920056100981765, + "grad_norm": 2.1181284935349205, + "learning_rate": 6.981402911537405e-06, + "loss": 0.3675, + "step": 2775 + }, + { + "epoch": 0.3893408134642356, + "grad_norm": 2.311301565219156, + "learning_rate": 6.9793174058851805e-06, + "loss": 0.334, + "step": 2776 + }, + { + "epoch": 0.38948106591865356, + "grad_norm": 3.1931824241398186, + "learning_rate": 6.97723149181464e-06, + "loss": 0.3521, + "step": 2777 + }, + { + "epoch": 0.3896213183730715, + "grad_norm": 1.7317256453123064, + "learning_rate": 6.975145169756193e-06, + "loss": 0.3607, + "step": 2778 + }, + { + "epoch": 0.38976157082748947, + "grad_norm": 5.834295938177826, + "learning_rate": 6.973058440140341e-06, + "loss": 0.3662, + "step": 2779 + }, + { + "epoch": 0.3899018232819074, + "grad_norm": 1.9658092758088719, + "learning_rate": 6.9709713033976655e-06, + "loss": 0.3617, + "step": 2780 + }, + { + "epoch": 0.3900420757363254, + "grad_norm": 2.005914882348768, + "learning_rate": 6.968883759958831e-06, + "loss": 0.4063, + "step": 2781 + }, + { + "epoch": 0.39018232819074333, + "grad_norm": 2.3280984761820163, + "learning_rate": 6.96679581025459e-06, + "loss": 0.3613, + "step": 2782 + }, + { + "epoch": 0.3903225806451613, + "grad_norm": 3.4377659087937547, + "learning_rate": 6.964707454715772e-06, + "loss": 0.411, + "step": 2783 + }, + { + "epoch": 0.39046283309957924, + "grad_norm": 2.175526465354706, + "learning_rate": 6.962618693773299e-06, + "loss": 0.3531, + "step": 2784 + }, + { + "epoch": 0.3906030855539972, + "grad_norm": 1.9304835727667538, + "learning_rate": 6.960529527858171e-06, + "loss": 0.3156, + "step": 2785 + }, + { + "epoch": 0.39074333800841515, + "grad_norm": 2.430980080965851, + "learning_rate": 6.958439957401471e-06, + "loss": 0.33, + "step": 2786 + }, + { + "epoch": 0.3908835904628331, + "grad_norm": 2.2167235240480196, + "learning_rate": 6.956349982834367e-06, + "loss": 0.4087, + "step": 2787 + }, + { + "epoch": 0.39102384291725106, + "grad_norm": 2.3529689296319822, + "learning_rate": 6.954259604588114e-06, + "loss": 0.3814, + "step": 2788 + }, + { + "epoch": 0.391164095371669, + "grad_norm": 1.9722438267821434, + "learning_rate": 6.9521688230940454e-06, + "loss": 0.3858, + "step": 2789 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 2.0846286053201757, + "learning_rate": 6.9500776387835785e-06, + "loss": 0.3906, + "step": 2790 + }, + { + "epoch": 0.39144460028050493, + "grad_norm": 2.0932108626206456, + "learning_rate": 6.947986052088216e-06, + "loss": 0.3638, + "step": 2791 + }, + { + "epoch": 0.3915848527349229, + "grad_norm": 2.1413487234255335, + "learning_rate": 6.945894063439542e-06, + "loss": 0.3969, + "step": 2792 + }, + { + "epoch": 0.39172510518934084, + "grad_norm": 2.564965062637792, + "learning_rate": 6.943801673269222e-06, + "loss": 0.3809, + "step": 2793 + }, + { + "epoch": 0.39186535764375874, + "grad_norm": 1.8310054680884822, + "learning_rate": 6.941708882009006e-06, + "loss": 0.3528, + "step": 2794 + }, + { + "epoch": 0.3920056100981767, + "grad_norm": 2.191259369033134, + "learning_rate": 6.9396156900907295e-06, + "loss": 0.3281, + "step": 2795 + }, + { + "epoch": 0.39214586255259465, + "grad_norm": 1.956944446580266, + "learning_rate": 6.937522097946306e-06, + "loss": 0.3342, + "step": 2796 + }, + { + "epoch": 0.3922861150070126, + "grad_norm": 1.7812798480829113, + "learning_rate": 6.935428106007734e-06, + "loss": 0.4142, + "step": 2797 + }, + { + "epoch": 0.39242636746143056, + "grad_norm": 1.8440234094693648, + "learning_rate": 6.933333714707094e-06, + "loss": 0.3337, + "step": 2798 + }, + { + "epoch": 0.3925666199158485, + "grad_norm": 2.0130442576425236, + "learning_rate": 6.931238924476551e-06, + "loss": 0.3911, + "step": 2799 + }, + { + "epoch": 0.39270687237026647, + "grad_norm": 1.7551259173354243, + "learning_rate": 6.929143735748348e-06, + "loss": 0.3983, + "step": 2800 + }, + { + "epoch": 0.3928471248246844, + "grad_norm": 1.8954054241248117, + "learning_rate": 6.9270481489548125e-06, + "loss": 0.4167, + "step": 2801 + }, + { + "epoch": 0.3929873772791024, + "grad_norm": 2.2257547619985187, + "learning_rate": 6.924952164528355e-06, + "loss": 0.3829, + "step": 2802 + }, + { + "epoch": 0.39312762973352033, + "grad_norm": 2.3009327448399826, + "learning_rate": 6.922855782901468e-06, + "loss": 0.3772, + "step": 2803 + }, + { + "epoch": 0.3932678821879383, + "grad_norm": 2.2510698741844273, + "learning_rate": 6.920759004506723e-06, + "loss": 0.3412, + "step": 2804 + }, + { + "epoch": 0.39340813464235624, + "grad_norm": 2.3434082799387723, + "learning_rate": 6.918661829776778e-06, + "loss": 0.4037, + "step": 2805 + }, + { + "epoch": 0.3935483870967742, + "grad_norm": 1.8285699286625023, + "learning_rate": 6.916564259144369e-06, + "loss": 0.4049, + "step": 2806 + }, + { + "epoch": 0.39368863955119215, + "grad_norm": 1.9366074013409922, + "learning_rate": 6.9144662930423144e-06, + "loss": 0.4165, + "step": 2807 + }, + { + "epoch": 0.3938288920056101, + "grad_norm": 2.0170469960537734, + "learning_rate": 6.912367931903516e-06, + "loss": 0.3661, + "step": 2808 + }, + { + "epoch": 0.39396914446002806, + "grad_norm": 3.068442049132869, + "learning_rate": 6.910269176160957e-06, + "loss": 0.3858, + "step": 2809 + }, + { + "epoch": 0.394109396914446, + "grad_norm": 2.0466238552093334, + "learning_rate": 6.9081700262477e-06, + "loss": 0.3652, + "step": 2810 + }, + { + "epoch": 0.394249649368864, + "grad_norm": 2.240531815169965, + "learning_rate": 6.906070482596887e-06, + "loss": 0.3786, + "step": 2811 + }, + { + "epoch": 0.39438990182328193, + "grad_norm": 2.1921137519335114, + "learning_rate": 6.903970545641749e-06, + "loss": 0.3733, + "step": 2812 + }, + { + "epoch": 0.3945301542776999, + "grad_norm": 2.9701212075415024, + "learning_rate": 6.901870215815591e-06, + "loss": 0.3977, + "step": 2813 + }, + { + "epoch": 0.3946704067321178, + "grad_norm": 1.8126571850198405, + "learning_rate": 6.8997694935518e-06, + "loss": 0.3804, + "step": 2814 + }, + { + "epoch": 0.39481065918653574, + "grad_norm": 1.916742620483538, + "learning_rate": 6.897668379283848e-06, + "loss": 0.3582, + "step": 2815 + }, + { + "epoch": 0.3949509116409537, + "grad_norm": 2.085359986329221, + "learning_rate": 6.895566873445285e-06, + "loss": 0.3685, + "step": 2816 + }, + { + "epoch": 0.39509116409537165, + "grad_norm": 1.7739849841956417, + "learning_rate": 6.893464976469739e-06, + "loss": 0.3853, + "step": 2817 + }, + { + "epoch": 0.3952314165497896, + "grad_norm": 2.078938604574011, + "learning_rate": 6.891362688790925e-06, + "loss": 0.3944, + "step": 2818 + }, + { + "epoch": 0.39537166900420756, + "grad_norm": 2.69869835525436, + "learning_rate": 6.889260010842633e-06, + "loss": 0.3546, + "step": 2819 + }, + { + "epoch": 0.3955119214586255, + "grad_norm": 1.9177047641050273, + "learning_rate": 6.887156943058739e-06, + "loss": 0.3697, + "step": 2820 + }, + { + "epoch": 0.39565217391304347, + "grad_norm": 1.7719737999229914, + "learning_rate": 6.8850534858731945e-06, + "loss": 0.3891, + "step": 2821 + }, + { + "epoch": 0.3957924263674614, + "grad_norm": 4.434331272976904, + "learning_rate": 6.882949639720032e-06, + "loss": 0.4028, + "step": 2822 + }, + { + "epoch": 0.3959326788218794, + "grad_norm": 2.1868988300953727, + "learning_rate": 6.880845405033368e-06, + "loss": 0.3669, + "step": 2823 + }, + { + "epoch": 0.39607293127629734, + "grad_norm": 1.9415471660966932, + "learning_rate": 6.878740782247395e-06, + "loss": 0.377, + "step": 2824 + }, + { + "epoch": 0.3962131837307153, + "grad_norm": 2.051923626205308, + "learning_rate": 6.876635771796386e-06, + "loss": 0.3483, + "step": 2825 + }, + { + "epoch": 0.39635343618513325, + "grad_norm": 2.033043289864516, + "learning_rate": 6.874530374114699e-06, + "loss": 0.351, + "step": 2826 + }, + { + "epoch": 0.3964936886395512, + "grad_norm": 2.5905405433425783, + "learning_rate": 6.8724245896367636e-06, + "loss": 0.4028, + "step": 2827 + }, + { + "epoch": 0.39663394109396916, + "grad_norm": 2.8603262231081437, + "learning_rate": 6.870318418797098e-06, + "loss": 0.369, + "step": 2828 + }, + { + "epoch": 0.3967741935483871, + "grad_norm": 2.0297749959339466, + "learning_rate": 6.868211862030291e-06, + "loss": 0.3454, + "step": 2829 + }, + { + "epoch": 0.39691444600280507, + "grad_norm": 2.1999661139566156, + "learning_rate": 6.86610491977102e-06, + "loss": 0.3576, + "step": 2830 + }, + { + "epoch": 0.397054698457223, + "grad_norm": 2.2875719653206534, + "learning_rate": 6.863997592454038e-06, + "loss": 0.3712, + "step": 2831 + }, + { + "epoch": 0.397194950911641, + "grad_norm": 2.701703618491338, + "learning_rate": 6.8618898805141744e-06, + "loss": 0.3603, + "step": 2832 + }, + { + "epoch": 0.39733520336605893, + "grad_norm": 2.0266571940443283, + "learning_rate": 6.859781784386341e-06, + "loss": 0.3878, + "step": 2833 + }, + { + "epoch": 0.39747545582047683, + "grad_norm": 2.7958568232800722, + "learning_rate": 6.857673304505532e-06, + "loss": 0.383, + "step": 2834 + }, + { + "epoch": 0.3976157082748948, + "grad_norm": 2.1007855762509493, + "learning_rate": 6.855564441306815e-06, + "loss": 0.3786, + "step": 2835 + }, + { + "epoch": 0.39775596072931274, + "grad_norm": 2.0260560755147847, + "learning_rate": 6.8534551952253395e-06, + "loss": 0.3629, + "step": 2836 + }, + { + "epoch": 0.3978962131837307, + "grad_norm": 2.089466781129425, + "learning_rate": 6.8513455666963325e-06, + "loss": 0.4235, + "step": 2837 + }, + { + "epoch": 0.39803646563814865, + "grad_norm": 3.1489299833885167, + "learning_rate": 6.849235556155103e-06, + "loss": 0.344, + "step": 2838 + }, + { + "epoch": 0.3981767180925666, + "grad_norm": 2.1933930717998162, + "learning_rate": 6.847125164037036e-06, + "loss": 0.3859, + "step": 2839 + }, + { + "epoch": 0.39831697054698456, + "grad_norm": 3.3305302914242843, + "learning_rate": 6.845014390777595e-06, + "loss": 0.3565, + "step": 2840 + }, + { + "epoch": 0.3984572230014025, + "grad_norm": 2.415918025988191, + "learning_rate": 6.842903236812328e-06, + "loss": 0.404, + "step": 2841 + }, + { + "epoch": 0.39859747545582047, + "grad_norm": 2.2959983346381443, + "learning_rate": 6.840791702576852e-06, + "loss": 0.3632, + "step": 2842 + }, + { + "epoch": 0.3987377279102384, + "grad_norm": 4.6335780008241665, + "learning_rate": 6.838679788506869e-06, + "loss": 0.4097, + "step": 2843 + }, + { + "epoch": 0.3988779803646564, + "grad_norm": 2.1784628432324666, + "learning_rate": 6.836567495038157e-06, + "loss": 0.3214, + "step": 2844 + }, + { + "epoch": 0.39901823281907434, + "grad_norm": 2.0384581390042467, + "learning_rate": 6.834454822606576e-06, + "loss": 0.3771, + "step": 2845 + }, + { + "epoch": 0.3991584852734923, + "grad_norm": 2.3244054382743755, + "learning_rate": 6.832341771648057e-06, + "loss": 0.3956, + "step": 2846 + }, + { + "epoch": 0.39929873772791025, + "grad_norm": 2.151519008133184, + "learning_rate": 6.830228342598615e-06, + "loss": 0.3344, + "step": 2847 + }, + { + "epoch": 0.3994389901823282, + "grad_norm": 2.012262304260118, + "learning_rate": 6.828114535894342e-06, + "loss": 0.3718, + "step": 2848 + }, + { + "epoch": 0.39957924263674616, + "grad_norm": 2.2242488150244557, + "learning_rate": 6.826000351971407e-06, + "loss": 0.4137, + "step": 2849 + }, + { + "epoch": 0.3997194950911641, + "grad_norm": 1.8661437369343805, + "learning_rate": 6.823885791266056e-06, + "loss": 0.3731, + "step": 2850 + }, + { + "epoch": 0.39985974754558207, + "grad_norm": 1.8568077636584384, + "learning_rate": 6.821770854214615e-06, + "loss": 0.3751, + "step": 2851 + }, + { + "epoch": 0.4, + "grad_norm": 2.0018616594959218, + "learning_rate": 6.819655541253487e-06, + "loss": 0.3782, + "step": 2852 + }, + { + "epoch": 0.400140252454418, + "grad_norm": 2.335391590030284, + "learning_rate": 6.817539852819149e-06, + "loss": 0.3833, + "step": 2853 + }, + { + "epoch": 0.40028050490883593, + "grad_norm": 1.939921619267324, + "learning_rate": 6.8154237893481625e-06, + "loss": 0.3408, + "step": 2854 + }, + { + "epoch": 0.40042075736325383, + "grad_norm": 2.414459195921677, + "learning_rate": 6.813307351277161e-06, + "loss": 0.366, + "step": 2855 + }, + { + "epoch": 0.4005610098176718, + "grad_norm": 2.281650351314389, + "learning_rate": 6.811190539042855e-06, + "loss": 0.3704, + "step": 2856 + }, + { + "epoch": 0.40070126227208974, + "grad_norm": 2.246586010934994, + "learning_rate": 6.809073353082038e-06, + "loss": 0.3385, + "step": 2857 + }, + { + "epoch": 0.4008415147265077, + "grad_norm": 2.0917018152857794, + "learning_rate": 6.8069557938315715e-06, + "loss": 0.3905, + "step": 2858 + }, + { + "epoch": 0.40098176718092565, + "grad_norm": 1.738901160640023, + "learning_rate": 6.8048378617284005e-06, + "loss": 0.4044, + "step": 2859 + }, + { + "epoch": 0.4011220196353436, + "grad_norm": 2.1443918318325537, + "learning_rate": 6.802719557209547e-06, + "loss": 0.3909, + "step": 2860 + }, + { + "epoch": 0.40126227208976156, + "grad_norm": 1.8905397506283899, + "learning_rate": 6.800600880712107e-06, + "loss": 0.3461, + "step": 2861 + }, + { + "epoch": 0.4014025245441795, + "grad_norm": 2.0879866575275186, + "learning_rate": 6.798481832673257e-06, + "loss": 0.4092, + "step": 2862 + }, + { + "epoch": 0.4015427769985975, + "grad_norm": 2.059589413029318, + "learning_rate": 6.796362413530245e-06, + "loss": 0.3434, + "step": 2863 + }, + { + "epoch": 0.4016830294530154, + "grad_norm": 2.804562111604573, + "learning_rate": 6.794242623720399e-06, + "loss": 0.4021, + "step": 2864 + }, + { + "epoch": 0.4018232819074334, + "grad_norm": 1.631267930440174, + "learning_rate": 6.792122463681126e-06, + "loss": 0.3617, + "step": 2865 + }, + { + "epoch": 0.40196353436185134, + "grad_norm": 2.1560401261511184, + "learning_rate": 6.7900019338499005e-06, + "loss": 0.3731, + "step": 2866 + }, + { + "epoch": 0.4021037868162693, + "grad_norm": 2.9970811313546992, + "learning_rate": 6.787881034664283e-06, + "loss": 0.3543, + "step": 2867 + }, + { + "epoch": 0.40224403927068725, + "grad_norm": 2.484675167352034, + "learning_rate": 6.785759766561903e-06, + "loss": 0.3826, + "step": 2868 + }, + { + "epoch": 0.4023842917251052, + "grad_norm": 2.8073681954808776, + "learning_rate": 6.783638129980474e-06, + "loss": 0.3664, + "step": 2869 + }, + { + "epoch": 0.40252454417952316, + "grad_norm": 2.3355159315326603, + "learning_rate": 6.781516125357777e-06, + "loss": 0.3834, + "step": 2870 + }, + { + "epoch": 0.4026647966339411, + "grad_norm": 1.9199631027808943, + "learning_rate": 6.779393753131674e-06, + "loss": 0.3182, + "step": 2871 + }, + { + "epoch": 0.40280504908835907, + "grad_norm": 3.8423943338651414, + "learning_rate": 6.7772710137401044e-06, + "loss": 0.3335, + "step": 2872 + }, + { + "epoch": 0.402945301542777, + "grad_norm": 2.3478489953764767, + "learning_rate": 6.775147907621076e-06, + "loss": 0.4064, + "step": 2873 + }, + { + "epoch": 0.403085553997195, + "grad_norm": 2.25245342007486, + "learning_rate": 6.773024435212678e-06, + "loss": 0.3506, + "step": 2874 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 3.3659917916681183, + "learning_rate": 6.770900596953076e-06, + "loss": 0.3354, + "step": 2875 + }, + { + "epoch": 0.40336605890603083, + "grad_norm": 1.9653768555641453, + "learning_rate": 6.76877639328051e-06, + "loss": 0.3697, + "step": 2876 + }, + { + "epoch": 0.4035063113604488, + "grad_norm": 2.072145046857204, + "learning_rate": 6.766651824633292e-06, + "loss": 0.38, + "step": 2877 + }, + { + "epoch": 0.40364656381486674, + "grad_norm": 2.5673818838553855, + "learning_rate": 6.764526891449813e-06, + "loss": 0.3232, + "step": 2878 + }, + { + "epoch": 0.4037868162692847, + "grad_norm": 1.9900222680269288, + "learning_rate": 6.762401594168537e-06, + "loss": 0.3591, + "step": 2879 + }, + { + "epoch": 0.40392706872370265, + "grad_norm": 1.9995406338530466, + "learning_rate": 6.7602759332280045e-06, + "loss": 0.393, + "step": 2880 + }, + { + "epoch": 0.4040673211781206, + "grad_norm": 2.0464191657197093, + "learning_rate": 6.758149909066832e-06, + "loss": 0.4433, + "step": 2881 + }, + { + "epoch": 0.40420757363253856, + "grad_norm": 2.5133510761326145, + "learning_rate": 6.7560235221237115e-06, + "loss": 0.3685, + "step": 2882 + }, + { + "epoch": 0.4043478260869565, + "grad_norm": 2.8399995658766723, + "learning_rate": 6.753896772837403e-06, + "loss": 0.361, + "step": 2883 + }, + { + "epoch": 0.4044880785413745, + "grad_norm": 2.3714684782593434, + "learning_rate": 6.75176966164675e-06, + "loss": 0.3374, + "step": 2884 + }, + { + "epoch": 0.40462833099579243, + "grad_norm": 5.2156490775638815, + "learning_rate": 6.749642188990666e-06, + "loss": 0.3834, + "step": 2885 + }, + { + "epoch": 0.4047685834502104, + "grad_norm": 2.3745669923875954, + "learning_rate": 6.74751435530814e-06, + "loss": 0.4136, + "step": 2886 + }, + { + "epoch": 0.40490883590462834, + "grad_norm": 1.9922008439159415, + "learning_rate": 6.745386161038237e-06, + "loss": 0.3961, + "step": 2887 + }, + { + "epoch": 0.4050490883590463, + "grad_norm": 2.007372499932463, + "learning_rate": 6.743257606620094e-06, + "loss": 0.3869, + "step": 2888 + }, + { + "epoch": 0.40518934081346425, + "grad_norm": 2.7073903031462896, + "learning_rate": 6.741128692492922e-06, + "loss": 0.3796, + "step": 2889 + }, + { + "epoch": 0.4053295932678822, + "grad_norm": 2.6032272373744014, + "learning_rate": 6.7389994190960085e-06, + "loss": 0.3423, + "step": 2890 + }, + { + "epoch": 0.40546984572230016, + "grad_norm": 2.130256210821289, + "learning_rate": 6.7368697868687146e-06, + "loss": 0.3426, + "step": 2891 + }, + { + "epoch": 0.4056100981767181, + "grad_norm": 2.107243343848278, + "learning_rate": 6.734739796250477e-06, + "loss": 0.4077, + "step": 2892 + }, + { + "epoch": 0.40575035063113607, + "grad_norm": 1.8965197637459976, + "learning_rate": 6.7326094476808e-06, + "loss": 0.3869, + "step": 2893 + }, + { + "epoch": 0.405890603085554, + "grad_norm": 2.0741441501341917, + "learning_rate": 6.730478741599269e-06, + "loss": 0.3754, + "step": 2894 + }, + { + "epoch": 0.4060308555399719, + "grad_norm": 2.019668426872192, + "learning_rate": 6.728347678445539e-06, + "loss": 0.4069, + "step": 2895 + }, + { + "epoch": 0.4061711079943899, + "grad_norm": 2.8716264918154124, + "learning_rate": 6.726216258659343e-06, + "loss": 0.3741, + "step": 2896 + }, + { + "epoch": 0.40631136044880783, + "grad_norm": 2.1436181964894208, + "learning_rate": 6.724084482680482e-06, + "loss": 0.383, + "step": 2897 + }, + { + "epoch": 0.4064516129032258, + "grad_norm": 2.4143706189895213, + "learning_rate": 6.721952350948833e-06, + "loss": 0.406, + "step": 2898 + }, + { + "epoch": 0.40659186535764374, + "grad_norm": 5.134607567282539, + "learning_rate": 6.719819863904345e-06, + "loss": 0.3649, + "step": 2899 + }, + { + "epoch": 0.4067321178120617, + "grad_norm": 2.0743840325949914, + "learning_rate": 6.717687021987045e-06, + "loss": 0.3866, + "step": 2900 + }, + { + "epoch": 0.40687237026647965, + "grad_norm": 1.9798454793688127, + "learning_rate": 6.715553825637029e-06, + "loss": 0.3995, + "step": 2901 + }, + { + "epoch": 0.4070126227208976, + "grad_norm": 1.9348766656253935, + "learning_rate": 6.713420275294467e-06, + "loss": 0.3652, + "step": 2902 + }, + { + "epoch": 0.40715287517531557, + "grad_norm": 2.099339213178405, + "learning_rate": 6.711286371399602e-06, + "loss": 0.4118, + "step": 2903 + }, + { + "epoch": 0.4072931276297335, + "grad_norm": 1.9887904506288367, + "learning_rate": 6.7091521143927495e-06, + "loss": 0.3395, + "step": 2904 + }, + { + "epoch": 0.4074333800841515, + "grad_norm": 2.0673660904717557, + "learning_rate": 6.707017504714299e-06, + "loss": 0.3737, + "step": 2905 + }, + { + "epoch": 0.40757363253856943, + "grad_norm": 2.2934740279901646, + "learning_rate": 6.704882542804714e-06, + "loss": 0.3864, + "step": 2906 + }, + { + "epoch": 0.4077138849929874, + "grad_norm": 2.0377739236564802, + "learning_rate": 6.702747229104527e-06, + "loss": 0.3544, + "step": 2907 + }, + { + "epoch": 0.40785413744740534, + "grad_norm": 2.001311825637749, + "learning_rate": 6.700611564054346e-06, + "loss": 0.3752, + "step": 2908 + }, + { + "epoch": 0.4079943899018233, + "grad_norm": 2.2973312651362363, + "learning_rate": 6.69847554809485e-06, + "loss": 0.3609, + "step": 2909 + }, + { + "epoch": 0.40813464235624125, + "grad_norm": 2.6263678483699064, + "learning_rate": 6.696339181666791e-06, + "loss": 0.3732, + "step": 2910 + }, + { + "epoch": 0.4082748948106592, + "grad_norm": 2.1224398056498983, + "learning_rate": 6.694202465210993e-06, + "loss": 0.3791, + "step": 2911 + }, + { + "epoch": 0.40841514726507716, + "grad_norm": 4.583175633347297, + "learning_rate": 6.692065399168352e-06, + "loss": 0.3978, + "step": 2912 + }, + { + "epoch": 0.4085553997194951, + "grad_norm": 2.435008903502806, + "learning_rate": 6.689927983979841e-06, + "loss": 0.3441, + "step": 2913 + }, + { + "epoch": 0.40869565217391307, + "grad_norm": 2.663327023288221, + "learning_rate": 6.687790220086494e-06, + "loss": 0.3421, + "step": 2914 + }, + { + "epoch": 0.40883590462833097, + "grad_norm": 1.9536352482909567, + "learning_rate": 6.6856521079294275e-06, + "loss": 0.3704, + "step": 2915 + }, + { + "epoch": 0.4089761570827489, + "grad_norm": 2.1554075060249764, + "learning_rate": 6.683513647949826e-06, + "loss": 0.3767, + "step": 2916 + }, + { + "epoch": 0.4091164095371669, + "grad_norm": 5.60575337501475, + "learning_rate": 6.681374840588946e-06, + "loss": 0.3412, + "step": 2917 + }, + { + "epoch": 0.40925666199158484, + "grad_norm": 1.9187452061496952, + "learning_rate": 6.6792356862881144e-06, + "loss": 0.3601, + "step": 2918 + }, + { + "epoch": 0.4093969144460028, + "grad_norm": 1.7575353107188094, + "learning_rate": 6.6770961854887296e-06, + "loss": 0.382, + "step": 2919 + }, + { + "epoch": 0.40953716690042075, + "grad_norm": 1.9285661164972525, + "learning_rate": 6.674956338632265e-06, + "loss": 0.3702, + "step": 2920 + }, + { + "epoch": 0.4096774193548387, + "grad_norm": 2.3891920395155704, + "learning_rate": 6.672816146160262e-06, + "loss": 0.3662, + "step": 2921 + }, + { + "epoch": 0.40981767180925666, + "grad_norm": 2.0833510461690046, + "learning_rate": 6.6706756085143345e-06, + "loss": 0.3816, + "step": 2922 + }, + { + "epoch": 0.4099579242636746, + "grad_norm": 1.9985807867034608, + "learning_rate": 6.668534726136166e-06, + "loss": 0.3535, + "step": 2923 + }, + { + "epoch": 0.41009817671809257, + "grad_norm": 2.491742611581422, + "learning_rate": 6.666393499467516e-06, + "loss": 0.3717, + "step": 2924 + }, + { + "epoch": 0.4102384291725105, + "grad_norm": 2.2670765403342443, + "learning_rate": 6.664251928950209e-06, + "loss": 0.3624, + "step": 2925 + }, + { + "epoch": 0.4103786816269285, + "grad_norm": 2.3017248734782525, + "learning_rate": 6.662110015026144e-06, + "loss": 0.3265, + "step": 2926 + }, + { + "epoch": 0.41051893408134643, + "grad_norm": 2.415196420962704, + "learning_rate": 6.659967758137289e-06, + "loss": 0.3697, + "step": 2927 + }, + { + "epoch": 0.4106591865357644, + "grad_norm": 2.2957168046440843, + "learning_rate": 6.657825158725686e-06, + "loss": 0.4034, + "step": 2928 + }, + { + "epoch": 0.41079943899018234, + "grad_norm": 2.3243421505778534, + "learning_rate": 6.655682217233445e-06, + "loss": 0.3815, + "step": 2929 + }, + { + "epoch": 0.4109396914446003, + "grad_norm": 2.47256917935782, + "learning_rate": 6.653538934102743e-06, + "loss": 0.3529, + "step": 2930 + }, + { + "epoch": 0.41107994389901825, + "grad_norm": 2.711316524690193, + "learning_rate": 6.651395309775837e-06, + "loss": 0.3833, + "step": 2931 + }, + { + "epoch": 0.4112201963534362, + "grad_norm": 2.745401438975129, + "learning_rate": 6.6492513446950444e-06, + "loss": 0.4036, + "step": 2932 + }, + { + "epoch": 0.41136044880785416, + "grad_norm": 2.242587914418082, + "learning_rate": 6.64710703930276e-06, + "loss": 0.3714, + "step": 2933 + }, + { + "epoch": 0.4115007012622721, + "grad_norm": 2.0559998009306817, + "learning_rate": 6.644962394041447e-06, + "loss": 0.3982, + "step": 2934 + }, + { + "epoch": 0.41164095371669, + "grad_norm": 2.4016660153001816, + "learning_rate": 6.642817409353635e-06, + "loss": 0.3709, + "step": 2935 + }, + { + "epoch": 0.41178120617110797, + "grad_norm": 1.9817901356409142, + "learning_rate": 6.640672085681928e-06, + "loss": 0.3778, + "step": 2936 + }, + { + "epoch": 0.4119214586255259, + "grad_norm": 2.526899295740091, + "learning_rate": 6.638526423468999e-06, + "loss": 0.3267, + "step": 2937 + }, + { + "epoch": 0.4120617110799439, + "grad_norm": 1.7734660679873095, + "learning_rate": 6.636380423157591e-06, + "loss": 0.3817, + "step": 2938 + }, + { + "epoch": 0.41220196353436184, + "grad_norm": 3.027816085884175, + "learning_rate": 6.634234085190516e-06, + "loss": 0.3904, + "step": 2939 + }, + { + "epoch": 0.4123422159887798, + "grad_norm": 3.141939258301542, + "learning_rate": 6.632087410010653e-06, + "loss": 0.3346, + "step": 2940 + }, + { + "epoch": 0.41248246844319775, + "grad_norm": 2.0572552899070016, + "learning_rate": 6.629940398060957e-06, + "loss": 0.3598, + "step": 2941 + }, + { + "epoch": 0.4126227208976157, + "grad_norm": 1.9555734165175536, + "learning_rate": 6.627793049784448e-06, + "loss": 0.3121, + "step": 2942 + }, + { + "epoch": 0.41276297335203366, + "grad_norm": 2.9455637533747785, + "learning_rate": 6.625645365624214e-06, + "loss": 0.3714, + "step": 2943 + }, + { + "epoch": 0.4129032258064516, + "grad_norm": 2.6087770976261804, + "learning_rate": 6.6234973460234184e-06, + "loss": 0.3658, + "step": 2944 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 2.6286738889580326, + "learning_rate": 6.621348991425287e-06, + "loss": 0.3766, + "step": 2945 + }, + { + "epoch": 0.4131837307152875, + "grad_norm": 2.465878325431431, + "learning_rate": 6.619200302273119e-06, + "loss": 0.3786, + "step": 2946 + }, + { + "epoch": 0.4133239831697055, + "grad_norm": 2.3915526142461854, + "learning_rate": 6.61705127901028e-06, + "loss": 0.4377, + "step": 2947 + }, + { + "epoch": 0.41346423562412343, + "grad_norm": 2.6531857037648363, + "learning_rate": 6.614901922080211e-06, + "loss": 0.3468, + "step": 2948 + }, + { + "epoch": 0.4136044880785414, + "grad_norm": 2.432788893352973, + "learning_rate": 6.612752231926411e-06, + "loss": 0.3318, + "step": 2949 + }, + { + "epoch": 0.41374474053295934, + "grad_norm": 2.0949029737050546, + "learning_rate": 6.6106022089924535e-06, + "loss": 0.3524, + "step": 2950 + }, + { + "epoch": 0.4138849929873773, + "grad_norm": 2.0690827946199812, + "learning_rate": 6.608451853721985e-06, + "loss": 0.4044, + "step": 2951 + }, + { + "epoch": 0.41402524544179525, + "grad_norm": 2.384088893538771, + "learning_rate": 6.606301166558713e-06, + "loss": 0.388, + "step": 2952 + }, + { + "epoch": 0.4141654978962132, + "grad_norm": 2.887515648473307, + "learning_rate": 6.604150147946418e-06, + "loss": 0.3435, + "step": 2953 + }, + { + "epoch": 0.41430575035063116, + "grad_norm": 3.2461522139553147, + "learning_rate": 6.601998798328948e-06, + "loss": 0.3906, + "step": 2954 + }, + { + "epoch": 0.41444600280504906, + "grad_norm": 2.1591470026641932, + "learning_rate": 6.599847118150218e-06, + "loss": 0.4012, + "step": 2955 + }, + { + "epoch": 0.414586255259467, + "grad_norm": 2.184112760113168, + "learning_rate": 6.597695107854212e-06, + "loss": 0.4034, + "step": 2956 + }, + { + "epoch": 0.414726507713885, + "grad_norm": 2.247956937818674, + "learning_rate": 6.595542767884984e-06, + "loss": 0.3556, + "step": 2957 + }, + { + "epoch": 0.41486676016830293, + "grad_norm": 2.202788848122221, + "learning_rate": 6.593390098686653e-06, + "loss": 0.3397, + "step": 2958 + }, + { + "epoch": 0.4150070126227209, + "grad_norm": 2.49757401059819, + "learning_rate": 6.591237100703407e-06, + "loss": 0.4076, + "step": 2959 + }, + { + "epoch": 0.41514726507713884, + "grad_norm": 1.9641122840513912, + "learning_rate": 6.589083774379503e-06, + "loss": 0.3868, + "step": 2960 + }, + { + "epoch": 0.4152875175315568, + "grad_norm": 2.819796443653793, + "learning_rate": 6.586930120159263e-06, + "loss": 0.3656, + "step": 2961 + }, + { + "epoch": 0.41542776998597475, + "grad_norm": 2.3057450276643507, + "learning_rate": 6.584776138487081e-06, + "loss": 0.3944, + "step": 2962 + }, + { + "epoch": 0.4155680224403927, + "grad_norm": 1.857510826315462, + "learning_rate": 6.5826218298074144e-06, + "loss": 0.3756, + "step": 2963 + }, + { + "epoch": 0.41570827489481066, + "grad_norm": 3.068894075635903, + "learning_rate": 6.5804671945647916e-06, + "loss": 0.3729, + "step": 2964 + }, + { + "epoch": 0.4158485273492286, + "grad_norm": 2.6594827166212838, + "learning_rate": 6.578312233203804e-06, + "loss": 0.3198, + "step": 2965 + }, + { + "epoch": 0.41598877980364657, + "grad_norm": 2.942706910979791, + "learning_rate": 6.5761569461691145e-06, + "loss": 0.3957, + "step": 2966 + }, + { + "epoch": 0.4161290322580645, + "grad_norm": 2.711136868316297, + "learning_rate": 6.57400133390545e-06, + "loss": 0.4335, + "step": 2967 + }, + { + "epoch": 0.4162692847124825, + "grad_norm": 2.447395388244688, + "learning_rate": 6.5718453968576076e-06, + "loss": 0.3956, + "step": 2968 + }, + { + "epoch": 0.41640953716690043, + "grad_norm": 2.216683620971242, + "learning_rate": 6.569689135470451e-06, + "loss": 0.3778, + "step": 2969 + }, + { + "epoch": 0.4165497896213184, + "grad_norm": 3.577581639241376, + "learning_rate": 6.567532550188908e-06, + "loss": 0.3818, + "step": 2970 + }, + { + "epoch": 0.41669004207573634, + "grad_norm": 2.2638294339580196, + "learning_rate": 6.565375641457973e-06, + "loss": 0.3656, + "step": 2971 + }, + { + "epoch": 0.4168302945301543, + "grad_norm": 2.2339672436818554, + "learning_rate": 6.563218409722712e-06, + "loss": 0.3535, + "step": 2972 + }, + { + "epoch": 0.41697054698457225, + "grad_norm": 2.306871833809432, + "learning_rate": 6.561060855428252e-06, + "loss": 0.3854, + "step": 2973 + }, + { + "epoch": 0.4171107994389902, + "grad_norm": 2.45367503226088, + "learning_rate": 6.558902979019793e-06, + "loss": 0.3581, + "step": 2974 + }, + { + "epoch": 0.4172510518934081, + "grad_norm": 2.5308552186899136, + "learning_rate": 6.556744780942594e-06, + "loss": 0.3544, + "step": 2975 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 2.720058345589643, + "learning_rate": 6.5545862616419865e-06, + "loss": 0.3867, + "step": 2976 + }, + { + "epoch": 0.417531556802244, + "grad_norm": 2.163755098950633, + "learning_rate": 6.552427421563365e-06, + "loss": 0.3569, + "step": 2977 + }, + { + "epoch": 0.417671809256662, + "grad_norm": 2.936813761317621, + "learning_rate": 6.550268261152192e-06, + "loss": 0.3576, + "step": 2978 + }, + { + "epoch": 0.41781206171107993, + "grad_norm": 2.255972881681715, + "learning_rate": 6.548108780853995e-06, + "loss": 0.3863, + "step": 2979 + }, + { + "epoch": 0.4179523141654979, + "grad_norm": 2.7095336797197493, + "learning_rate": 6.545948981114365e-06, + "loss": 0.3861, + "step": 2980 + }, + { + "epoch": 0.41809256661991584, + "grad_norm": 2.091114378075385, + "learning_rate": 6.543788862378965e-06, + "loss": 0.3753, + "step": 2981 + }, + { + "epoch": 0.4182328190743338, + "grad_norm": 1.9166822294626182, + "learning_rate": 6.541628425093518e-06, + "loss": 0.3571, + "step": 2982 + }, + { + "epoch": 0.41837307152875175, + "grad_norm": 2.768288315972146, + "learning_rate": 6.539467669703816e-06, + "loss": 0.3893, + "step": 2983 + }, + { + "epoch": 0.4185133239831697, + "grad_norm": 2.04604526621096, + "learning_rate": 6.537306596655716e-06, + "loss": 0.3529, + "step": 2984 + }, + { + "epoch": 0.41865357643758766, + "grad_norm": 2.2046941485395966, + "learning_rate": 6.535145206395141e-06, + "loss": 0.4155, + "step": 2985 + }, + { + "epoch": 0.4187938288920056, + "grad_norm": 2.124995960904939, + "learning_rate": 6.532983499368078e-06, + "loss": 0.4011, + "step": 2986 + }, + { + "epoch": 0.41893408134642357, + "grad_norm": 2.75722093153955, + "learning_rate": 6.530821476020579e-06, + "loss": 0.3531, + "step": 2987 + }, + { + "epoch": 0.4190743338008415, + "grad_norm": 2.3984737828396194, + "learning_rate": 6.5286591367987655e-06, + "loss": 0.3654, + "step": 2988 + }, + { + "epoch": 0.4192145862552595, + "grad_norm": 1.9285650038738311, + "learning_rate": 6.5264964821488184e-06, + "loss": 0.3974, + "step": 2989 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 1.9835378127382342, + "learning_rate": 6.524333512516987e-06, + "loss": 0.353, + "step": 2990 + }, + { + "epoch": 0.4194950911640954, + "grad_norm": 1.8940887248960436, + "learning_rate": 6.522170228349585e-06, + "loss": 0.412, + "step": 2991 + }, + { + "epoch": 0.41963534361851335, + "grad_norm": 2.8751978598801635, + "learning_rate": 6.520006630092991e-06, + "loss": 0.3996, + "step": 2992 + }, + { + "epoch": 0.4197755960729313, + "grad_norm": 2.075483091259303, + "learning_rate": 6.5178427181936485e-06, + "loss": 0.4116, + "step": 2993 + }, + { + "epoch": 0.41991584852734926, + "grad_norm": 1.8964327678474917, + "learning_rate": 6.515678493098065e-06, + "loss": 0.4029, + "step": 2994 + }, + { + "epoch": 0.42005610098176716, + "grad_norm": 1.9062612564505, + "learning_rate": 6.513513955252816e-06, + "loss": 0.3984, + "step": 2995 + }, + { + "epoch": 0.4201963534361851, + "grad_norm": 2.2711529871324596, + "learning_rate": 6.511349105104534e-06, + "loss": 0.4117, + "step": 2996 + }, + { + "epoch": 0.42033660589060307, + "grad_norm": 2.1683068913873877, + "learning_rate": 6.509183943099925e-06, + "loss": 0.4131, + "step": 2997 + }, + { + "epoch": 0.420476858345021, + "grad_norm": 1.8909044295700346, + "learning_rate": 6.507018469685752e-06, + "loss": 0.3843, + "step": 2998 + }, + { + "epoch": 0.420617110799439, + "grad_norm": 1.9831648012357606, + "learning_rate": 6.504852685308849e-06, + "loss": 0.3762, + "step": 2999 + }, + { + "epoch": 0.42075736325385693, + "grad_norm": 1.979135941384055, + "learning_rate": 6.502686590416105e-06, + "loss": 0.37, + "step": 3000 + }, + { + "epoch": 0.4208976157082749, + "grad_norm": 2.6214865832148257, + "learning_rate": 6.5005201854544845e-06, + "loss": 0.3603, + "step": 3001 + }, + { + "epoch": 0.42103786816269284, + "grad_norm": 2.2516730763842734, + "learning_rate": 6.498353470871006e-06, + "loss": 0.3892, + "step": 3002 + }, + { + "epoch": 0.4211781206171108, + "grad_norm": 1.8068297251119418, + "learning_rate": 6.4961864471127556e-06, + "loss": 0.3856, + "step": 3003 + }, + { + "epoch": 0.42131837307152875, + "grad_norm": 2.144867528457207, + "learning_rate": 6.494019114626887e-06, + "loss": 0.3794, + "step": 3004 + }, + { + "epoch": 0.4214586255259467, + "grad_norm": 2.861507100311274, + "learning_rate": 6.491851473860612e-06, + "loss": 0.3856, + "step": 3005 + }, + { + "epoch": 0.42159887798036466, + "grad_norm": 2.0756143878319735, + "learning_rate": 6.489683525261208e-06, + "loss": 0.3853, + "step": 3006 + }, + { + "epoch": 0.4217391304347826, + "grad_norm": 2.1205773049771794, + "learning_rate": 6.487515269276015e-06, + "loss": 0.3988, + "step": 3007 + }, + { + "epoch": 0.42187938288920057, + "grad_norm": 2.0807943767566957, + "learning_rate": 6.48534670635244e-06, + "loss": 0.3648, + "step": 3008 + }, + { + "epoch": 0.4220196353436185, + "grad_norm": 2.4839576034674935, + "learning_rate": 6.48317783693795e-06, + "loss": 0.3622, + "step": 3009 + }, + { + "epoch": 0.4221598877980365, + "grad_norm": 1.75251677598964, + "learning_rate": 6.481008661480075e-06, + "loss": 0.4208, + "step": 3010 + }, + { + "epoch": 0.42230014025245444, + "grad_norm": 2.088056766476447, + "learning_rate": 6.478839180426411e-06, + "loss": 0.358, + "step": 3011 + }, + { + "epoch": 0.4224403927068724, + "grad_norm": 2.08511153309319, + "learning_rate": 6.476669394224613e-06, + "loss": 0.3157, + "step": 3012 + }, + { + "epoch": 0.42258064516129035, + "grad_norm": 1.9006509945567969, + "learning_rate": 6.474499303322402e-06, + "loss": 0.3775, + "step": 3013 + }, + { + "epoch": 0.4227208976157083, + "grad_norm": 2.117886698789312, + "learning_rate": 6.472328908167562e-06, + "loss": 0.3546, + "step": 3014 + }, + { + "epoch": 0.4228611500701262, + "grad_norm": 2.6404464029262313, + "learning_rate": 6.470158209207939e-06, + "loss": 0.3631, + "step": 3015 + }, + { + "epoch": 0.42300140252454416, + "grad_norm": 2.1789452167362633, + "learning_rate": 6.46798720689144e-06, + "loss": 0.3697, + "step": 3016 + }, + { + "epoch": 0.4231416549789621, + "grad_norm": 2.544308357912146, + "learning_rate": 6.465815901666036e-06, + "loss": 0.3836, + "step": 3017 + }, + { + "epoch": 0.42328190743338007, + "grad_norm": 1.7922059807327582, + "learning_rate": 6.463644293979763e-06, + "loss": 0.345, + "step": 3018 + }, + { + "epoch": 0.423422159887798, + "grad_norm": 1.8206209240706366, + "learning_rate": 6.461472384280715e-06, + "loss": 0.3353, + "step": 3019 + }, + { + "epoch": 0.423562412342216, + "grad_norm": 2.194489348498048, + "learning_rate": 6.459300173017052e-06, + "loss": 0.4084, + "step": 3020 + }, + { + "epoch": 0.42370266479663393, + "grad_norm": 3.0100395380002416, + "learning_rate": 6.457127660636994e-06, + "loss": 0.3663, + "step": 3021 + }, + { + "epoch": 0.4238429172510519, + "grad_norm": 1.8586443579487488, + "learning_rate": 6.454954847588824e-06, + "loss": 0.3797, + "step": 3022 + }, + { + "epoch": 0.42398316970546984, + "grad_norm": 2.561671425445379, + "learning_rate": 6.452781734320884e-06, + "loss": 0.3769, + "step": 3023 + }, + { + "epoch": 0.4241234221598878, + "grad_norm": 2.2018671435503734, + "learning_rate": 6.450608321281584e-06, + "loss": 0.3653, + "step": 3024 + }, + { + "epoch": 0.42426367461430575, + "grad_norm": 2.679648796669525, + "learning_rate": 6.4484346089193926e-06, + "loss": 0.3841, + "step": 3025 + }, + { + "epoch": 0.4244039270687237, + "grad_norm": 2.034056734424741, + "learning_rate": 6.4462605976828395e-06, + "loss": 0.3898, + "step": 3026 + }, + { + "epoch": 0.42454417952314166, + "grad_norm": 3.5660952446028555, + "learning_rate": 6.444086288020514e-06, + "loss": 0.3988, + "step": 3027 + }, + { + "epoch": 0.4246844319775596, + "grad_norm": 1.6028086210677315, + "learning_rate": 6.441911680381074e-06, + "loss": 0.3815, + "step": 3028 + }, + { + "epoch": 0.4248246844319776, + "grad_norm": 2.2986737947346096, + "learning_rate": 6.4397367752132325e-06, + "loss": 0.3505, + "step": 3029 + }, + { + "epoch": 0.42496493688639553, + "grad_norm": 2.0119138160129935, + "learning_rate": 6.437561572965767e-06, + "loss": 0.3845, + "step": 3030 + }, + { + "epoch": 0.4251051893408135, + "grad_norm": 2.2746656318847043, + "learning_rate": 6.435386074087514e-06, + "loss": 0.3433, + "step": 3031 + }, + { + "epoch": 0.42524544179523144, + "grad_norm": 2.1931543568613723, + "learning_rate": 6.433210279027373e-06, + "loss": 0.3484, + "step": 3032 + }, + { + "epoch": 0.4253856942496494, + "grad_norm": 2.0355128233994577, + "learning_rate": 6.431034188234304e-06, + "loss": 0.3253, + "step": 3033 + }, + { + "epoch": 0.42552594670406735, + "grad_norm": 2.4458276370587106, + "learning_rate": 6.4288578021573275e-06, + "loss": 0.3651, + "step": 3034 + }, + { + "epoch": 0.42566619915848525, + "grad_norm": 1.9610652198080876, + "learning_rate": 6.426681121245527e-06, + "loss": 0.3597, + "step": 3035 + }, + { + "epoch": 0.4258064516129032, + "grad_norm": 1.7913879872724987, + "learning_rate": 6.424504145948045e-06, + "loss": 0.3895, + "step": 3036 + }, + { + "epoch": 0.42594670406732116, + "grad_norm": 2.1310446542387083, + "learning_rate": 6.422326876714084e-06, + "loss": 0.3455, + "step": 3037 + }, + { + "epoch": 0.4260869565217391, + "grad_norm": 2.4881883061894863, + "learning_rate": 6.420149313992909e-06, + "loss": 0.3599, + "step": 3038 + }, + { + "epoch": 0.42622720897615707, + "grad_norm": 2.0379705903661285, + "learning_rate": 6.417971458233847e-06, + "loss": 0.3718, + "step": 3039 + }, + { + "epoch": 0.426367461430575, + "grad_norm": 1.8652640934518958, + "learning_rate": 6.41579330988628e-06, + "loss": 0.3655, + "step": 3040 + }, + { + "epoch": 0.426507713884993, + "grad_norm": 2.480208063319188, + "learning_rate": 6.413614869399655e-06, + "loss": 0.3985, + "step": 3041 + }, + { + "epoch": 0.42664796633941093, + "grad_norm": 2.043465140086211, + "learning_rate": 6.411436137223479e-06, + "loss": 0.3542, + "step": 3042 + }, + { + "epoch": 0.4267882187938289, + "grad_norm": 2.6543387324813104, + "learning_rate": 6.409257113807316e-06, + "loss": 0.3631, + "step": 3043 + }, + { + "epoch": 0.42692847124824684, + "grad_norm": 1.9786603447272457, + "learning_rate": 6.4070777996007925e-06, + "loss": 0.3625, + "step": 3044 + }, + { + "epoch": 0.4270687237026648, + "grad_norm": 2.008187819257285, + "learning_rate": 6.4048981950535975e-06, + "loss": 0.4293, + "step": 3045 + }, + { + "epoch": 0.42720897615708275, + "grad_norm": 3.1085667158640073, + "learning_rate": 6.402718300615475e-06, + "loss": 0.3624, + "step": 3046 + }, + { + "epoch": 0.4273492286115007, + "grad_norm": 2.0100220867860323, + "learning_rate": 6.40053811673623e-06, + "loss": 0.3892, + "step": 3047 + }, + { + "epoch": 0.42748948106591866, + "grad_norm": 2.045806605934265, + "learning_rate": 6.398357643865731e-06, + "loss": 0.3523, + "step": 3048 + }, + { + "epoch": 0.4276297335203366, + "grad_norm": 2.3876733447353446, + "learning_rate": 6.396176882453902e-06, + "loss": 0.3669, + "step": 3049 + }, + { + "epoch": 0.4277699859747546, + "grad_norm": 2.5873330030132276, + "learning_rate": 6.393995832950725e-06, + "loss": 0.3676, + "step": 3050 + }, + { + "epoch": 0.42791023842917253, + "grad_norm": 4.398591006701837, + "learning_rate": 6.391814495806251e-06, + "loss": 0.3863, + "step": 3051 + }, + { + "epoch": 0.4280504908835905, + "grad_norm": 2.101332981665496, + "learning_rate": 6.389632871470578e-06, + "loss": 0.3602, + "step": 3052 + }, + { + "epoch": 0.42819074333800844, + "grad_norm": 2.943751723497538, + "learning_rate": 6.3874509603938706e-06, + "loss": 0.3634, + "step": 3053 + }, + { + "epoch": 0.4283309957924264, + "grad_norm": 1.9780114935298978, + "learning_rate": 6.385268763026351e-06, + "loss": 0.3304, + "step": 3054 + }, + { + "epoch": 0.4284712482468443, + "grad_norm": 2.2855364100181284, + "learning_rate": 6.3830862798183006e-06, + "loss": 0.3792, + "step": 3055 + }, + { + "epoch": 0.42861150070126225, + "grad_norm": 1.7138861065551223, + "learning_rate": 6.38090351122006e-06, + "loss": 0.3556, + "step": 3056 + }, + { + "epoch": 0.4287517531556802, + "grad_norm": 2.032355491878188, + "learning_rate": 6.378720457682027e-06, + "loss": 0.4151, + "step": 3057 + }, + { + "epoch": 0.42889200561009816, + "grad_norm": 2.2035514472926376, + "learning_rate": 6.37653711965466e-06, + "loss": 0.4183, + "step": 3058 + }, + { + "epoch": 0.4290322580645161, + "grad_norm": 1.7776122173399367, + "learning_rate": 6.374353497588475e-06, + "loss": 0.3088, + "step": 3059 + }, + { + "epoch": 0.42917251051893407, + "grad_norm": 2.377906727141462, + "learning_rate": 6.372169591934048e-06, + "loss": 0.4141, + "step": 3060 + }, + { + "epoch": 0.429312762973352, + "grad_norm": 2.4111111571589796, + "learning_rate": 6.369985403142014e-06, + "loss": 0.3828, + "step": 3061 + }, + { + "epoch": 0.42945301542777, + "grad_norm": 2.365883445983861, + "learning_rate": 6.367800931663062e-06, + "loss": 0.3847, + "step": 3062 + }, + { + "epoch": 0.42959326788218793, + "grad_norm": 3.5569456201170992, + "learning_rate": 6.365616177947945e-06, + "loss": 0.365, + "step": 3063 + }, + { + "epoch": 0.4297335203366059, + "grad_norm": 2.2949676898684834, + "learning_rate": 6.363431142447469e-06, + "loss": 0.3995, + "step": 3064 + }, + { + "epoch": 0.42987377279102384, + "grad_norm": 1.9338572312986662, + "learning_rate": 6.361245825612505e-06, + "loss": 0.3936, + "step": 3065 + }, + { + "epoch": 0.4300140252454418, + "grad_norm": 3.179374060775951, + "learning_rate": 6.359060227893972e-06, + "loss": 0.3405, + "step": 3066 + }, + { + "epoch": 0.43015427769985976, + "grad_norm": 1.825747190654818, + "learning_rate": 6.356874349742859e-06, + "loss": 0.3736, + "step": 3067 + }, + { + "epoch": 0.4302945301542777, + "grad_norm": 2.475885148184037, + "learning_rate": 6.354688191610202e-06, + "loss": 0.3953, + "step": 3068 + }, + { + "epoch": 0.43043478260869567, + "grad_norm": 2.036260323682717, + "learning_rate": 6.352501753947103e-06, + "loss": 0.3756, + "step": 3069 + }, + { + "epoch": 0.4305750350631136, + "grad_norm": 2.7191551169749215, + "learning_rate": 6.350315037204714e-06, + "loss": 0.3961, + "step": 3070 + }, + { + "epoch": 0.4307152875175316, + "grad_norm": 2.023818874871799, + "learning_rate": 6.3481280418342536e-06, + "loss": 0.3247, + "step": 3071 + }, + { + "epoch": 0.43085553997194953, + "grad_norm": 2.3403417601802006, + "learning_rate": 6.3459407682869885e-06, + "loss": 0.3359, + "step": 3072 + }, + { + "epoch": 0.4309957924263675, + "grad_norm": 1.935849132476107, + "learning_rate": 6.34375321701425e-06, + "loss": 0.3849, + "step": 3073 + }, + { + "epoch": 0.43113604488078544, + "grad_norm": 1.7900019015936102, + "learning_rate": 6.341565388467425e-06, + "loss": 0.401, + "step": 3074 + }, + { + "epoch": 0.43127629733520334, + "grad_norm": 2.0797246068428428, + "learning_rate": 6.339377283097953e-06, + "loss": 0.4215, + "step": 3075 + }, + { + "epoch": 0.4314165497896213, + "grad_norm": 1.7003992649650197, + "learning_rate": 6.3371889013573365e-06, + "loss": 0.3835, + "step": 3076 + }, + { + "epoch": 0.43155680224403925, + "grad_norm": 1.8710926032013058, + "learning_rate": 6.335000243697134e-06, + "loss": 0.3161, + "step": 3077 + }, + { + "epoch": 0.4316970546984572, + "grad_norm": 2.3780231308642255, + "learning_rate": 6.332811310568956e-06, + "loss": 0.3765, + "step": 3078 + }, + { + "epoch": 0.43183730715287516, + "grad_norm": 2.3691038929229435, + "learning_rate": 6.330622102424478e-06, + "loss": 0.4037, + "step": 3079 + }, + { + "epoch": 0.4319775596072931, + "grad_norm": 2.052958642267089, + "learning_rate": 6.328432619715424e-06, + "loss": 0.3626, + "step": 3080 + }, + { + "epoch": 0.43211781206171107, + "grad_norm": 1.9562199599424515, + "learning_rate": 6.326242862893581e-06, + "loss": 0.3449, + "step": 3081 + }, + { + "epoch": 0.432258064516129, + "grad_norm": 2.0147609780960725, + "learning_rate": 6.324052832410788e-06, + "loss": 0.3595, + "step": 3082 + }, + { + "epoch": 0.432398316970547, + "grad_norm": 2.3811630793849927, + "learning_rate": 6.321862528718945e-06, + "loss": 0.378, + "step": 3083 + }, + { + "epoch": 0.43253856942496494, + "grad_norm": 2.5347334828119386, + "learning_rate": 6.319671952270004e-06, + "loss": 0.3383, + "step": 3084 + }, + { + "epoch": 0.4326788218793829, + "grad_norm": 2.044908583633381, + "learning_rate": 6.317481103515976e-06, + "loss": 0.3692, + "step": 3085 + }, + { + "epoch": 0.43281907433380085, + "grad_norm": 2.2898061540342183, + "learning_rate": 6.3152899829089254e-06, + "loss": 0.3529, + "step": 3086 + }, + { + "epoch": 0.4329593267882188, + "grad_norm": 2.8818273334122657, + "learning_rate": 6.313098590900978e-06, + "loss": 0.3856, + "step": 3087 + }, + { + "epoch": 0.43309957924263676, + "grad_norm": 2.1146473867051396, + "learning_rate": 6.310906927944309e-06, + "loss": 0.421, + "step": 3088 + }, + { + "epoch": 0.4332398316970547, + "grad_norm": 2.254707815352376, + "learning_rate": 6.308714994491155e-06, + "loss": 0.3892, + "step": 3089 + }, + { + "epoch": 0.43338008415147267, + "grad_norm": 3.983253829886218, + "learning_rate": 6.306522790993805e-06, + "loss": 0.3853, + "step": 3090 + }, + { + "epoch": 0.4335203366058906, + "grad_norm": 1.98163914754682, + "learning_rate": 6.304330317904605e-06, + "loss": 0.386, + "step": 3091 + }, + { + "epoch": 0.4336605890603086, + "grad_norm": 2.060933645387968, + "learning_rate": 6.3021375756759575e-06, + "loss": 0.3571, + "step": 3092 + }, + { + "epoch": 0.43380084151472653, + "grad_norm": 2.2093568145564673, + "learning_rate": 6.299944564760318e-06, + "loss": 0.367, + "step": 3093 + }, + { + "epoch": 0.4339410939691445, + "grad_norm": 6.592785226830853, + "learning_rate": 6.2977512856101994e-06, + "loss": 0.3878, + "step": 3094 + }, + { + "epoch": 0.4340813464235624, + "grad_norm": 5.216528322540592, + "learning_rate": 6.295557738678171e-06, + "loss": 0.4287, + "step": 3095 + }, + { + "epoch": 0.43422159887798034, + "grad_norm": 2.4271044968977096, + "learning_rate": 6.2933639244168535e-06, + "loss": 0.3633, + "step": 3096 + }, + { + "epoch": 0.4343618513323983, + "grad_norm": 3.7555400615450276, + "learning_rate": 6.291169843278927e-06, + "loss": 0.366, + "step": 3097 + }, + { + "epoch": 0.43450210378681625, + "grad_norm": 2.2663304776159086, + "learning_rate": 6.288975495717124e-06, + "loss": 0.3689, + "step": 3098 + }, + { + "epoch": 0.4346423562412342, + "grad_norm": 2.886574871932473, + "learning_rate": 6.286780882184233e-06, + "loss": 0.3859, + "step": 3099 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 2.4017684619869413, + "learning_rate": 6.284586003133096e-06, + "loss": 0.4306, + "step": 3100 + }, + { + "epoch": 0.4349228611500701, + "grad_norm": 2.194044622650793, + "learning_rate": 6.282390859016613e-06, + "loss": 0.3833, + "step": 3101 + }, + { + "epoch": 0.43506311360448807, + "grad_norm": 2.837959010435082, + "learning_rate": 6.280195450287736e-06, + "loss": 0.3677, + "step": 3102 + }, + { + "epoch": 0.435203366058906, + "grad_norm": 1.804763396017228, + "learning_rate": 6.277999777399473e-06, + "loss": 0.3379, + "step": 3103 + }, + { + "epoch": 0.435343618513324, + "grad_norm": 5.791890348144692, + "learning_rate": 6.2758038408048825e-06, + "loss": 0.3717, + "step": 3104 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 2.085206778904916, + "learning_rate": 6.273607640957085e-06, + "loss": 0.3592, + "step": 3105 + }, + { + "epoch": 0.4356241234221599, + "grad_norm": 2.217525412877289, + "learning_rate": 6.271411178309247e-06, + "loss": 0.3905, + "step": 3106 + }, + { + "epoch": 0.43576437587657785, + "grad_norm": 2.4927270000400195, + "learning_rate": 6.269214453314596e-06, + "loss": 0.3468, + "step": 3107 + }, + { + "epoch": 0.4359046283309958, + "grad_norm": 2.0800766349393136, + "learning_rate": 6.267017466426411e-06, + "loss": 0.3826, + "step": 3108 + }, + { + "epoch": 0.43604488078541376, + "grad_norm": 1.8737999572608952, + "learning_rate": 6.264820218098022e-06, + "loss": 0.3359, + "step": 3109 + }, + { + "epoch": 0.4361851332398317, + "grad_norm": 2.1993587036049402, + "learning_rate": 6.262622708782818e-06, + "loss": 0.3751, + "step": 3110 + }, + { + "epoch": 0.43632538569424967, + "grad_norm": 1.914893456525548, + "learning_rate": 6.260424938934241e-06, + "loss": 0.3594, + "step": 3111 + }, + { + "epoch": 0.4364656381486676, + "grad_norm": 1.7085917517742373, + "learning_rate": 6.258226909005783e-06, + "loss": 0.3283, + "step": 3112 + }, + { + "epoch": 0.4366058906030856, + "grad_norm": 2.7808260648146774, + "learning_rate": 6.256028619450993e-06, + "loss": 0.347, + "step": 3113 + }, + { + "epoch": 0.43674614305750353, + "grad_norm": 2.1780719854615778, + "learning_rate": 6.253830070723472e-06, + "loss": 0.4053, + "step": 3114 + }, + { + "epoch": 0.43688639551192143, + "grad_norm": 1.7256471442791652, + "learning_rate": 6.251631263276877e-06, + "loss": 0.3547, + "step": 3115 + }, + { + "epoch": 0.4370266479663394, + "grad_norm": 2.063718378112415, + "learning_rate": 6.2494321975649155e-06, + "loss": 0.4002, + "step": 3116 + }, + { + "epoch": 0.43716690042075734, + "grad_norm": 2.5092678897558574, + "learning_rate": 6.247232874041348e-06, + "loss": 0.399, + "step": 3117 + }, + { + "epoch": 0.4373071528751753, + "grad_norm": 4.5309753334722735, + "learning_rate": 6.2450332931599926e-06, + "loss": 0.3513, + "step": 3118 + }, + { + "epoch": 0.43744740532959325, + "grad_norm": 2.0605368085136218, + "learning_rate": 6.2428334553747135e-06, + "loss": 0.3814, + "step": 3119 + }, + { + "epoch": 0.4375876577840112, + "grad_norm": 1.8861750417569874, + "learning_rate": 6.240633361139435e-06, + "loss": 0.3613, + "step": 3120 + }, + { + "epoch": 0.43772791023842916, + "grad_norm": 1.9643117135048858, + "learning_rate": 6.238433010908131e-06, + "loss": 0.368, + "step": 3121 + }, + { + "epoch": 0.4378681626928471, + "grad_norm": 3.468365209733309, + "learning_rate": 6.236232405134827e-06, + "loss": 0.3747, + "step": 3122 + }, + { + "epoch": 0.4380084151472651, + "grad_norm": 2.1193137665083635, + "learning_rate": 6.234031544273602e-06, + "loss": 0.3932, + "step": 3123 + }, + { + "epoch": 0.43814866760168303, + "grad_norm": 2.70094811429153, + "learning_rate": 6.23183042877859e-06, + "loss": 0.4386, + "step": 3124 + }, + { + "epoch": 0.438288920056101, + "grad_norm": 1.5011661288220675, + "learning_rate": 6.229629059103975e-06, + "loss": 0.3572, + "step": 3125 + }, + { + "epoch": 0.43842917251051894, + "grad_norm": 2.4942781773382947, + "learning_rate": 6.227427435703997e-06, + "loss": 0.3713, + "step": 3126 + }, + { + "epoch": 0.4385694249649369, + "grad_norm": 2.014961665808795, + "learning_rate": 6.225225559032941e-06, + "loss": 0.3816, + "step": 3127 + }, + { + "epoch": 0.43870967741935485, + "grad_norm": 2.5368756545602387, + "learning_rate": 6.223023429545152e-06, + "loss": 0.3384, + "step": 3128 + }, + { + "epoch": 0.4388499298737728, + "grad_norm": 1.8690475073951176, + "learning_rate": 6.2208210476950215e-06, + "loss": 0.376, + "step": 3129 + }, + { + "epoch": 0.43899018232819076, + "grad_norm": 2.0655791677217845, + "learning_rate": 6.218618413936999e-06, + "loss": 0.3723, + "step": 3130 + }, + { + "epoch": 0.4391304347826087, + "grad_norm": 2.653944563343457, + "learning_rate": 6.216415528725579e-06, + "loss": 0.3617, + "step": 3131 + }, + { + "epoch": 0.43927068723702667, + "grad_norm": 2.4351836716078177, + "learning_rate": 6.2142123925153135e-06, + "loss": 0.4117, + "step": 3132 + }, + { + "epoch": 0.4394109396914446, + "grad_norm": 1.7904881956025063, + "learning_rate": 6.212009005760805e-06, + "loss": 0.3873, + "step": 3133 + }, + { + "epoch": 0.4395511921458626, + "grad_norm": 2.9173768041860035, + "learning_rate": 6.209805368916705e-06, + "loss": 0.3762, + "step": 3134 + }, + { + "epoch": 0.4396914446002805, + "grad_norm": 2.36465288191589, + "learning_rate": 6.207601482437719e-06, + "loss": 0.3669, + "step": 3135 + }, + { + "epoch": 0.43983169705469843, + "grad_norm": 2.1025277229141497, + "learning_rate": 6.2053973467786065e-06, + "loss": 0.4253, + "step": 3136 + }, + { + "epoch": 0.4399719495091164, + "grad_norm": 3.1177516034852997, + "learning_rate": 6.203192962394171e-06, + "loss": 0.3776, + "step": 3137 + }, + { + "epoch": 0.44011220196353434, + "grad_norm": 2.5542018790460004, + "learning_rate": 6.200988329739275e-06, + "loss": 0.3614, + "step": 3138 + }, + { + "epoch": 0.4402524544179523, + "grad_norm": 2.010436247366258, + "learning_rate": 6.198783449268827e-06, + "loss": 0.3418, + "step": 3139 + }, + { + "epoch": 0.44039270687237025, + "grad_norm": 1.915634132848877, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.3876, + "step": 3140 + }, + { + "epoch": 0.4405329593267882, + "grad_norm": 2.0789454855329006, + "learning_rate": 6.194372946701176e-06, + "loss": 0.371, + "step": 3141 + }, + { + "epoch": 0.44067321178120616, + "grad_norm": 1.943198652666048, + "learning_rate": 6.192167325514049e-06, + "loss": 0.3923, + "step": 3142 + }, + { + "epoch": 0.4408134642356241, + "grad_norm": 2.4950206760321896, + "learning_rate": 6.189961458331523e-06, + "loss": 0.3757, + "step": 3143 + }, + { + "epoch": 0.4409537166900421, + "grad_norm": 1.8742401490733025, + "learning_rate": 6.1877553456087655e-06, + "loss": 0.3563, + "step": 3144 + }, + { + "epoch": 0.44109396914446003, + "grad_norm": 2.412410430036062, + "learning_rate": 6.1855489878009885e-06, + "loss": 0.3976, + "step": 3145 + }, + { + "epoch": 0.441234221598878, + "grad_norm": 1.6552014180497685, + "learning_rate": 6.183342385363462e-06, + "loss": 0.3718, + "step": 3146 + }, + { + "epoch": 0.44137447405329594, + "grad_norm": 2.472184219020633, + "learning_rate": 6.181135538751504e-06, + "loss": 0.3734, + "step": 3147 + }, + { + "epoch": 0.4415147265077139, + "grad_norm": 1.8066278240678253, + "learning_rate": 6.178928448420476e-06, + "loss": 0.3548, + "step": 3148 + }, + { + "epoch": 0.44165497896213185, + "grad_norm": 2.868216597556832, + "learning_rate": 6.176721114825802e-06, + "loss": 0.3595, + "step": 3149 + }, + { + "epoch": 0.4417952314165498, + "grad_norm": 2.332550042240132, + "learning_rate": 6.174513538422946e-06, + "loss": 0.357, + "step": 3150 + }, + { + "epoch": 0.44193548387096776, + "grad_norm": 2.5689158253850626, + "learning_rate": 6.172305719667427e-06, + "loss": 0.3589, + "step": 3151 + }, + { + "epoch": 0.4420757363253857, + "grad_norm": 2.272913082503428, + "learning_rate": 6.170097659014812e-06, + "loss": 0.3328, + "step": 3152 + }, + { + "epoch": 0.44221598877980367, + "grad_norm": 2.108863599031784, + "learning_rate": 6.167889356920722e-06, + "loss": 0.3446, + "step": 3153 + }, + { + "epoch": 0.4423562412342216, + "grad_norm": 3.0872897118990554, + "learning_rate": 6.165680813840822e-06, + "loss": 0.3452, + "step": 3154 + }, + { + "epoch": 0.4424964936886395, + "grad_norm": 2.235750002033823, + "learning_rate": 6.163472030230831e-06, + "loss": 0.3674, + "step": 3155 + }, + { + "epoch": 0.4426367461430575, + "grad_norm": 2.295988054364062, + "learning_rate": 6.161263006546513e-06, + "loss": 0.402, + "step": 3156 + }, + { + "epoch": 0.44277699859747544, + "grad_norm": 2.0929471654515854, + "learning_rate": 6.159053743243689e-06, + "loss": 0.3756, + "step": 3157 + }, + { + "epoch": 0.4429172510518934, + "grad_norm": 2.321635065221947, + "learning_rate": 6.156844240778221e-06, + "loss": 0.349, + "step": 3158 + }, + { + "epoch": 0.44305750350631135, + "grad_norm": 2.719119255158404, + "learning_rate": 6.1546344996060294e-06, + "loss": 0.3586, + "step": 3159 + }, + { + "epoch": 0.4431977559607293, + "grad_norm": 2.2467076267089383, + "learning_rate": 6.152424520183072e-06, + "loss": 0.3601, + "step": 3160 + }, + { + "epoch": 0.44333800841514726, + "grad_norm": 5.078444814147996, + "learning_rate": 6.150214302965368e-06, + "loss": 0.3603, + "step": 3161 + }, + { + "epoch": 0.4434782608695652, + "grad_norm": 2.7884946613722366, + "learning_rate": 6.148003848408979e-06, + "loss": 0.3826, + "step": 3162 + }, + { + "epoch": 0.44361851332398317, + "grad_norm": 2.202152984682112, + "learning_rate": 6.145793156970017e-06, + "loss": 0.3122, + "step": 3163 + }, + { + "epoch": 0.4437587657784011, + "grad_norm": 2.5317235433063074, + "learning_rate": 6.143582229104641e-06, + "loss": 0.382, + "step": 3164 + }, + { + "epoch": 0.4438990182328191, + "grad_norm": 2.4112428660902743, + "learning_rate": 6.141371065269061e-06, + "loss": 0.403, + "step": 3165 + }, + { + "epoch": 0.44403927068723703, + "grad_norm": 3.437184406011725, + "learning_rate": 6.1391596659195366e-06, + "loss": 0.3423, + "step": 3166 + }, + { + "epoch": 0.444179523141655, + "grad_norm": 2.384027090225938, + "learning_rate": 6.136948031512375e-06, + "loss": 0.3771, + "step": 3167 + }, + { + "epoch": 0.44431977559607294, + "grad_norm": 1.8979953831896892, + "learning_rate": 6.134736162503929e-06, + "loss": 0.3486, + "step": 3168 + }, + { + "epoch": 0.4444600280504909, + "grad_norm": 3.7057591336777325, + "learning_rate": 6.132524059350607e-06, + "loss": 0.3708, + "step": 3169 + }, + { + "epoch": 0.44460028050490885, + "grad_norm": 1.99404198592203, + "learning_rate": 6.130311722508854e-06, + "loss": 0.3698, + "step": 3170 + }, + { + "epoch": 0.4447405329593268, + "grad_norm": 2.089432092925555, + "learning_rate": 6.128099152435175e-06, + "loss": 0.3076, + "step": 3171 + }, + { + "epoch": 0.44488078541374476, + "grad_norm": 2.2652376041037403, + "learning_rate": 6.125886349586117e-06, + "loss": 0.3843, + "step": 3172 + }, + { + "epoch": 0.4450210378681627, + "grad_norm": 1.817629625716848, + "learning_rate": 6.123673314418277e-06, + "loss": 0.3865, + "step": 3173 + }, + { + "epoch": 0.44516129032258067, + "grad_norm": 2.3003070128823295, + "learning_rate": 6.121460047388301e-06, + "loss": 0.3545, + "step": 3174 + }, + { + "epoch": 0.44530154277699857, + "grad_norm": 3.1265590593356256, + "learning_rate": 6.119246548952877e-06, + "loss": 0.381, + "step": 3175 + }, + { + "epoch": 0.4454417952314165, + "grad_norm": 2.597207304514465, + "learning_rate": 6.117032819568749e-06, + "loss": 0.4274, + "step": 3176 + }, + { + "epoch": 0.4455820476858345, + "grad_norm": 5.478136998112841, + "learning_rate": 6.114818859692701e-06, + "loss": 0.3614, + "step": 3177 + }, + { + "epoch": 0.44572230014025244, + "grad_norm": 2.1964999751093823, + "learning_rate": 6.112604669781572e-06, + "loss": 0.3703, + "step": 3178 + }, + { + "epoch": 0.4458625525946704, + "grad_norm": 2.591953712668207, + "learning_rate": 6.110390250292244e-06, + "loss": 0.3462, + "step": 3179 + }, + { + "epoch": 0.44600280504908835, + "grad_norm": 2.5600698159055724, + "learning_rate": 6.108175601681643e-06, + "loss": 0.3505, + "step": 3180 + }, + { + "epoch": 0.4461430575035063, + "grad_norm": 2.718778851235431, + "learning_rate": 6.1059607244067485e-06, + "loss": 0.3362, + "step": 3181 + }, + { + "epoch": 0.44628330995792426, + "grad_norm": 2.580673253094917, + "learning_rate": 6.103745618924587e-06, + "loss": 0.3338, + "step": 3182 + }, + { + "epoch": 0.4464235624123422, + "grad_norm": 3.5236694530254353, + "learning_rate": 6.101530285692228e-06, + "loss": 0.3945, + "step": 3183 + }, + { + "epoch": 0.44656381486676017, + "grad_norm": 2.38945786651994, + "learning_rate": 6.09931472516679e-06, + "loss": 0.3596, + "step": 3184 + }, + { + "epoch": 0.4467040673211781, + "grad_norm": 1.5620214110798465, + "learning_rate": 6.097098937805439e-06, + "loss": 0.3664, + "step": 3185 + }, + { + "epoch": 0.4468443197755961, + "grad_norm": 1.8800200969555259, + "learning_rate": 6.094882924065387e-06, + "loss": 0.3533, + "step": 3186 + }, + { + "epoch": 0.44698457223001403, + "grad_norm": 1.897013754381057, + "learning_rate": 6.092666684403893e-06, + "loss": 0.3512, + "step": 3187 + }, + { + "epoch": 0.447124824684432, + "grad_norm": 2.0445609953847983, + "learning_rate": 6.090450219278264e-06, + "loss": 0.3314, + "step": 3188 + }, + { + "epoch": 0.44726507713884994, + "grad_norm": 3.0185093501427844, + "learning_rate": 6.088233529145849e-06, + "loss": 0.3504, + "step": 3189 + }, + { + "epoch": 0.4474053295932679, + "grad_norm": 1.8722599353381495, + "learning_rate": 6.08601661446405e-06, + "loss": 0.3604, + "step": 3190 + }, + { + "epoch": 0.44754558204768585, + "grad_norm": 2.0700753705958412, + "learning_rate": 6.08379947569031e-06, + "loss": 0.3827, + "step": 3191 + }, + { + "epoch": 0.4476858345021038, + "grad_norm": 2.4587920994686443, + "learning_rate": 6.081582113282118e-06, + "loss": 0.3639, + "step": 3192 + }, + { + "epoch": 0.44782608695652176, + "grad_norm": 1.998231582829719, + "learning_rate": 6.0793645276970145e-06, + "loss": 0.4057, + "step": 3193 + }, + { + "epoch": 0.4479663394109397, + "grad_norm": 4.129201425559402, + "learning_rate": 6.077146719392582e-06, + "loss": 0.3539, + "step": 3194 + }, + { + "epoch": 0.4481065918653576, + "grad_norm": 2.300687260157076, + "learning_rate": 6.07492868882645e-06, + "loss": 0.3819, + "step": 3195 + }, + { + "epoch": 0.4482468443197756, + "grad_norm": 2.4248465735281823, + "learning_rate": 6.072710436456293e-06, + "loss": 0.3839, + "step": 3196 + }, + { + "epoch": 0.4483870967741935, + "grad_norm": 2.1497907226197652, + "learning_rate": 6.070491962739831e-06, + "loss": 0.377, + "step": 3197 + }, + { + "epoch": 0.4485273492286115, + "grad_norm": 2.6192772564392364, + "learning_rate": 6.068273268134832e-06, + "loss": 0.3142, + "step": 3198 + }, + { + "epoch": 0.44866760168302944, + "grad_norm": 2.0445735893486354, + "learning_rate": 6.066054353099109e-06, + "loss": 0.3713, + "step": 3199 + }, + { + "epoch": 0.4488078541374474, + "grad_norm": 1.8651355018174338, + "learning_rate": 6.063835218090517e-06, + "loss": 0.3405, + "step": 3200 + }, + { + "epoch": 0.44894810659186535, + "grad_norm": 1.9812117467610952, + "learning_rate": 6.061615863566961e-06, + "loss": 0.386, + "step": 3201 + }, + { + "epoch": 0.4490883590462833, + "grad_norm": 2.090148903502393, + "learning_rate": 6.059396289986386e-06, + "loss": 0.3842, + "step": 3202 + }, + { + "epoch": 0.44922861150070126, + "grad_norm": 2.192210112908474, + "learning_rate": 6.057176497806791e-06, + "loss": 0.3693, + "step": 3203 + }, + { + "epoch": 0.4493688639551192, + "grad_norm": 2.4998645745790613, + "learning_rate": 6.054956487486212e-06, + "loss": 0.3648, + "step": 3204 + }, + { + "epoch": 0.44950911640953717, + "grad_norm": 1.9365350720031556, + "learning_rate": 6.05273625948273e-06, + "loss": 0.3431, + "step": 3205 + }, + { + "epoch": 0.4496493688639551, + "grad_norm": 2.1014495025093227, + "learning_rate": 6.050515814254477e-06, + "loss": 0.3537, + "step": 3206 + }, + { + "epoch": 0.4497896213183731, + "grad_norm": 2.9500370734519947, + "learning_rate": 6.0482951522596245e-06, + "loss": 0.3943, + "step": 3207 + }, + { + "epoch": 0.44992987377279103, + "grad_norm": 2.245159133844503, + "learning_rate": 6.046074273956392e-06, + "loss": 0.3545, + "step": 3208 + }, + { + "epoch": 0.450070126227209, + "grad_norm": 5.0542825952344845, + "learning_rate": 6.043853179803042e-06, + "loss": 0.3886, + "step": 3209 + }, + { + "epoch": 0.45021037868162694, + "grad_norm": 1.9314116524505913, + "learning_rate": 6.041631870257882e-06, + "loss": 0.3453, + "step": 3210 + }, + { + "epoch": 0.4503506311360449, + "grad_norm": 2.0703286224313278, + "learning_rate": 6.039410345779262e-06, + "loss": 0.3513, + "step": 3211 + }, + { + "epoch": 0.45049088359046285, + "grad_norm": 1.9566098062072412, + "learning_rate": 6.037188606825578e-06, + "loss": 0.3484, + "step": 3212 + }, + { + "epoch": 0.4506311360448808, + "grad_norm": 1.8968896403368298, + "learning_rate": 6.034966653855272e-06, + "loss": 0.3608, + "step": 3213 + }, + { + "epoch": 0.45077138849929876, + "grad_norm": 2.3166707548841816, + "learning_rate": 6.032744487326827e-06, + "loss": 0.3397, + "step": 3214 + }, + { + "epoch": 0.45091164095371666, + "grad_norm": 3.0505120894988615, + "learning_rate": 6.030522107698775e-06, + "loss": 0.3851, + "step": 3215 + }, + { + "epoch": 0.4510518934081346, + "grad_norm": 2.2994873346260714, + "learning_rate": 6.028299515429683e-06, + "loss": 0.3833, + "step": 3216 + }, + { + "epoch": 0.4511921458625526, + "grad_norm": 2.173624774644811, + "learning_rate": 6.026076710978172e-06, + "loss": 0.3976, + "step": 3217 + }, + { + "epoch": 0.45133239831697053, + "grad_norm": 2.659105359403506, + "learning_rate": 6.023853694802899e-06, + "loss": 0.3805, + "step": 3218 + }, + { + "epoch": 0.4514726507713885, + "grad_norm": 1.9559256653458905, + "learning_rate": 6.021630467362571e-06, + "loss": 0.3572, + "step": 3219 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 1.9863725355316788, + "learning_rate": 6.0194070291159346e-06, + "loss": 0.3404, + "step": 3220 + }, + { + "epoch": 0.4517531556802244, + "grad_norm": 2.4565558284391837, + "learning_rate": 6.017183380521777e-06, + "loss": 0.3639, + "step": 3221 + }, + { + "epoch": 0.45189340813464235, + "grad_norm": 1.7592824935308335, + "learning_rate": 6.014959522038937e-06, + "loss": 0.3115, + "step": 3222 + }, + { + "epoch": 0.4520336605890603, + "grad_norm": 2.2539731020739904, + "learning_rate": 6.012735454126289e-06, + "loss": 0.3586, + "step": 3223 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 2.2256324297880847, + "learning_rate": 6.010511177242757e-06, + "loss": 0.3765, + "step": 3224 + }, + { + "epoch": 0.4523141654978962, + "grad_norm": 2.038525039171005, + "learning_rate": 6.008286691847305e-06, + "loss": 0.307, + "step": 3225 + }, + { + "epoch": 0.45245441795231417, + "grad_norm": 2.0037489563480544, + "learning_rate": 6.006061998398937e-06, + "loss": 0.3606, + "step": 3226 + }, + { + "epoch": 0.4525946704067321, + "grad_norm": 4.9308927728137935, + "learning_rate": 6.003837097356704e-06, + "loss": 0.3522, + "step": 3227 + }, + { + "epoch": 0.4527349228611501, + "grad_norm": 2.296342852962521, + "learning_rate": 6.0016119891797e-06, + "loss": 0.3778, + "step": 3228 + }, + { + "epoch": 0.45287517531556803, + "grad_norm": 2.4191128638479578, + "learning_rate": 5.999386674327059e-06, + "loss": 0.3674, + "step": 3229 + }, + { + "epoch": 0.453015427769986, + "grad_norm": 2.405635606678439, + "learning_rate": 5.997161153257963e-06, + "loss": 0.337, + "step": 3230 + }, + { + "epoch": 0.45315568022440395, + "grad_norm": 2.820683927725578, + "learning_rate": 5.994935426431627e-06, + "loss": 0.3826, + "step": 3231 + }, + { + "epoch": 0.4532959326788219, + "grad_norm": 2.048661362486077, + "learning_rate": 5.992709494307317e-06, + "loss": 0.3206, + "step": 3232 + }, + { + "epoch": 0.45343618513323986, + "grad_norm": 2.0968902723776677, + "learning_rate": 5.9904833573443385e-06, + "loss": 0.3615, + "step": 3233 + }, + { + "epoch": 0.4535764375876578, + "grad_norm": 2.0925976443096888, + "learning_rate": 5.9882570160020395e-06, + "loss": 0.4131, + "step": 3234 + }, + { + "epoch": 0.4537166900420757, + "grad_norm": 1.9800916685212002, + "learning_rate": 5.986030470739811e-06, + "loss": 0.3396, + "step": 3235 + }, + { + "epoch": 0.45385694249649366, + "grad_norm": 1.996309809926368, + "learning_rate": 5.983803722017083e-06, + "loss": 0.3675, + "step": 3236 + }, + { + "epoch": 0.4539971949509116, + "grad_norm": 2.26499235594009, + "learning_rate": 5.981576770293329e-06, + "loss": 0.3618, + "step": 3237 + }, + { + "epoch": 0.4541374474053296, + "grad_norm": 2.2277240065979345, + "learning_rate": 5.979349616028067e-06, + "loss": 0.3764, + "step": 3238 + }, + { + "epoch": 0.45427769985974753, + "grad_norm": 2.2412833806715216, + "learning_rate": 5.977122259680854e-06, + "loss": 0.3784, + "step": 3239 + }, + { + "epoch": 0.4544179523141655, + "grad_norm": 1.7765281734320806, + "learning_rate": 5.974894701711291e-06, + "loss": 0.3448, + "step": 3240 + }, + { + "epoch": 0.45455820476858344, + "grad_norm": 2.0283248967434355, + "learning_rate": 5.9726669425790175e-06, + "loss": 0.3953, + "step": 3241 + }, + { + "epoch": 0.4546984572230014, + "grad_norm": 1.721646036478072, + "learning_rate": 5.970438982743715e-06, + "loss": 0.3801, + "step": 3242 + }, + { + "epoch": 0.45483870967741935, + "grad_norm": 2.0502169750108896, + "learning_rate": 5.9682108226651084e-06, + "loss": 0.3713, + "step": 3243 + }, + { + "epoch": 0.4549789621318373, + "grad_norm": 2.258007318348002, + "learning_rate": 5.965982462802962e-06, + "loss": 0.3648, + "step": 3244 + }, + { + "epoch": 0.45511921458625526, + "grad_norm": 2.885161809763547, + "learning_rate": 5.963753903617084e-06, + "loss": 0.3749, + "step": 3245 + }, + { + "epoch": 0.4552594670406732, + "grad_norm": 3.2923500864575255, + "learning_rate": 5.961525145567322e-06, + "loss": 0.411, + "step": 3246 + }, + { + "epoch": 0.45539971949509117, + "grad_norm": 1.8702406561110336, + "learning_rate": 5.959296189113563e-06, + "loss": 0.3308, + "step": 3247 + }, + { + "epoch": 0.4555399719495091, + "grad_norm": 2.058069730654273, + "learning_rate": 5.9570670347157375e-06, + "loss": 0.3714, + "step": 3248 + }, + { + "epoch": 0.4556802244039271, + "grad_norm": 1.7925590096820072, + "learning_rate": 5.954837682833816e-06, + "loss": 0.3584, + "step": 3249 + }, + { + "epoch": 0.45582047685834504, + "grad_norm": 2.027971174287306, + "learning_rate": 5.95260813392781e-06, + "loss": 0.3622, + "step": 3250 + }, + { + "epoch": 0.455960729312763, + "grad_norm": 2.189689956909735, + "learning_rate": 5.950378388457774e-06, + "loss": 0.3396, + "step": 3251 + }, + { + "epoch": 0.45610098176718095, + "grad_norm": 1.9452521379903005, + "learning_rate": 5.948148446883794e-06, + "loss": 0.3721, + "step": 3252 + }, + { + "epoch": 0.4562412342215989, + "grad_norm": 2.15293961877132, + "learning_rate": 5.945918309666005e-06, + "loss": 0.3749, + "step": 3253 + }, + { + "epoch": 0.45638148667601686, + "grad_norm": 2.0038506870114303, + "learning_rate": 5.943687977264584e-06, + "loss": 0.3247, + "step": 3254 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 2.0209705529047195, + "learning_rate": 5.941457450139741e-06, + "loss": 0.3809, + "step": 3255 + }, + { + "epoch": 0.4566619915848527, + "grad_norm": 2.192862776318144, + "learning_rate": 5.939226728751733e-06, + "loss": 0.3788, + "step": 3256 + }, + { + "epoch": 0.45680224403927067, + "grad_norm": 1.7903194093495989, + "learning_rate": 5.9369958135608485e-06, + "loss": 0.3242, + "step": 3257 + }, + { + "epoch": 0.4569424964936886, + "grad_norm": 2.9390605271458026, + "learning_rate": 5.934764705027425e-06, + "loss": 0.3575, + "step": 3258 + }, + { + "epoch": 0.4570827489481066, + "grad_norm": 1.9489514123605827, + "learning_rate": 5.932533403611835e-06, + "loss": 0.4118, + "step": 3259 + }, + { + "epoch": 0.45722300140252453, + "grad_norm": 6.0008687344106235, + "learning_rate": 5.930301909774494e-06, + "loss": 0.3886, + "step": 3260 + }, + { + "epoch": 0.4573632538569425, + "grad_norm": 2.08134577991854, + "learning_rate": 5.928070223975853e-06, + "loss": 0.309, + "step": 3261 + }, + { + "epoch": 0.45750350631136044, + "grad_norm": 2.162594293732569, + "learning_rate": 5.925838346676405e-06, + "loss": 0.3486, + "step": 3262 + }, + { + "epoch": 0.4576437587657784, + "grad_norm": 1.9173750881207732, + "learning_rate": 5.9236062783366825e-06, + "loss": 0.4128, + "step": 3263 + }, + { + "epoch": 0.45778401122019635, + "grad_norm": 2.1152535364581087, + "learning_rate": 5.9213740194172565e-06, + "loss": 0.3415, + "step": 3264 + }, + { + "epoch": 0.4579242636746143, + "grad_norm": 1.8453330663040461, + "learning_rate": 5.919141570378739e-06, + "loss": 0.3868, + "step": 3265 + }, + { + "epoch": 0.45806451612903226, + "grad_norm": 1.7771971254337957, + "learning_rate": 5.916908931681781e-06, + "loss": 0.341, + "step": 3266 + }, + { + "epoch": 0.4582047685834502, + "grad_norm": 2.219529136004141, + "learning_rate": 5.914676103787071e-06, + "loss": 0.3448, + "step": 3267 + }, + { + "epoch": 0.45834502103786817, + "grad_norm": 2.2818127290811847, + "learning_rate": 5.912443087155336e-06, + "loss": 0.3281, + "step": 3268 + }, + { + "epoch": 0.4584852734922861, + "grad_norm": 2.213883987585131, + "learning_rate": 5.910209882247346e-06, + "loss": 0.3873, + "step": 3269 + }, + { + "epoch": 0.4586255259467041, + "grad_norm": 1.802093821882768, + "learning_rate": 5.9079764895239066e-06, + "loss": 0.346, + "step": 3270 + }, + { + "epoch": 0.45876577840112204, + "grad_norm": 2.5466901091883067, + "learning_rate": 5.905742909445863e-06, + "loss": 0.283, + "step": 3271 + }, + { + "epoch": 0.45890603085554, + "grad_norm": 2.3335159661707436, + "learning_rate": 5.903509142474095e-06, + "loss": 0.3718, + "step": 3272 + }, + { + "epoch": 0.45904628330995795, + "grad_norm": 2.0416784351035653, + "learning_rate": 5.90127518906953e-06, + "loss": 0.4019, + "step": 3273 + }, + { + "epoch": 0.4591865357643759, + "grad_norm": 2.228976256417363, + "learning_rate": 5.899041049693125e-06, + "loss": 0.3804, + "step": 3274 + }, + { + "epoch": 0.4593267882187938, + "grad_norm": 1.8191624229522616, + "learning_rate": 5.896806724805881e-06, + "loss": 0.3927, + "step": 3275 + }, + { + "epoch": 0.45946704067321176, + "grad_norm": 2.4887976477376133, + "learning_rate": 5.894572214868837e-06, + "loss": 0.4222, + "step": 3276 + }, + { + "epoch": 0.4596072931276297, + "grad_norm": 3.287435024765238, + "learning_rate": 5.8923375203430645e-06, + "loss": 0.396, + "step": 3277 + }, + { + "epoch": 0.45974754558204767, + "grad_norm": 2.8422171685010915, + "learning_rate": 5.890102641689679e-06, + "loss": 0.358, + "step": 3278 + }, + { + "epoch": 0.4598877980364656, + "grad_norm": 2.2139045567465314, + "learning_rate": 5.887867579369833e-06, + "loss": 0.3446, + "step": 3279 + }, + { + "epoch": 0.4600280504908836, + "grad_norm": 1.5319630266762803, + "learning_rate": 5.885632333844714e-06, + "loss": 0.3255, + "step": 3280 + }, + { + "epoch": 0.46016830294530153, + "grad_norm": 3.415389093487316, + "learning_rate": 5.883396905575552e-06, + "loss": 0.3676, + "step": 3281 + }, + { + "epoch": 0.4603085553997195, + "grad_norm": 2.042504923521572, + "learning_rate": 5.88116129502361e-06, + "loss": 0.3608, + "step": 3282 + }, + { + "epoch": 0.46044880785413744, + "grad_norm": 2.049226531211399, + "learning_rate": 5.87892550265019e-06, + "loss": 0.352, + "step": 3283 + }, + { + "epoch": 0.4605890603085554, + "grad_norm": 2.1610039535973504, + "learning_rate": 5.876689528916634e-06, + "loss": 0.3339, + "step": 3284 + }, + { + "epoch": 0.46072931276297335, + "grad_norm": 2.019556750020344, + "learning_rate": 5.874453374284318e-06, + "loss": 0.387, + "step": 3285 + }, + { + "epoch": 0.4608695652173913, + "grad_norm": 1.742825994112724, + "learning_rate": 5.872217039214659e-06, + "loss": 0.3381, + "step": 3286 + }, + { + "epoch": 0.46100981767180926, + "grad_norm": 1.7603801369874148, + "learning_rate": 5.8699805241691065e-06, + "loss": 0.3462, + "step": 3287 + }, + { + "epoch": 0.4611500701262272, + "grad_norm": 2.0988487675722145, + "learning_rate": 5.867743829609152e-06, + "loss": 0.3237, + "step": 3288 + }, + { + "epoch": 0.4612903225806452, + "grad_norm": 1.8856515334161965, + "learning_rate": 5.86550695599632e-06, + "loss": 0.3804, + "step": 3289 + }, + { + "epoch": 0.46143057503506313, + "grad_norm": 1.71656480073384, + "learning_rate": 5.863269903792174e-06, + "loss": 0.3695, + "step": 3290 + }, + { + "epoch": 0.4615708274894811, + "grad_norm": 2.19705717327195, + "learning_rate": 5.861032673458316e-06, + "loss": 0.3692, + "step": 3291 + }, + { + "epoch": 0.46171107994389904, + "grad_norm": 2.3173786971887185, + "learning_rate": 5.858795265456382e-06, + "loss": 0.389, + "step": 3292 + }, + { + "epoch": 0.461851332398317, + "grad_norm": 2.1148940611295464, + "learning_rate": 5.856557680248043e-06, + "loss": 0.3433, + "step": 3293 + }, + { + "epoch": 0.46199158485273495, + "grad_norm": 1.6152004252213574, + "learning_rate": 5.854319918295012e-06, + "loss": 0.3476, + "step": 3294 + }, + { + "epoch": 0.46213183730715285, + "grad_norm": 2.6472973014891856, + "learning_rate": 5.8520819800590345e-06, + "loss": 0.3691, + "step": 3295 + }, + { + "epoch": 0.4622720897615708, + "grad_norm": 2.026335537215173, + "learning_rate": 5.849843866001893e-06, + "loss": 0.3845, + "step": 3296 + }, + { + "epoch": 0.46241234221598876, + "grad_norm": 2.122171539125287, + "learning_rate": 5.847605576585409e-06, + "loss": 0.3165, + "step": 3297 + }, + { + "epoch": 0.4625525946704067, + "grad_norm": 1.8328678522751738, + "learning_rate": 5.845367112271434e-06, + "loss": 0.3598, + "step": 3298 + }, + { + "epoch": 0.46269284712482467, + "grad_norm": 6.689192228032785, + "learning_rate": 5.843128473521863e-06, + "loss": 0.3743, + "step": 3299 + }, + { + "epoch": 0.4628330995792426, + "grad_norm": 1.911267914672481, + "learning_rate": 5.840889660798621e-06, + "loss": 0.3616, + "step": 3300 + }, + { + "epoch": 0.4629733520336606, + "grad_norm": 1.772081840054675, + "learning_rate": 5.838650674563674e-06, + "loss": 0.4195, + "step": 3301 + }, + { + "epoch": 0.46311360448807853, + "grad_norm": 1.750050748361724, + "learning_rate": 5.836411515279018e-06, + "loss": 0.3369, + "step": 3302 + }, + { + "epoch": 0.4632538569424965, + "grad_norm": 3.2578135245708073, + "learning_rate": 5.834172183406691e-06, + "loss": 0.3728, + "step": 3303 + }, + { + "epoch": 0.46339410939691444, + "grad_norm": 2.5216723430373835, + "learning_rate": 5.831932679408761e-06, + "loss": 0.4086, + "step": 3304 + }, + { + "epoch": 0.4635343618513324, + "grad_norm": 1.7575140127434614, + "learning_rate": 5.829693003747334e-06, + "loss": 0.3579, + "step": 3305 + }, + { + "epoch": 0.46367461430575035, + "grad_norm": 1.8340550938175775, + "learning_rate": 5.827453156884553e-06, + "loss": 0.3864, + "step": 3306 + }, + { + "epoch": 0.4638148667601683, + "grad_norm": 1.860122466311265, + "learning_rate": 5.825213139282595e-06, + "loss": 0.3745, + "step": 3307 + }, + { + "epoch": 0.46395511921458626, + "grad_norm": 1.9725781717995015, + "learning_rate": 5.82297295140367e-06, + "loss": 0.3714, + "step": 3308 + }, + { + "epoch": 0.4640953716690042, + "grad_norm": 2.504362686618192, + "learning_rate": 5.820732593710027e-06, + "loss": 0.3386, + "step": 3309 + }, + { + "epoch": 0.4642356241234222, + "grad_norm": 2.666997908841355, + "learning_rate": 5.818492066663947e-06, + "loss": 0.3548, + "step": 3310 + }, + { + "epoch": 0.46437587657784013, + "grad_norm": 2.030195343878973, + "learning_rate": 5.816251370727748e-06, + "loss": 0.3548, + "step": 3311 + }, + { + "epoch": 0.4645161290322581, + "grad_norm": 2.2591310234827477, + "learning_rate": 5.814010506363781e-06, + "loss": 0.3617, + "step": 3312 + }, + { + "epoch": 0.46465638148667604, + "grad_norm": 2.0222442525611495, + "learning_rate": 5.811769474034434e-06, + "loss": 0.3739, + "step": 3313 + }, + { + "epoch": 0.464796633941094, + "grad_norm": 1.7153365485058296, + "learning_rate": 5.8095282742021265e-06, + "loss": 0.3658, + "step": 3314 + }, + { + "epoch": 0.4649368863955119, + "grad_norm": 1.8944831763391643, + "learning_rate": 5.807286907329315e-06, + "loss": 0.3341, + "step": 3315 + }, + { + "epoch": 0.46507713884992985, + "grad_norm": 2.1334005250879535, + "learning_rate": 5.8050453738784905e-06, + "loss": 0.3499, + "step": 3316 + }, + { + "epoch": 0.4652173913043478, + "grad_norm": 1.8840253413818493, + "learning_rate": 5.802803674312178e-06, + "loss": 0.365, + "step": 3317 + }, + { + "epoch": 0.46535764375876576, + "grad_norm": 1.954463652578499, + "learning_rate": 5.800561809092937e-06, + "loss": 0.3251, + "step": 3318 + }, + { + "epoch": 0.4654978962131837, + "grad_norm": 2.4611560542832622, + "learning_rate": 5.798319778683359e-06, + "loss": 0.3462, + "step": 3319 + }, + { + "epoch": 0.46563814866760167, + "grad_norm": 2.0683997189024566, + "learning_rate": 5.796077583546071e-06, + "loss": 0.3972, + "step": 3320 + }, + { + "epoch": 0.4657784011220196, + "grad_norm": 1.7039439480969512, + "learning_rate": 5.793835224143737e-06, + "loss": 0.3339, + "step": 3321 + }, + { + "epoch": 0.4659186535764376, + "grad_norm": 1.9839075694395618, + "learning_rate": 5.79159270093905e-06, + "loss": 0.3833, + "step": 3322 + }, + { + "epoch": 0.46605890603085554, + "grad_norm": 2.1598084430172424, + "learning_rate": 5.78935001439474e-06, + "loss": 0.3361, + "step": 3323 + }, + { + "epoch": 0.4661991584852735, + "grad_norm": 1.9102086721142557, + "learning_rate": 5.787107164973571e-06, + "loss": 0.3955, + "step": 3324 + }, + { + "epoch": 0.46633941093969145, + "grad_norm": 3.303319047986144, + "learning_rate": 5.784864153138335e-06, + "loss": 0.3655, + "step": 3325 + }, + { + "epoch": 0.4664796633941094, + "grad_norm": 3.623106494834591, + "learning_rate": 5.782620979351865e-06, + "loss": 0.3544, + "step": 3326 + }, + { + "epoch": 0.46661991584852736, + "grad_norm": 2.2960837941247556, + "learning_rate": 5.780377644077025e-06, + "loss": 0.3739, + "step": 3327 + }, + { + "epoch": 0.4667601683029453, + "grad_norm": 2.4266344127328043, + "learning_rate": 5.77813414777671e-06, + "loss": 0.396, + "step": 3328 + }, + { + "epoch": 0.46690042075736327, + "grad_norm": 1.9167860933773733, + "learning_rate": 5.7758904909138495e-06, + "loss": 0.3429, + "step": 3329 + }, + { + "epoch": 0.4670406732117812, + "grad_norm": 1.970371063354331, + "learning_rate": 5.773646673951406e-06, + "loss": 0.3456, + "step": 3330 + }, + { + "epoch": 0.4671809256661992, + "grad_norm": 2.556987601938278, + "learning_rate": 5.771402697352377e-06, + "loss": 0.368, + "step": 3331 + }, + { + "epoch": 0.46732117812061713, + "grad_norm": 2.1711919794582464, + "learning_rate": 5.769158561579793e-06, + "loss": 0.3621, + "step": 3332 + }, + { + "epoch": 0.4674614305750351, + "grad_norm": 1.7560727474593718, + "learning_rate": 5.766914267096712e-06, + "loss": 0.3555, + "step": 3333 + }, + { + "epoch": 0.46760168302945304, + "grad_norm": 1.7464173292354848, + "learning_rate": 5.764669814366231e-06, + "loss": 0.3221, + "step": 3334 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 2.2322115794910693, + "learning_rate": 5.762425203851475e-06, + "loss": 0.3751, + "step": 3335 + }, + { + "epoch": 0.4678821879382889, + "grad_norm": 2.2694211529341057, + "learning_rate": 5.760180436015604e-06, + "loss": 0.3624, + "step": 3336 + }, + { + "epoch": 0.46802244039270685, + "grad_norm": 2.2039002481654033, + "learning_rate": 5.7579355113218125e-06, + "loss": 0.3795, + "step": 3337 + }, + { + "epoch": 0.4681626928471248, + "grad_norm": 2.006466781930607, + "learning_rate": 5.7556904302333246e-06, + "loss": 0.4034, + "step": 3338 + }, + { + "epoch": 0.46830294530154276, + "grad_norm": 2.0005510286374535, + "learning_rate": 5.753445193213394e-06, + "loss": 0.3861, + "step": 3339 + }, + { + "epoch": 0.4684431977559607, + "grad_norm": 2.5605942687559446, + "learning_rate": 5.751199800725314e-06, + "loss": 0.3631, + "step": 3340 + }, + { + "epoch": 0.46858345021037867, + "grad_norm": 1.879051155323806, + "learning_rate": 5.748954253232401e-06, + "loss": 0.3753, + "step": 3341 + }, + { + "epoch": 0.4687237026647966, + "grad_norm": 1.6570065048219016, + "learning_rate": 5.7467085511980115e-06, + "loss": 0.365, + "step": 3342 + }, + { + "epoch": 0.4688639551192146, + "grad_norm": 2.313059043325296, + "learning_rate": 5.74446269508553e-06, + "loss": 0.4197, + "step": 3343 + }, + { + "epoch": 0.46900420757363254, + "grad_norm": 1.7720323280847174, + "learning_rate": 5.742216685358373e-06, + "loss": 0.3931, + "step": 3344 + }, + { + "epoch": 0.4691444600280505, + "grad_norm": 1.6759847593001471, + "learning_rate": 5.739970522479986e-06, + "loss": 0.2938, + "step": 3345 + }, + { + "epoch": 0.46928471248246845, + "grad_norm": 3.2631461486540236, + "learning_rate": 5.737724206913853e-06, + "loss": 0.3444, + "step": 3346 + }, + { + "epoch": 0.4694249649368864, + "grad_norm": 1.9549225736735016, + "learning_rate": 5.735477739123484e-06, + "loss": 0.4205, + "step": 3347 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 2.015764571687795, + "learning_rate": 5.7332311195724235e-06, + "loss": 0.3939, + "step": 3348 + }, + { + "epoch": 0.4697054698457223, + "grad_norm": 2.1747607263953275, + "learning_rate": 5.730984348724242e-06, + "loss": 0.3842, + "step": 3349 + }, + { + "epoch": 0.46984572230014027, + "grad_norm": 3.6645903054684834, + "learning_rate": 5.7287374270425475e-06, + "loss": 0.3668, + "step": 3350 + }, + { + "epoch": 0.4699859747545582, + "grad_norm": 2.122060200807133, + "learning_rate": 5.7264903549909765e-06, + "loss": 0.3251, + "step": 3351 + }, + { + "epoch": 0.4701262272089762, + "grad_norm": 1.7407277239582615, + "learning_rate": 5.724243133033197e-06, + "loss": 0.3515, + "step": 3352 + }, + { + "epoch": 0.47026647966339413, + "grad_norm": 2.566781351904343, + "learning_rate": 5.721995761632907e-06, + "loss": 0.3688, + "step": 3353 + }, + { + "epoch": 0.4704067321178121, + "grad_norm": 2.8823840642212755, + "learning_rate": 5.719748241253835e-06, + "loss": 0.3657, + "step": 3354 + }, + { + "epoch": 0.47054698457223, + "grad_norm": 1.6688280237366695, + "learning_rate": 5.717500572359743e-06, + "loss": 0.372, + "step": 3355 + }, + { + "epoch": 0.47068723702664794, + "grad_norm": 2.09412380696328, + "learning_rate": 5.71525275541442e-06, + "loss": 0.3726, + "step": 3356 + }, + { + "epoch": 0.4708274894810659, + "grad_norm": 2.5241915399990997, + "learning_rate": 5.7130047908816884e-06, + "loss": 0.3899, + "step": 3357 + }, + { + "epoch": 0.47096774193548385, + "grad_norm": 1.7718955575332396, + "learning_rate": 5.7107566792254e-06, + "loss": 0.3203, + "step": 3358 + }, + { + "epoch": 0.4711079943899018, + "grad_norm": 2.283924677306611, + "learning_rate": 5.7085084209094365e-06, + "loss": 0.3634, + "step": 3359 + }, + { + "epoch": 0.47124824684431976, + "grad_norm": 2.0808053764611127, + "learning_rate": 5.70626001639771e-06, + "loss": 0.4106, + "step": 3360 + }, + { + "epoch": 0.4713884992987377, + "grad_norm": 2.374983737287557, + "learning_rate": 5.704011466154162e-06, + "loss": 0.3463, + "step": 3361 + }, + { + "epoch": 0.4715287517531557, + "grad_norm": 2.022873847840879, + "learning_rate": 5.701762770642768e-06, + "loss": 0.3849, + "step": 3362 + }, + { + "epoch": 0.47166900420757363, + "grad_norm": 1.6075870989849208, + "learning_rate": 5.6995139303275304e-06, + "loss": 0.3378, + "step": 3363 + }, + { + "epoch": 0.4718092566619916, + "grad_norm": 2.577531688506492, + "learning_rate": 5.69726494567248e-06, + "loss": 0.3853, + "step": 3364 + }, + { + "epoch": 0.47194950911640954, + "grad_norm": 1.768815794890373, + "learning_rate": 5.69501581714168e-06, + "loss": 0.379, + "step": 3365 + }, + { + "epoch": 0.4720897615708275, + "grad_norm": 2.185680940963181, + "learning_rate": 5.69276654519922e-06, + "loss": 0.387, + "step": 3366 + }, + { + "epoch": 0.47223001402524545, + "grad_norm": 1.7264637638358167, + "learning_rate": 5.690517130309223e-06, + "loss": 0.298, + "step": 3367 + }, + { + "epoch": 0.4723702664796634, + "grad_norm": 1.8070506027643298, + "learning_rate": 5.688267572935843e-06, + "loss": 0.3442, + "step": 3368 + }, + { + "epoch": 0.47251051893408136, + "grad_norm": 2.3430635591450786, + "learning_rate": 5.686017873543256e-06, + "loss": 0.4104, + "step": 3369 + }, + { + "epoch": 0.4726507713884993, + "grad_norm": 2.619387302829446, + "learning_rate": 5.683768032595673e-06, + "loss": 0.3674, + "step": 3370 + }, + { + "epoch": 0.47279102384291727, + "grad_norm": 2.38228013392099, + "learning_rate": 5.681518050557336e-06, + "loss": 0.4098, + "step": 3371 + }, + { + "epoch": 0.4729312762973352, + "grad_norm": 2.0181962992955453, + "learning_rate": 5.679267927892509e-06, + "loss": 0.3849, + "step": 3372 + }, + { + "epoch": 0.4730715287517532, + "grad_norm": 1.5752061345674044, + "learning_rate": 5.677017665065492e-06, + "loss": 0.3858, + "step": 3373 + }, + { + "epoch": 0.47321178120617113, + "grad_norm": 2.42823488043311, + "learning_rate": 5.674767262540609e-06, + "loss": 0.3726, + "step": 3374 + }, + { + "epoch": 0.47335203366058903, + "grad_norm": 1.8587451156418637, + "learning_rate": 5.672516720782216e-06, + "loss": 0.4061, + "step": 3375 + }, + { + "epoch": 0.473492286115007, + "grad_norm": 1.7988537715114945, + "learning_rate": 5.670266040254697e-06, + "loss": 0.3522, + "step": 3376 + }, + { + "epoch": 0.47363253856942494, + "grad_norm": 2.201632163013018, + "learning_rate": 5.668015221422463e-06, + "loss": 0.3503, + "step": 3377 + }, + { + "epoch": 0.4737727910238429, + "grad_norm": 1.8931766661894756, + "learning_rate": 5.6657642647499545e-06, + "loss": 0.3321, + "step": 3378 + }, + { + "epoch": 0.47391304347826085, + "grad_norm": 2.390712861085868, + "learning_rate": 5.6635131707016425e-06, + "loss": 0.3922, + "step": 3379 + }, + { + "epoch": 0.4740532959326788, + "grad_norm": 2.4976578231295874, + "learning_rate": 5.6612619397420225e-06, + "loss": 0.3675, + "step": 3380 + }, + { + "epoch": 0.47419354838709676, + "grad_norm": 1.7961631359952799, + "learning_rate": 5.65901057233562e-06, + "loss": 0.3696, + "step": 3381 + }, + { + "epoch": 0.4743338008415147, + "grad_norm": 2.4715599775620998, + "learning_rate": 5.656759068946992e-06, + "loss": 0.3623, + "step": 3382 + }, + { + "epoch": 0.4744740532959327, + "grad_norm": 2.3838031648049163, + "learning_rate": 5.6545074300407184e-06, + "loss": 0.3363, + "step": 3383 + }, + { + "epoch": 0.47461430575035063, + "grad_norm": 2.3269334854698323, + "learning_rate": 5.652255656081409e-06, + "loss": 0.3637, + "step": 3384 + }, + { + "epoch": 0.4747545582047686, + "grad_norm": 2.3025510033872134, + "learning_rate": 5.650003747533701e-06, + "loss": 0.3759, + "step": 3385 + }, + { + "epoch": 0.47489481065918654, + "grad_norm": 2.8000856011793434, + "learning_rate": 5.647751704862263e-06, + "loss": 0.4076, + "step": 3386 + }, + { + "epoch": 0.4750350631136045, + "grad_norm": 2.9542182634490133, + "learning_rate": 5.645499528531785e-06, + "loss": 0.3276, + "step": 3387 + }, + { + "epoch": 0.47517531556802245, + "grad_norm": 2.3767712663778657, + "learning_rate": 5.643247219006989e-06, + "loss": 0.3556, + "step": 3388 + }, + { + "epoch": 0.4753155680224404, + "grad_norm": 1.8810863892643879, + "learning_rate": 5.640994776752626e-06, + "loss": 0.3316, + "step": 3389 + }, + { + "epoch": 0.47545582047685836, + "grad_norm": 2.780808317281315, + "learning_rate": 5.638742202233466e-06, + "loss": 0.394, + "step": 3390 + }, + { + "epoch": 0.4755960729312763, + "grad_norm": 2.22458493936865, + "learning_rate": 5.636489495914316e-06, + "loss": 0.3958, + "step": 3391 + }, + { + "epoch": 0.47573632538569427, + "grad_norm": 1.764911068413006, + "learning_rate": 5.6342366582600035e-06, + "loss": 0.3388, + "step": 3392 + }, + { + "epoch": 0.4758765778401122, + "grad_norm": 1.7564479702726126, + "learning_rate": 5.6319836897353915e-06, + "loss": 0.3925, + "step": 3393 + }, + { + "epoch": 0.4760168302945302, + "grad_norm": 2.2664148466623737, + "learning_rate": 5.629730590805358e-06, + "loss": 0.3489, + "step": 3394 + }, + { + "epoch": 0.4761570827489481, + "grad_norm": 3.1882549214695652, + "learning_rate": 5.627477361934818e-06, + "loss": 0.3658, + "step": 3395 + }, + { + "epoch": 0.47629733520336603, + "grad_norm": 2.184436505806564, + "learning_rate": 5.625224003588708e-06, + "loss": 0.3775, + "step": 3396 + }, + { + "epoch": 0.476437587657784, + "grad_norm": 2.5345394283440466, + "learning_rate": 5.6229705162319926e-06, + "loss": 0.3417, + "step": 3397 + }, + { + "epoch": 0.47657784011220194, + "grad_norm": 2.015614827448289, + "learning_rate": 5.620716900329664e-06, + "loss": 0.3766, + "step": 3398 + }, + { + "epoch": 0.4767180925666199, + "grad_norm": 2.294413902391428, + "learning_rate": 5.61846315634674e-06, + "loss": 0.3823, + "step": 3399 + }, + { + "epoch": 0.47685834502103785, + "grad_norm": 1.9069105160886886, + "learning_rate": 5.616209284748263e-06, + "loss": 0.3784, + "step": 3400 + }, + { + "epoch": 0.4769985974754558, + "grad_norm": 1.8348091716443151, + "learning_rate": 5.613955285999306e-06, + "loss": 0.3303, + "step": 3401 + }, + { + "epoch": 0.47713884992987377, + "grad_norm": 2.007718688910692, + "learning_rate": 5.611701160564965e-06, + "loss": 0.3633, + "step": 3402 + }, + { + "epoch": 0.4772791023842917, + "grad_norm": 1.810191558901829, + "learning_rate": 5.609446908910363e-06, + "loss": 0.3429, + "step": 3403 + }, + { + "epoch": 0.4774193548387097, + "grad_norm": 2.0450417265797167, + "learning_rate": 5.607192531500651e-06, + "loss": 0.3701, + "step": 3404 + }, + { + "epoch": 0.47755960729312763, + "grad_norm": 2.090208560083418, + "learning_rate": 5.6049380288010016e-06, + "loss": 0.3565, + "step": 3405 + }, + { + "epoch": 0.4776998597475456, + "grad_norm": 2.1022561215581375, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.412, + "step": 3406 + }, + { + "epoch": 0.47784011220196354, + "grad_norm": 1.9550504425253594, + "learning_rate": 5.600428649392722e-06, + "loss": 0.3017, + "step": 3407 + }, + { + "epoch": 0.4779803646563815, + "grad_norm": 2.109797117594696, + "learning_rate": 5.5981737736145695e-06, + "loss": 0.3862, + "step": 3408 + }, + { + "epoch": 0.47812061711079945, + "grad_norm": 2.838247152092697, + "learning_rate": 5.5959187744074396e-06, + "loss": 0.3504, + "step": 3409 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 1.537278744044049, + "learning_rate": 5.593663652236632e-06, + "loss": 0.3721, + "step": 3410 + }, + { + "epoch": 0.47840112201963536, + "grad_norm": 2.149475069059831, + "learning_rate": 5.59140840756748e-06, + "loss": 0.3577, + "step": 3411 + }, + { + "epoch": 0.4785413744740533, + "grad_norm": 1.9685367287059707, + "learning_rate": 5.589153040865333e-06, + "loss": 0.3619, + "step": 3412 + }, + { + "epoch": 0.47868162692847127, + "grad_norm": 1.6135134033459715, + "learning_rate": 5.586897552595573e-06, + "loss": 0.3213, + "step": 3413 + }, + { + "epoch": 0.4788218793828892, + "grad_norm": 1.6070999805846602, + "learning_rate": 5.584641943223603e-06, + "loss": 0.3719, + "step": 3414 + }, + { + "epoch": 0.4789621318373071, + "grad_norm": 2.690635626192631, + "learning_rate": 5.582386213214853e-06, + "loss": 0.39, + "step": 3415 + }, + { + "epoch": 0.4791023842917251, + "grad_norm": 2.3162580317251464, + "learning_rate": 5.580130363034777e-06, + "loss": 0.3718, + "step": 3416 + }, + { + "epoch": 0.47924263674614304, + "grad_norm": 2.0521383391106705, + "learning_rate": 5.577874393148854e-06, + "loss": 0.402, + "step": 3417 + }, + { + "epoch": 0.479382889200561, + "grad_norm": 1.8445005020881315, + "learning_rate": 5.575618304022586e-06, + "loss": 0.3662, + "step": 3418 + }, + { + "epoch": 0.47952314165497895, + "grad_norm": 1.8203273657981704, + "learning_rate": 5.573362096121504e-06, + "loss": 0.3407, + "step": 3419 + }, + { + "epoch": 0.4796633941093969, + "grad_norm": 2.1431574633729737, + "learning_rate": 5.571105769911159e-06, + "loss": 0.3632, + "step": 3420 + }, + { + "epoch": 0.47980364656381486, + "grad_norm": 1.8571702724571157, + "learning_rate": 5.568849325857127e-06, + "loss": 0.4112, + "step": 3421 + }, + { + "epoch": 0.4799438990182328, + "grad_norm": 1.8449189957615024, + "learning_rate": 5.566592764425012e-06, + "loss": 0.3585, + "step": 3422 + }, + { + "epoch": 0.48008415147265077, + "grad_norm": 2.538577552847517, + "learning_rate": 5.5643360860804385e-06, + "loss": 0.3459, + "step": 3423 + }, + { + "epoch": 0.4802244039270687, + "grad_norm": 2.346928183693328, + "learning_rate": 5.562079291289058e-06, + "loss": 0.3817, + "step": 3424 + }, + { + "epoch": 0.4803646563814867, + "grad_norm": 1.8396235696691525, + "learning_rate": 5.559822380516539e-06, + "loss": 0.3171, + "step": 3425 + }, + { + "epoch": 0.48050490883590463, + "grad_norm": 2.1224421646272247, + "learning_rate": 5.557565354228586e-06, + "loss": 0.3417, + "step": 3426 + }, + { + "epoch": 0.4806451612903226, + "grad_norm": 2.1039042419356018, + "learning_rate": 5.555308212890917e-06, + "loss": 0.3585, + "step": 3427 + }, + { + "epoch": 0.48078541374474054, + "grad_norm": 1.7546328120825783, + "learning_rate": 5.553050956969278e-06, + "loss": 0.3367, + "step": 3428 + }, + { + "epoch": 0.4809256661991585, + "grad_norm": 7.208643853292607, + "learning_rate": 5.550793586929437e-06, + "loss": 0.368, + "step": 3429 + }, + { + "epoch": 0.48106591865357645, + "grad_norm": 1.7010825973541792, + "learning_rate": 5.54853610323719e-06, + "loss": 0.3535, + "step": 3430 + }, + { + "epoch": 0.4812061711079944, + "grad_norm": 1.8869018465844478, + "learning_rate": 5.546278506358348e-06, + "loss": 0.3718, + "step": 3431 + }, + { + "epoch": 0.48134642356241236, + "grad_norm": 2.030271676022865, + "learning_rate": 5.544020796758754e-06, + "loss": 0.3677, + "step": 3432 + }, + { + "epoch": 0.4814866760168303, + "grad_norm": 2.6586432950909806, + "learning_rate": 5.5417629749042676e-06, + "loss": 0.4122, + "step": 3433 + }, + { + "epoch": 0.4816269284712483, + "grad_norm": 2.542980750739732, + "learning_rate": 5.539505041260779e-06, + "loss": 0.3571, + "step": 3434 + }, + { + "epoch": 0.48176718092566617, + "grad_norm": 4.073130856448745, + "learning_rate": 5.537246996294192e-06, + "loss": 0.4074, + "step": 3435 + }, + { + "epoch": 0.4819074333800841, + "grad_norm": 2.113311259678906, + "learning_rate": 5.534988840470442e-06, + "loss": 0.3503, + "step": 3436 + }, + { + "epoch": 0.4820476858345021, + "grad_norm": 2.152660644243231, + "learning_rate": 5.532730574255482e-06, + "loss": 0.3299, + "step": 3437 + }, + { + "epoch": 0.48218793828892004, + "grad_norm": 2.427454864050904, + "learning_rate": 5.530472198115291e-06, + "loss": 0.3888, + "step": 3438 + }, + { + "epoch": 0.482328190743338, + "grad_norm": 1.940220246053316, + "learning_rate": 5.528213712515867e-06, + "loss": 0.3913, + "step": 3439 + }, + { + "epoch": 0.48246844319775595, + "grad_norm": 2.677326463477122, + "learning_rate": 5.525955117923235e-06, + "loss": 0.3949, + "step": 3440 + }, + { + "epoch": 0.4826086956521739, + "grad_norm": 1.6475824968120016, + "learning_rate": 5.523696414803438e-06, + "loss": 0.3056, + "step": 3441 + }, + { + "epoch": 0.48274894810659186, + "grad_norm": 1.9489180766817007, + "learning_rate": 5.521437603622545e-06, + "loss": 0.3531, + "step": 3442 + }, + { + "epoch": 0.4828892005610098, + "grad_norm": 2.204931722262916, + "learning_rate": 5.519178684846646e-06, + "loss": 0.3832, + "step": 3443 + }, + { + "epoch": 0.48302945301542777, + "grad_norm": 1.840306833225775, + "learning_rate": 5.51691965894185e-06, + "loss": 0.3152, + "step": 3444 + }, + { + "epoch": 0.4831697054698457, + "grad_norm": 1.677703324231544, + "learning_rate": 5.514660526374298e-06, + "loss": 0.3215, + "step": 3445 + }, + { + "epoch": 0.4833099579242637, + "grad_norm": 2.2958022746104345, + "learning_rate": 5.51240128761014e-06, + "loss": 0.3513, + "step": 3446 + }, + { + "epoch": 0.48345021037868163, + "grad_norm": 2.3874673371801958, + "learning_rate": 5.510141943115556e-06, + "loss": 0.4343, + "step": 3447 + }, + { + "epoch": 0.4835904628330996, + "grad_norm": 1.7850819912413272, + "learning_rate": 5.507882493356745e-06, + "loss": 0.3758, + "step": 3448 + }, + { + "epoch": 0.48373071528751754, + "grad_norm": 1.9206499987290235, + "learning_rate": 5.505622938799933e-06, + "loss": 0.3659, + "step": 3449 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 1.9581775468045717, + "learning_rate": 5.503363279911359e-06, + "loss": 0.3599, + "step": 3450 + }, + { + "epoch": 0.48401122019635345, + "grad_norm": 1.8537623861237322, + "learning_rate": 5.501103517157288e-06, + "loss": 0.3478, + "step": 3451 + }, + { + "epoch": 0.4841514726507714, + "grad_norm": 3.8072046495226872, + "learning_rate": 5.498843651004008e-06, + "loss": 0.3362, + "step": 3452 + }, + { + "epoch": 0.48429172510518936, + "grad_norm": 2.633015791744595, + "learning_rate": 5.496583681917824e-06, + "loss": 0.3554, + "step": 3453 + }, + { + "epoch": 0.4844319775596073, + "grad_norm": 2.2976589811192754, + "learning_rate": 5.494323610365069e-06, + "loss": 0.3684, + "step": 3454 + }, + { + "epoch": 0.4845722300140252, + "grad_norm": 2.609890549948166, + "learning_rate": 5.49206343681209e-06, + "loss": 0.3713, + "step": 3455 + }, + { + "epoch": 0.4847124824684432, + "grad_norm": 2.8041734144593478, + "learning_rate": 5.489803161725258e-06, + "loss": 0.3958, + "step": 3456 + }, + { + "epoch": 0.48485273492286113, + "grad_norm": 1.6574972243019672, + "learning_rate": 5.487542785570966e-06, + "loss": 0.3363, + "step": 3457 + }, + { + "epoch": 0.4849929873772791, + "grad_norm": 2.212293337205754, + "learning_rate": 5.485282308815626e-06, + "loss": 0.3647, + "step": 3458 + }, + { + "epoch": 0.48513323983169704, + "grad_norm": 1.7722098516844986, + "learning_rate": 5.483021731925673e-06, + "loss": 0.3631, + "step": 3459 + }, + { + "epoch": 0.485273492286115, + "grad_norm": 1.913147999806873, + "learning_rate": 5.48076105536756e-06, + "loss": 0.3592, + "step": 3460 + }, + { + "epoch": 0.48541374474053295, + "grad_norm": 2.7117592393407013, + "learning_rate": 5.478500279607762e-06, + "loss": 0.4193, + "step": 3461 + }, + { + "epoch": 0.4855539971949509, + "grad_norm": 1.650907501000138, + "learning_rate": 5.476239405112775e-06, + "loss": 0.3357, + "step": 3462 + }, + { + "epoch": 0.48569424964936886, + "grad_norm": 2.4092608354290586, + "learning_rate": 5.4739784323491115e-06, + "loss": 0.3772, + "step": 3463 + }, + { + "epoch": 0.4858345021037868, + "grad_norm": 1.58084815728063, + "learning_rate": 5.471717361783312e-06, + "loss": 0.3757, + "step": 3464 + }, + { + "epoch": 0.48597475455820477, + "grad_norm": 2.0404149974120016, + "learning_rate": 5.469456193881931e-06, + "loss": 0.3428, + "step": 3465 + }, + { + "epoch": 0.4861150070126227, + "grad_norm": 1.6985842868575367, + "learning_rate": 5.467194929111544e-06, + "loss": 0.3597, + "step": 3466 + }, + { + "epoch": 0.4862552594670407, + "grad_norm": 2.16851019069396, + "learning_rate": 5.464933567938746e-06, + "loss": 0.4061, + "step": 3467 + }, + { + "epoch": 0.48639551192145863, + "grad_norm": 1.9651278523597377, + "learning_rate": 5.462672110830155e-06, + "loss": 0.3642, + "step": 3468 + }, + { + "epoch": 0.4865357643758766, + "grad_norm": 1.7147008188939055, + "learning_rate": 5.460410558252408e-06, + "loss": 0.3377, + "step": 3469 + }, + { + "epoch": 0.48667601683029454, + "grad_norm": 2.391262193918918, + "learning_rate": 5.458148910672157e-06, + "loss": 0.3791, + "step": 3470 + }, + { + "epoch": 0.4868162692847125, + "grad_norm": 1.7631322768208932, + "learning_rate": 5.455887168556081e-06, + "loss": 0.374, + "step": 3471 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 1.6535578025955604, + "learning_rate": 5.453625332370872e-06, + "loss": 0.3319, + "step": 3472 + }, + { + "epoch": 0.4870967741935484, + "grad_norm": 2.333337261559949, + "learning_rate": 5.451363402583244e-06, + "loss": 0.3613, + "step": 3473 + }, + { + "epoch": 0.48723702664796636, + "grad_norm": 1.7053602756422965, + "learning_rate": 5.449101379659933e-06, + "loss": 0.3389, + "step": 3474 + }, + { + "epoch": 0.48737727910238426, + "grad_norm": 2.121889755355027, + "learning_rate": 5.446839264067689e-06, + "loss": 0.435, + "step": 3475 + }, + { + "epoch": 0.4875175315568022, + "grad_norm": 1.9865257442447095, + "learning_rate": 5.444577056273284e-06, + "loss": 0.3367, + "step": 3476 + }, + { + "epoch": 0.4876577840112202, + "grad_norm": 1.7861729237269275, + "learning_rate": 5.442314756743511e-06, + "loss": 0.3057, + "step": 3477 + }, + { + "epoch": 0.48779803646563813, + "grad_norm": 2.5492892378035408, + "learning_rate": 5.4400523659451775e-06, + "loss": 0.3304, + "step": 3478 + }, + { + "epoch": 0.4879382889200561, + "grad_norm": 2.2886710499470477, + "learning_rate": 5.4377898843451126e-06, + "loss": 0.352, + "step": 3479 + }, + { + "epoch": 0.48807854137447404, + "grad_norm": 5.545186562181602, + "learning_rate": 5.4355273124101645e-06, + "loss": 0.3184, + "step": 3480 + }, + { + "epoch": 0.488218793828892, + "grad_norm": 3.774633624758788, + "learning_rate": 5.4332646506071986e-06, + "loss": 0.356, + "step": 3481 + }, + { + "epoch": 0.48835904628330995, + "grad_norm": 2.3165079387196945, + "learning_rate": 5.4310018994030974e-06, + "loss": 0.3809, + "step": 3482 + }, + { + "epoch": 0.4884992987377279, + "grad_norm": 2.2852046602778207, + "learning_rate": 5.428739059264767e-06, + "loss": 0.4027, + "step": 3483 + }, + { + "epoch": 0.48863955119214586, + "grad_norm": 2.0430223155247664, + "learning_rate": 5.426476130659126e-06, + "loss": 0.3837, + "step": 3484 + }, + { + "epoch": 0.4887798036465638, + "grad_norm": 3.440571331884098, + "learning_rate": 5.424213114053115e-06, + "loss": 0.3405, + "step": 3485 + }, + { + "epoch": 0.48892005610098177, + "grad_norm": 1.9599636745006113, + "learning_rate": 5.421950009913694e-06, + "loss": 0.3447, + "step": 3486 + }, + { + "epoch": 0.4890603085553997, + "grad_norm": 1.5293759146540704, + "learning_rate": 5.4196868187078335e-06, + "loss": 0.3432, + "step": 3487 + }, + { + "epoch": 0.4892005610098177, + "grad_norm": 4.38111145409954, + "learning_rate": 5.417423540902531e-06, + "loss": 0.3826, + "step": 3488 + }, + { + "epoch": 0.48934081346423564, + "grad_norm": 1.8238884506330926, + "learning_rate": 5.4151601769647974e-06, + "loss": 0.3353, + "step": 3489 + }, + { + "epoch": 0.4894810659186536, + "grad_norm": 1.8828008501001097, + "learning_rate": 5.412896727361663e-06, + "loss": 0.3363, + "step": 3490 + }, + { + "epoch": 0.48962131837307155, + "grad_norm": 1.7800706673642333, + "learning_rate": 5.410633192560173e-06, + "loss": 0.3578, + "step": 3491 + }, + { + "epoch": 0.4897615708274895, + "grad_norm": 2.9638739445969104, + "learning_rate": 5.408369573027391e-06, + "loss": 0.3391, + "step": 3492 + }, + { + "epoch": 0.48990182328190746, + "grad_norm": 1.9088439177844712, + "learning_rate": 5.406105869230402e-06, + "loss": 0.3822, + "step": 3493 + }, + { + "epoch": 0.4900420757363254, + "grad_norm": 2.2339910430728644, + "learning_rate": 5.403842081636303e-06, + "loss": 0.3821, + "step": 3494 + }, + { + "epoch": 0.4901823281907433, + "grad_norm": 1.9917402775248143, + "learning_rate": 5.401578210712214e-06, + "loss": 0.3654, + "step": 3495 + }, + { + "epoch": 0.49032258064516127, + "grad_norm": 2.0570830678009218, + "learning_rate": 5.399314256925265e-06, + "loss": 0.3252, + "step": 3496 + }, + { + "epoch": 0.4904628330995792, + "grad_norm": 1.8526916923649244, + "learning_rate": 5.39705022074261e-06, + "loss": 0.3715, + "step": 3497 + }, + { + "epoch": 0.4906030855539972, + "grad_norm": 2.1243883262366685, + "learning_rate": 5.394786102631415e-06, + "loss": 0.345, + "step": 3498 + }, + { + "epoch": 0.49074333800841513, + "grad_norm": 1.8091739523079235, + "learning_rate": 5.392521903058867e-06, + "loss": 0.3482, + "step": 3499 + }, + { + "epoch": 0.4908835904628331, + "grad_norm": 2.0642107024543783, + "learning_rate": 5.390257622492166e-06, + "loss": 0.3701, + "step": 3500 + }, + { + "epoch": 0.49102384291725104, + "grad_norm": 1.9751294636847048, + "learning_rate": 5.387993261398532e-06, + "loss": 0.4017, + "step": 3501 + }, + { + "epoch": 0.491164095371669, + "grad_norm": 1.6122819886206905, + "learning_rate": 5.3857288202452e-06, + "loss": 0.3876, + "step": 3502 + }, + { + "epoch": 0.49130434782608695, + "grad_norm": 2.4007172823973297, + "learning_rate": 5.383464299499419e-06, + "loss": 0.3724, + "step": 3503 + }, + { + "epoch": 0.4914446002805049, + "grad_norm": 1.8953572468456088, + "learning_rate": 5.381199699628459e-06, + "loss": 0.3597, + "step": 3504 + }, + { + "epoch": 0.49158485273492286, + "grad_norm": 1.6357099751958848, + "learning_rate": 5.378935021099604e-06, + "loss": 0.3415, + "step": 3505 + }, + { + "epoch": 0.4917251051893408, + "grad_norm": 2.2879267143436914, + "learning_rate": 5.376670264380157e-06, + "loss": 0.3784, + "step": 3506 + }, + { + "epoch": 0.49186535764375877, + "grad_norm": 2.1087541584907776, + "learning_rate": 5.374405429937431e-06, + "loss": 0.3354, + "step": 3507 + }, + { + "epoch": 0.4920056100981767, + "grad_norm": 1.8314910938276547, + "learning_rate": 5.3721405182387595e-06, + "loss": 0.3663, + "step": 3508 + }, + { + "epoch": 0.4921458625525947, + "grad_norm": 2.0976318130632574, + "learning_rate": 5.369875529751492e-06, + "loss": 0.4138, + "step": 3509 + }, + { + "epoch": 0.49228611500701264, + "grad_norm": 1.986068183446864, + "learning_rate": 5.367610464942994e-06, + "loss": 0.4021, + "step": 3510 + }, + { + "epoch": 0.4924263674614306, + "grad_norm": 2.073745817303248, + "learning_rate": 5.365345324280646e-06, + "loss": 0.3797, + "step": 3511 + }, + { + "epoch": 0.49256661991584855, + "grad_norm": 3.389437841205313, + "learning_rate": 5.363080108231843e-06, + "loss": 0.3435, + "step": 3512 + }, + { + "epoch": 0.4927068723702665, + "grad_norm": 1.8007754245092866, + "learning_rate": 5.360814817263995e-06, + "loss": 0.3422, + "step": 3513 + }, + { + "epoch": 0.49284712482468446, + "grad_norm": 2.0893833687192767, + "learning_rate": 5.35854945184453e-06, + "loss": 0.3853, + "step": 3514 + }, + { + "epoch": 0.49298737727910236, + "grad_norm": 1.7853410153588127, + "learning_rate": 5.35628401244089e-06, + "loss": 0.3553, + "step": 3515 + }, + { + "epoch": 0.4931276297335203, + "grad_norm": 2.5773009861835248, + "learning_rate": 5.354018499520536e-06, + "loss": 0.3357, + "step": 3516 + }, + { + "epoch": 0.49326788218793827, + "grad_norm": 2.091574720309659, + "learning_rate": 5.351752913550936e-06, + "loss": 0.3581, + "step": 3517 + }, + { + "epoch": 0.4934081346423562, + "grad_norm": 1.6789155827578406, + "learning_rate": 5.349487254999579e-06, + "loss": 0.3428, + "step": 3518 + }, + { + "epoch": 0.4935483870967742, + "grad_norm": 1.900068673348398, + "learning_rate": 5.34722152433397e-06, + "loss": 0.3888, + "step": 3519 + }, + { + "epoch": 0.49368863955119213, + "grad_norm": 6.758937993342709, + "learning_rate": 5.3449557220216245e-06, + "loss": 0.3516, + "step": 3520 + }, + { + "epoch": 0.4938288920056101, + "grad_norm": 2.5406540797447383, + "learning_rate": 5.342689848530077e-06, + "loss": 0.4309, + "step": 3521 + }, + { + "epoch": 0.49396914446002804, + "grad_norm": 1.7292062532727037, + "learning_rate": 5.3404239043268734e-06, + "loss": 0.3941, + "step": 3522 + }, + { + "epoch": 0.494109396914446, + "grad_norm": 1.669510925769043, + "learning_rate": 5.338157889879575e-06, + "loss": 0.3484, + "step": 3523 + }, + { + "epoch": 0.49424964936886395, + "grad_norm": 2.198314311379287, + "learning_rate": 5.335891805655758e-06, + "loss": 0.3321, + "step": 3524 + }, + { + "epoch": 0.4943899018232819, + "grad_norm": 2.3344231739716306, + "learning_rate": 5.333625652123014e-06, + "loss": 0.3567, + "step": 3525 + }, + { + "epoch": 0.49453015427769986, + "grad_norm": 1.6880870786868467, + "learning_rate": 5.331359429748948e-06, + "loss": 0.331, + "step": 3526 + }, + { + "epoch": 0.4946704067321178, + "grad_norm": 2.1150280965467405, + "learning_rate": 5.329093139001179e-06, + "loss": 0.3071, + "step": 3527 + }, + { + "epoch": 0.4948106591865358, + "grad_norm": 2.5290660148925186, + "learning_rate": 5.326826780347339e-06, + "loss": 0.384, + "step": 3528 + }, + { + "epoch": 0.49495091164095373, + "grad_norm": 3.138408884876908, + "learning_rate": 5.324560354255077e-06, + "loss": 0.3707, + "step": 3529 + }, + { + "epoch": 0.4950911640953717, + "grad_norm": 2.194923339234974, + "learning_rate": 5.322293861192052e-06, + "loss": 0.3669, + "step": 3530 + }, + { + "epoch": 0.49523141654978964, + "grad_norm": 2.3783969678521357, + "learning_rate": 5.320027301625944e-06, + "loss": 0.3815, + "step": 3531 + }, + { + "epoch": 0.4953716690042076, + "grad_norm": 1.4931852190753048, + "learning_rate": 5.317760676024436e-06, + "loss": 0.377, + "step": 3532 + }, + { + "epoch": 0.49551192145862555, + "grad_norm": 1.5125846649938026, + "learning_rate": 5.315493984855233e-06, + "loss": 0.3236, + "step": 3533 + }, + { + "epoch": 0.4956521739130435, + "grad_norm": 3.03376901313347, + "learning_rate": 5.313227228586049e-06, + "loss": 0.3972, + "step": 3534 + }, + { + "epoch": 0.4957924263674614, + "grad_norm": 2.072108687948789, + "learning_rate": 5.310960407684616e-06, + "loss": 0.3874, + "step": 3535 + }, + { + "epoch": 0.49593267882187936, + "grad_norm": 2.2044055004081016, + "learning_rate": 5.308693522618674e-06, + "loss": 0.385, + "step": 3536 + }, + { + "epoch": 0.4960729312762973, + "grad_norm": 1.8551794850589114, + "learning_rate": 5.306426573855983e-06, + "loss": 0.3996, + "step": 3537 + }, + { + "epoch": 0.49621318373071527, + "grad_norm": 2.2629413363969197, + "learning_rate": 5.3041595618643075e-06, + "loss": 0.3636, + "step": 3538 + }, + { + "epoch": 0.4963534361851332, + "grad_norm": 2.051905964836472, + "learning_rate": 5.301892487111431e-06, + "loss": 0.3739, + "step": 3539 + }, + { + "epoch": 0.4964936886395512, + "grad_norm": 2.8018398439106567, + "learning_rate": 5.2996253500651494e-06, + "loss": 0.3318, + "step": 3540 + }, + { + "epoch": 0.49663394109396913, + "grad_norm": 1.9338218477361708, + "learning_rate": 5.297358151193271e-06, + "loss": 0.3572, + "step": 3541 + }, + { + "epoch": 0.4967741935483871, + "grad_norm": 1.854012650797458, + "learning_rate": 5.2950908909636144e-06, + "loss": 0.3432, + "step": 3542 + }, + { + "epoch": 0.49691444600280504, + "grad_norm": 2.7044443427783893, + "learning_rate": 5.292823569844016e-06, + "loss": 0.3533, + "step": 3543 + }, + { + "epoch": 0.497054698457223, + "grad_norm": 1.7242928340813182, + "learning_rate": 5.2905561883023185e-06, + "loss": 0.3878, + "step": 3544 + }, + { + "epoch": 0.49719495091164095, + "grad_norm": 1.7061705350711203, + "learning_rate": 5.288288746806381e-06, + "loss": 0.3568, + "step": 3545 + }, + { + "epoch": 0.4973352033660589, + "grad_norm": 2.1937535951590355, + "learning_rate": 5.286021245824075e-06, + "loss": 0.3746, + "step": 3546 + }, + { + "epoch": 0.49747545582047686, + "grad_norm": 2.710114015946774, + "learning_rate": 5.283753685823284e-06, + "loss": 0.3749, + "step": 3547 + }, + { + "epoch": 0.4976157082748948, + "grad_norm": 1.7869258598782944, + "learning_rate": 5.2814860672719015e-06, + "loss": 0.4042, + "step": 3548 + }, + { + "epoch": 0.4977559607293128, + "grad_norm": 2.0259953536858712, + "learning_rate": 5.2792183906378355e-06, + "loss": 0.3557, + "step": 3549 + }, + { + "epoch": 0.49789621318373073, + "grad_norm": 1.730785263427423, + "learning_rate": 5.276950656389006e-06, + "loss": 0.3417, + "step": 3550 + }, + { + "epoch": 0.4980364656381487, + "grad_norm": 1.6818038737135972, + "learning_rate": 5.274682864993344e-06, + "loss": 0.3935, + "step": 3551 + }, + { + "epoch": 0.49817671809256664, + "grad_norm": 1.7037149509958602, + "learning_rate": 5.272415016918792e-06, + "loss": 0.3445, + "step": 3552 + }, + { + "epoch": 0.4983169705469846, + "grad_norm": 1.990156387433322, + "learning_rate": 5.270147112633304e-06, + "loss": 0.3534, + "step": 3553 + }, + { + "epoch": 0.49845722300140255, + "grad_norm": 1.7371029406995182, + "learning_rate": 5.2678791526048465e-06, + "loss": 0.3388, + "step": 3554 + }, + { + "epoch": 0.49859747545582045, + "grad_norm": 2.3898317105406837, + "learning_rate": 5.265611137301397e-06, + "loss": 0.3668, + "step": 3555 + }, + { + "epoch": 0.4987377279102384, + "grad_norm": 2.6702580606570874, + "learning_rate": 5.263343067190945e-06, + "loss": 0.3949, + "step": 3556 + }, + { + "epoch": 0.49887798036465636, + "grad_norm": 1.9808089285783332, + "learning_rate": 5.261074942741492e-06, + "loss": 0.3311, + "step": 3557 + }, + { + "epoch": 0.4990182328190743, + "grad_norm": 1.8343156509286567, + "learning_rate": 5.258806764421048e-06, + "loss": 0.3429, + "step": 3558 + }, + { + "epoch": 0.49915848527349227, + "grad_norm": 2.0537746707969164, + "learning_rate": 5.256538532697636e-06, + "loss": 0.3697, + "step": 3559 + }, + { + "epoch": 0.4992987377279102, + "grad_norm": 1.907017975244886, + "learning_rate": 5.254270248039291e-06, + "loss": 0.3934, + "step": 3560 + }, + { + "epoch": 0.4994389901823282, + "grad_norm": 2.1977482108936703, + "learning_rate": 5.2520019109140555e-06, + "loss": 0.3232, + "step": 3561 + }, + { + "epoch": 0.49957924263674613, + "grad_norm": 2.6939093876746236, + "learning_rate": 5.249733521789987e-06, + "loss": 0.3435, + "step": 3562 + }, + { + "epoch": 0.4997194950911641, + "grad_norm": 2.0596759998739578, + "learning_rate": 5.247465081135153e-06, + "loss": 0.358, + "step": 3563 + }, + { + "epoch": 0.49985974754558204, + "grad_norm": 2.9638768404908706, + "learning_rate": 5.245196589417625e-06, + "loss": 0.3614, + "step": 3564 + }, + { + "epoch": 0.5, + "grad_norm": 1.8750654209167998, + "learning_rate": 5.2429280471054954e-06, + "loss": 0.3104, + "step": 3565 + }, + { + "epoch": 0.500140252454418, + "grad_norm": 2.060994726027136, + "learning_rate": 5.24065945466686e-06, + "loss": 0.3716, + "step": 3566 + }, + { + "epoch": 0.5002805049088359, + "grad_norm": 1.9472321380215232, + "learning_rate": 5.238390812569828e-06, + "loss": 0.3796, + "step": 3567 + }, + { + "epoch": 0.5004207573632539, + "grad_norm": 1.9917087353635414, + "learning_rate": 5.2361221212825175e-06, + "loss": 0.3885, + "step": 3568 + }, + { + "epoch": 0.5005610098176718, + "grad_norm": 1.4907006650390289, + "learning_rate": 5.2338533812730565e-06, + "loss": 0.3813, + "step": 3569 + }, + { + "epoch": 0.5007012622720898, + "grad_norm": 2.0036792291909427, + "learning_rate": 5.2315845930095845e-06, + "loss": 0.3572, + "step": 3570 + }, + { + "epoch": 0.5008415147265077, + "grad_norm": 2.1267380058297434, + "learning_rate": 5.229315756960249e-06, + "loss": 0.3263, + "step": 3571 + }, + { + "epoch": 0.5009817671809257, + "grad_norm": 2.264529259207773, + "learning_rate": 5.227046873593211e-06, + "loss": 0.4315, + "step": 3572 + }, + { + "epoch": 0.5011220196353436, + "grad_norm": 4.043644503082531, + "learning_rate": 5.224777943376635e-06, + "loss": 0.3515, + "step": 3573 + }, + { + "epoch": 0.5012622720897616, + "grad_norm": 2.473619125936771, + "learning_rate": 5.222508966778702e-06, + "loss": 0.3678, + "step": 3574 + }, + { + "epoch": 0.5014025245441796, + "grad_norm": 2.0144578970275067, + "learning_rate": 5.220239944267598e-06, + "loss": 0.3573, + "step": 3575 + }, + { + "epoch": 0.5015427769985975, + "grad_norm": 2.1544543543345003, + "learning_rate": 5.21797087631152e-06, + "loss": 0.3841, + "step": 3576 + }, + { + "epoch": 0.5016830294530155, + "grad_norm": 1.7002194319081827, + "learning_rate": 5.215701763378673e-06, + "loss": 0.3742, + "step": 3577 + }, + { + "epoch": 0.5018232819074334, + "grad_norm": 5.329862339324754, + "learning_rate": 5.213432605937278e-06, + "loss": 0.3282, + "step": 3578 + }, + { + "epoch": 0.5019635343618514, + "grad_norm": 2.3313369612032413, + "learning_rate": 5.211163404455553e-06, + "loss": 0.3547, + "step": 3579 + }, + { + "epoch": 0.5021037868162693, + "grad_norm": 1.9136849167195042, + "learning_rate": 5.208894159401735e-06, + "loss": 0.41, + "step": 3580 + }, + { + "epoch": 0.5022440392706873, + "grad_norm": 2.0810312012913914, + "learning_rate": 5.206624871244066e-06, + "loss": 0.3375, + "step": 3581 + }, + { + "epoch": 0.5023842917251052, + "grad_norm": 1.9107683133379803, + "learning_rate": 5.204355540450799e-06, + "loss": 0.3901, + "step": 3582 + }, + { + "epoch": 0.5025245441795232, + "grad_norm": 2.1011609682990424, + "learning_rate": 5.202086167490196e-06, + "loss": 0.3749, + "step": 3583 + }, + { + "epoch": 0.5026647966339411, + "grad_norm": 1.8980022920568553, + "learning_rate": 5.199816752830523e-06, + "loss": 0.3784, + "step": 3584 + }, + { + "epoch": 0.5028050490883591, + "grad_norm": 1.8199829982131124, + "learning_rate": 5.197547296940059e-06, + "loss": 0.37, + "step": 3585 + }, + { + "epoch": 0.502945301542777, + "grad_norm": 2.9263206565308026, + "learning_rate": 5.19527780028709e-06, + "loss": 0.4367, + "step": 3586 + }, + { + "epoch": 0.5030855539971949, + "grad_norm": 1.8786349348515863, + "learning_rate": 5.19300826333991e-06, + "loss": 0.3172, + "step": 3587 + }, + { + "epoch": 0.5032258064516129, + "grad_norm": 2.3408089049836764, + "learning_rate": 5.190738686566826e-06, + "loss": 0.362, + "step": 3588 + }, + { + "epoch": 0.5033660589060308, + "grad_norm": 2.537648719977014, + "learning_rate": 5.188469070436145e-06, + "loss": 0.3214, + "step": 3589 + }, + { + "epoch": 0.5035063113604488, + "grad_norm": 2.062053632118277, + "learning_rate": 5.186199415416188e-06, + "loss": 0.4148, + "step": 3590 + }, + { + "epoch": 0.5036465638148667, + "grad_norm": 1.6267476587508785, + "learning_rate": 5.183929721975282e-06, + "loss": 0.3257, + "step": 3591 + }, + { + "epoch": 0.5037868162692847, + "grad_norm": 2.3694154433751047, + "learning_rate": 5.181659990581764e-06, + "loss": 0.3757, + "step": 3592 + }, + { + "epoch": 0.5039270687237026, + "grad_norm": 1.6616743977684734, + "learning_rate": 5.1793902217039775e-06, + "loss": 0.3486, + "step": 3593 + }, + { + "epoch": 0.5040673211781206, + "grad_norm": 2.2586167958791594, + "learning_rate": 5.177120415810271e-06, + "loss": 0.3302, + "step": 3594 + }, + { + "epoch": 0.5042075736325385, + "grad_norm": 1.9068985132938883, + "learning_rate": 5.1748505733690035e-06, + "loss": 0.3592, + "step": 3595 + }, + { + "epoch": 0.5043478260869565, + "grad_norm": 2.5848335606544186, + "learning_rate": 5.172580694848541e-06, + "loss": 0.3949, + "step": 3596 + }, + { + "epoch": 0.5044880785413745, + "grad_norm": 1.7342057703984415, + "learning_rate": 5.170310780717259e-06, + "loss": 0.3646, + "step": 3597 + }, + { + "epoch": 0.5046283309957924, + "grad_norm": 1.9273732010315718, + "learning_rate": 5.1680408314435385e-06, + "loss": 0.3116, + "step": 3598 + }, + { + "epoch": 0.5047685834502104, + "grad_norm": 2.009923159520715, + "learning_rate": 5.1657708474957645e-06, + "loss": 0.3863, + "step": 3599 + }, + { + "epoch": 0.5049088359046283, + "grad_norm": 2.21667964137972, + "learning_rate": 5.163500829342334e-06, + "loss": 0.3442, + "step": 3600 + }, + { + "epoch": 0.5050490883590463, + "grad_norm": 1.883318378029018, + "learning_rate": 5.16123077745165e-06, + "loss": 0.3613, + "step": 3601 + }, + { + "epoch": 0.5051893408134642, + "grad_norm": 2.1520639227268417, + "learning_rate": 5.158960692292122e-06, + "loss": 0.3846, + "step": 3602 + }, + { + "epoch": 0.5053295932678822, + "grad_norm": 1.798095752848141, + "learning_rate": 5.156690574332167e-06, + "loss": 0.3953, + "step": 3603 + }, + { + "epoch": 0.5054698457223001, + "grad_norm": 2.810529569634874, + "learning_rate": 5.154420424040205e-06, + "loss": 0.3171, + "step": 3604 + }, + { + "epoch": 0.5056100981767181, + "grad_norm": 1.724401303761626, + "learning_rate": 5.152150241884669e-06, + "loss": 0.3198, + "step": 3605 + }, + { + "epoch": 0.505750350631136, + "grad_norm": 2.76260840777175, + "learning_rate": 5.149880028333992e-06, + "loss": 0.3707, + "step": 3606 + }, + { + "epoch": 0.505890603085554, + "grad_norm": 3.0973069521989802, + "learning_rate": 5.147609783856619e-06, + "loss": 0.3905, + "step": 3607 + }, + { + "epoch": 0.506030855539972, + "grad_norm": 2.164701401908316, + "learning_rate": 5.145339508920998e-06, + "loss": 0.3626, + "step": 3608 + }, + { + "epoch": 0.5061711079943899, + "grad_norm": 2.131523441437247, + "learning_rate": 5.143069203995586e-06, + "loss": 0.3517, + "step": 3609 + }, + { + "epoch": 0.5063113604488079, + "grad_norm": 2.3198181036641934, + "learning_rate": 5.140798869548841e-06, + "loss": 0.3654, + "step": 3610 + }, + { + "epoch": 0.5064516129032258, + "grad_norm": 2.3831153552092545, + "learning_rate": 5.138528506049234e-06, + "loss": 0.379, + "step": 3611 + }, + { + "epoch": 0.5065918653576438, + "grad_norm": 1.9456881731060405, + "learning_rate": 5.1362581139652375e-06, + "loss": 0.3575, + "step": 3612 + }, + { + "epoch": 0.5067321178120617, + "grad_norm": 1.5284935411261253, + "learning_rate": 5.133987693765332e-06, + "loss": 0.419, + "step": 3613 + }, + { + "epoch": 0.5068723702664797, + "grad_norm": 1.9078647465780565, + "learning_rate": 5.131717245918001e-06, + "loss": 0.3807, + "step": 3614 + }, + { + "epoch": 0.5070126227208976, + "grad_norm": 1.6538089541583236, + "learning_rate": 5.129446770891738e-06, + "loss": 0.3516, + "step": 3615 + }, + { + "epoch": 0.5071528751753156, + "grad_norm": 2.3782428887681077, + "learning_rate": 5.1271762691550375e-06, + "loss": 0.3643, + "step": 3616 + }, + { + "epoch": 0.5072931276297336, + "grad_norm": 2.3744836547121904, + "learning_rate": 5.124905741176402e-06, + "loss": 0.3823, + "step": 3617 + }, + { + "epoch": 0.5074333800841515, + "grad_norm": 1.7064750690622985, + "learning_rate": 5.122635187424339e-06, + "loss": 0.3316, + "step": 3618 + }, + { + "epoch": 0.5075736325385695, + "grad_norm": 4.934697188620978, + "learning_rate": 5.120364608367363e-06, + "loss": 0.3847, + "step": 3619 + }, + { + "epoch": 0.5077138849929874, + "grad_norm": 1.9209568342554906, + "learning_rate": 5.11809400447399e-06, + "loss": 0.4058, + "step": 3620 + }, + { + "epoch": 0.5078541374474054, + "grad_norm": 3.247819168816595, + "learning_rate": 5.115823376212744e-06, + "loss": 0.3651, + "step": 3621 + }, + { + "epoch": 0.5079943899018233, + "grad_norm": 1.863488271796615, + "learning_rate": 5.113552724052154e-06, + "loss": 0.3654, + "step": 3622 + }, + { + "epoch": 0.5081346423562413, + "grad_norm": 2.1383164480627426, + "learning_rate": 5.111282048460753e-06, + "loss": 0.3802, + "step": 3623 + }, + { + "epoch": 0.5082748948106592, + "grad_norm": 1.8316205505846053, + "learning_rate": 5.109011349907079e-06, + "loss": 0.3543, + "step": 3624 + }, + { + "epoch": 0.5084151472650772, + "grad_norm": 2.4632399672010936, + "learning_rate": 5.106740628859674e-06, + "loss": 0.363, + "step": 3625 + }, + { + "epoch": 0.508555399719495, + "grad_norm": 1.7758033868540197, + "learning_rate": 5.1044698857870875e-06, + "loss": 0.3646, + "step": 3626 + }, + { + "epoch": 0.508695652173913, + "grad_norm": 1.9179749032288962, + "learning_rate": 5.102199121157869e-06, + "loss": 0.3397, + "step": 3627 + }, + { + "epoch": 0.508835904628331, + "grad_norm": 3.0746252622449557, + "learning_rate": 5.099928335440575e-06, + "loss": 0.3228, + "step": 3628 + }, + { + "epoch": 0.5089761570827489, + "grad_norm": 1.968233010364934, + "learning_rate": 5.097657529103769e-06, + "loss": 0.3754, + "step": 3629 + }, + { + "epoch": 0.5091164095371669, + "grad_norm": 1.7492075215669722, + "learning_rate": 5.095386702616012e-06, + "loss": 0.3433, + "step": 3630 + }, + { + "epoch": 0.5092566619915848, + "grad_norm": 4.336460501256189, + "learning_rate": 5.093115856445876e-06, + "loss": 0.3771, + "step": 3631 + }, + { + "epoch": 0.5093969144460028, + "grad_norm": 2.24498730815197, + "learning_rate": 5.090844991061934e-06, + "loss": 0.3238, + "step": 3632 + }, + { + "epoch": 0.5095371669004207, + "grad_norm": 2.1608264290488095, + "learning_rate": 5.088574106932762e-06, + "loss": 0.3813, + "step": 3633 + }, + { + "epoch": 0.5096774193548387, + "grad_norm": 2.1584342176899267, + "learning_rate": 5.0863032045269435e-06, + "loss": 0.3762, + "step": 3634 + }, + { + "epoch": 0.5098176718092566, + "grad_norm": 1.5673269101450311, + "learning_rate": 5.0840322843130606e-06, + "loss": 0.3247, + "step": 3635 + }, + { + "epoch": 0.5099579242636746, + "grad_norm": 1.6479393807576714, + "learning_rate": 5.081761346759703e-06, + "loss": 0.3493, + "step": 3636 + }, + { + "epoch": 0.5100981767180925, + "grad_norm": 2.0252255820382308, + "learning_rate": 5.079490392335463e-06, + "loss": 0.3473, + "step": 3637 + }, + { + "epoch": 0.5102384291725105, + "grad_norm": 2.540666091417728, + "learning_rate": 5.077219421508936e-06, + "loss": 0.3793, + "step": 3638 + }, + { + "epoch": 0.5103786816269285, + "grad_norm": 4.3856048664415335, + "learning_rate": 5.074948434748721e-06, + "loss": 0.3508, + "step": 3639 + }, + { + "epoch": 0.5105189340813464, + "grad_norm": 2.013121003244975, + "learning_rate": 5.072677432523418e-06, + "loss": 0.3542, + "step": 3640 + }, + { + "epoch": 0.5106591865357644, + "grad_norm": 2.7909712043788404, + "learning_rate": 5.070406415301637e-06, + "loss": 0.3897, + "step": 3641 + }, + { + "epoch": 0.5107994389901823, + "grad_norm": 1.9031620967863687, + "learning_rate": 5.068135383551983e-06, + "loss": 0.357, + "step": 3642 + }, + { + "epoch": 0.5109396914446003, + "grad_norm": 2.06840986238842, + "learning_rate": 5.065864337743068e-06, + "loss": 0.4024, + "step": 3643 + }, + { + "epoch": 0.5110799438990182, + "grad_norm": 1.7308085433049116, + "learning_rate": 5.06359327834351e-06, + "loss": 0.4072, + "step": 3644 + }, + { + "epoch": 0.5112201963534362, + "grad_norm": 1.795656271906842, + "learning_rate": 5.06132220582192e-06, + "loss": 0.3863, + "step": 3645 + }, + { + "epoch": 0.5113604488078541, + "grad_norm": 2.0431594322012825, + "learning_rate": 5.059051120646924e-06, + "loss": 0.3829, + "step": 3646 + }, + { + "epoch": 0.5115007012622721, + "grad_norm": 2.0105892232522793, + "learning_rate": 5.0567800232871404e-06, + "loss": 0.3652, + "step": 3647 + }, + { + "epoch": 0.51164095371669, + "grad_norm": 2.2719089049022347, + "learning_rate": 5.0545089142111945e-06, + "loss": 0.3538, + "step": 3648 + }, + { + "epoch": 0.511781206171108, + "grad_norm": 2.0813496746217575, + "learning_rate": 5.052237793887717e-06, + "loss": 0.3653, + "step": 3649 + }, + { + "epoch": 0.511921458625526, + "grad_norm": 2.3126416549598363, + "learning_rate": 5.049966662785335e-06, + "loss": 0.3948, + "step": 3650 + }, + { + "epoch": 0.5120617110799439, + "grad_norm": 2.4722443006960737, + "learning_rate": 5.047695521372681e-06, + "loss": 0.3796, + "step": 3651 + }, + { + "epoch": 0.5122019635343619, + "grad_norm": 1.6406566798466669, + "learning_rate": 5.045424370118389e-06, + "loss": 0.3172, + "step": 3652 + }, + { + "epoch": 0.5123422159887798, + "grad_norm": 3.292379632022623, + "learning_rate": 5.043153209491095e-06, + "loss": 0.3419, + "step": 3653 + }, + { + "epoch": 0.5124824684431978, + "grad_norm": 1.9069795306927018, + "learning_rate": 5.04088203995944e-06, + "loss": 0.3676, + "step": 3654 + }, + { + "epoch": 0.5126227208976157, + "grad_norm": 2.2450576117245093, + "learning_rate": 5.03861086199206e-06, + "loss": 0.3471, + "step": 3655 + }, + { + "epoch": 0.5127629733520337, + "grad_norm": 1.822749916806008, + "learning_rate": 5.036339676057599e-06, + "loss": 0.3918, + "step": 3656 + }, + { + "epoch": 0.5129032258064516, + "grad_norm": 1.960890932784039, + "learning_rate": 5.0340684826247e-06, + "loss": 0.3236, + "step": 3657 + }, + { + "epoch": 0.5130434782608696, + "grad_norm": 1.8710556504227125, + "learning_rate": 5.031797282162007e-06, + "loss": 0.3751, + "step": 3658 + }, + { + "epoch": 0.5131837307152876, + "grad_norm": 1.8810988102482114, + "learning_rate": 5.029526075138167e-06, + "loss": 0.3539, + "step": 3659 + }, + { + "epoch": 0.5133239831697055, + "grad_norm": 1.880791113259444, + "learning_rate": 5.027254862021829e-06, + "loss": 0.3937, + "step": 3660 + }, + { + "epoch": 0.5134642356241235, + "grad_norm": 1.8354542197615762, + "learning_rate": 5.024983643281639e-06, + "loss": 0.3614, + "step": 3661 + }, + { + "epoch": 0.5136044880785414, + "grad_norm": 1.9082546012440125, + "learning_rate": 5.022712419386248e-06, + "loss": 0.402, + "step": 3662 + }, + { + "epoch": 0.5137447405329594, + "grad_norm": 2.2186705884020026, + "learning_rate": 5.020441190804309e-06, + "loss": 0.376, + "step": 3663 + }, + { + "epoch": 0.5138849929873773, + "grad_norm": 2.4373399730872958, + "learning_rate": 5.018169958004474e-06, + "loss": 0.3995, + "step": 3664 + }, + { + "epoch": 0.5140252454417953, + "grad_norm": 2.752847151315721, + "learning_rate": 5.015898721455394e-06, + "loss": 0.3203, + "step": 3665 + }, + { + "epoch": 0.5141654978962131, + "grad_norm": 2.470483099145603, + "learning_rate": 5.013627481625725e-06, + "loss": 0.3551, + "step": 3666 + }, + { + "epoch": 0.5143057503506311, + "grad_norm": 1.817088731097163, + "learning_rate": 5.011356238984121e-06, + "loss": 0.3939, + "step": 3667 + }, + { + "epoch": 0.514446002805049, + "grad_norm": 2.81747767455658, + "learning_rate": 5.009084993999234e-06, + "loss": 0.358, + "step": 3668 + }, + { + "epoch": 0.514586255259467, + "grad_norm": 2.215296784747574, + "learning_rate": 5.006813747139722e-06, + "loss": 0.3998, + "step": 3669 + }, + { + "epoch": 0.514726507713885, + "grad_norm": 2.158325302325101, + "learning_rate": 5.004542498874244e-06, + "loss": 0.3307, + "step": 3670 + }, + { + "epoch": 0.5148667601683029, + "grad_norm": 2.1141317351987348, + "learning_rate": 5.002271249671451e-06, + "loss": 0.3485, + "step": 3671 + }, + { + "epoch": 0.5150070126227209, + "grad_norm": 1.9612532281697088, + "learning_rate": 5e-06, + "loss": 0.3578, + "step": 3672 + }, + { + "epoch": 0.5151472650771388, + "grad_norm": 2.0698050398859325, + "learning_rate": 4.997728750328551e-06, + "loss": 0.3458, + "step": 3673 + }, + { + "epoch": 0.5152875175315568, + "grad_norm": 2.2935707042998605, + "learning_rate": 4.995457501125758e-06, + "loss": 0.3664, + "step": 3674 + }, + { + "epoch": 0.5154277699859747, + "grad_norm": 1.951666447579458, + "learning_rate": 4.9931862528602784e-06, + "loss": 0.3625, + "step": 3675 + }, + { + "epoch": 0.5155680224403927, + "grad_norm": 3.081016770022228, + "learning_rate": 4.990915006000767e-06, + "loss": 0.3442, + "step": 3676 + }, + { + "epoch": 0.5157082748948106, + "grad_norm": 1.8115296890685593, + "learning_rate": 4.988643761015881e-06, + "loss": 0.3434, + "step": 3677 + }, + { + "epoch": 0.5158485273492286, + "grad_norm": 1.6531970118394341, + "learning_rate": 4.986372518374276e-06, + "loss": 0.3027, + "step": 3678 + }, + { + "epoch": 0.5159887798036465, + "grad_norm": 3.186839016466021, + "learning_rate": 4.984101278544607e-06, + "loss": 0.3046, + "step": 3679 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 1.7695053806056051, + "learning_rate": 4.981830041995527e-06, + "loss": 0.3846, + "step": 3680 + }, + { + "epoch": 0.5162692847124825, + "grad_norm": 1.729302489067152, + "learning_rate": 4.9795588091956906e-06, + "loss": 0.3639, + "step": 3681 + }, + { + "epoch": 0.5164095371669004, + "grad_norm": 1.8667818530624347, + "learning_rate": 4.977287580613752e-06, + "loss": 0.3874, + "step": 3682 + }, + { + "epoch": 0.5165497896213184, + "grad_norm": 2.211993069953479, + "learning_rate": 4.975016356718364e-06, + "loss": 0.3514, + "step": 3683 + }, + { + "epoch": 0.5166900420757363, + "grad_norm": 2.602827343174196, + "learning_rate": 4.9727451379781735e-06, + "loss": 0.3593, + "step": 3684 + }, + { + "epoch": 0.5168302945301543, + "grad_norm": 2.4770237827014014, + "learning_rate": 4.970473924861835e-06, + "loss": 0.3509, + "step": 3685 + }, + { + "epoch": 0.5169705469845722, + "grad_norm": 1.836307179993493, + "learning_rate": 4.968202717837996e-06, + "loss": 0.3625, + "step": 3686 + }, + { + "epoch": 0.5171107994389902, + "grad_norm": 2.473193262491885, + "learning_rate": 4.9659315173753026e-06, + "loss": 0.3937, + "step": 3687 + }, + { + "epoch": 0.5172510518934081, + "grad_norm": 2.1534112128276806, + "learning_rate": 4.963660323942403e-06, + "loss": 0.3567, + "step": 3688 + }, + { + "epoch": 0.5173913043478261, + "grad_norm": 1.8020580341422825, + "learning_rate": 4.961389138007942e-06, + "loss": 0.3647, + "step": 3689 + }, + { + "epoch": 0.517531556802244, + "grad_norm": 1.6571617495935018, + "learning_rate": 4.9591179600405615e-06, + "loss": 0.3688, + "step": 3690 + }, + { + "epoch": 0.517671809256662, + "grad_norm": 2.2593634007579593, + "learning_rate": 4.956846790508906e-06, + "loss": 0.3648, + "step": 3691 + }, + { + "epoch": 0.51781206171108, + "grad_norm": 1.8395922705612158, + "learning_rate": 4.954575629881613e-06, + "loss": 0.3832, + "step": 3692 + }, + { + "epoch": 0.5179523141654979, + "grad_norm": 1.942237844758353, + "learning_rate": 4.9523044786273214e-06, + "loss": 0.3417, + "step": 3693 + }, + { + "epoch": 0.5180925666199159, + "grad_norm": 2.1211825469874213, + "learning_rate": 4.950033337214667e-06, + "loss": 0.3591, + "step": 3694 + }, + { + "epoch": 0.5182328190743338, + "grad_norm": 2.213239462175558, + "learning_rate": 4.947762206112285e-06, + "loss": 0.3899, + "step": 3695 + }, + { + "epoch": 0.5183730715287518, + "grad_norm": 1.825873006752082, + "learning_rate": 4.945491085788806e-06, + "loss": 0.3789, + "step": 3696 + }, + { + "epoch": 0.5185133239831697, + "grad_norm": 1.8896315305283706, + "learning_rate": 4.943219976712862e-06, + "loss": 0.3432, + "step": 3697 + }, + { + "epoch": 0.5186535764375877, + "grad_norm": 2.1163108671823134, + "learning_rate": 4.940948879353078e-06, + "loss": 0.3378, + "step": 3698 + }, + { + "epoch": 0.5187938288920056, + "grad_norm": 2.6058237728980043, + "learning_rate": 4.93867779417808e-06, + "loss": 0.292, + "step": 3699 + }, + { + "epoch": 0.5189340813464236, + "grad_norm": 1.6119031585209922, + "learning_rate": 4.936406721656492e-06, + "loss": 0.3112, + "step": 3700 + }, + { + "epoch": 0.5190743338008416, + "grad_norm": 2.1174694591423013, + "learning_rate": 4.934135662256932e-06, + "loss": 0.3844, + "step": 3701 + }, + { + "epoch": 0.5192145862552595, + "grad_norm": 7.220989193768638, + "learning_rate": 4.9318646164480175e-06, + "loss": 0.3769, + "step": 3702 + }, + { + "epoch": 0.5193548387096775, + "grad_norm": 3.0002214032172283, + "learning_rate": 4.929593584698363e-06, + "loss": 0.3206, + "step": 3703 + }, + { + "epoch": 0.5194950911640954, + "grad_norm": 2.3109269849268466, + "learning_rate": 4.927322567476584e-06, + "loss": 0.3531, + "step": 3704 + }, + { + "epoch": 0.5196353436185134, + "grad_norm": 4.505017344518672, + "learning_rate": 4.925051565251282e-06, + "loss": 0.3527, + "step": 3705 + }, + { + "epoch": 0.5197755960729312, + "grad_norm": 2.2171866053336813, + "learning_rate": 4.922780578491067e-06, + "loss": 0.4189, + "step": 3706 + }, + { + "epoch": 0.5199158485273492, + "grad_norm": 2.1710867789349786, + "learning_rate": 4.92050960766454e-06, + "loss": 0.3455, + "step": 3707 + }, + { + "epoch": 0.5200561009817671, + "grad_norm": 2.3662179906647873, + "learning_rate": 4.918238653240299e-06, + "loss": 0.35, + "step": 3708 + }, + { + "epoch": 0.5201963534361851, + "grad_norm": 1.7285903828289937, + "learning_rate": 4.915967715686941e-06, + "loss": 0.3587, + "step": 3709 + }, + { + "epoch": 0.520336605890603, + "grad_norm": 1.4787423326643403, + "learning_rate": 4.913696795473058e-06, + "loss": 0.3185, + "step": 3710 + }, + { + "epoch": 0.520476858345021, + "grad_norm": 1.82482076836039, + "learning_rate": 4.911425893067239e-06, + "loss": 0.3134, + "step": 3711 + }, + { + "epoch": 0.520617110799439, + "grad_norm": 2.0070776874995264, + "learning_rate": 4.909155008938068e-06, + "loss": 0.3574, + "step": 3712 + }, + { + "epoch": 0.5207573632538569, + "grad_norm": 1.733567333321009, + "learning_rate": 4.906884143554126e-06, + "loss": 0.384, + "step": 3713 + }, + { + "epoch": 0.5208976157082749, + "grad_norm": 3.099018735323684, + "learning_rate": 4.9046132973839895e-06, + "loss": 0.3718, + "step": 3714 + }, + { + "epoch": 0.5210378681626928, + "grad_norm": 3.708477417507696, + "learning_rate": 4.9023424708962334e-06, + "loss": 0.391, + "step": 3715 + }, + { + "epoch": 0.5211781206171108, + "grad_norm": 2.453857021086955, + "learning_rate": 4.900071664559427e-06, + "loss": 0.3457, + "step": 3716 + }, + { + "epoch": 0.5213183730715287, + "grad_norm": 2.3273839603265722, + "learning_rate": 4.897800878842133e-06, + "loss": 0.3587, + "step": 3717 + }, + { + "epoch": 0.5214586255259467, + "grad_norm": 2.996705312370072, + "learning_rate": 4.895530114212913e-06, + "loss": 0.3471, + "step": 3718 + }, + { + "epoch": 0.5215988779803646, + "grad_norm": 6.115569659544578, + "learning_rate": 4.893259371140326e-06, + "loss": 0.3556, + "step": 3719 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 1.8397433826583083, + "learning_rate": 4.890988650092922e-06, + "loss": 0.3339, + "step": 3720 + }, + { + "epoch": 0.5218793828892005, + "grad_norm": 1.6192506170703065, + "learning_rate": 4.8887179515392465e-06, + "loss": 0.3689, + "step": 3721 + }, + { + "epoch": 0.5220196353436185, + "grad_norm": 2.706800839084897, + "learning_rate": 4.886447275947846e-06, + "loss": 0.3595, + "step": 3722 + }, + { + "epoch": 0.5221598877980365, + "grad_norm": 2.034408461241353, + "learning_rate": 4.8841766237872555e-06, + "loss": 0.3373, + "step": 3723 + }, + { + "epoch": 0.5223001402524544, + "grad_norm": 2.7751033454415457, + "learning_rate": 4.8819059955260105e-06, + "loss": 0.3561, + "step": 3724 + }, + { + "epoch": 0.5224403927068724, + "grad_norm": 2.630091860367186, + "learning_rate": 4.87963539163264e-06, + "loss": 0.3449, + "step": 3725 + }, + { + "epoch": 0.5225806451612903, + "grad_norm": 2.647019808612603, + "learning_rate": 4.877364812575663e-06, + "loss": 0.3791, + "step": 3726 + }, + { + "epoch": 0.5227208976157083, + "grad_norm": 1.968073229043711, + "learning_rate": 4.875094258823601e-06, + "loss": 0.3294, + "step": 3727 + }, + { + "epoch": 0.5228611500701262, + "grad_norm": 1.6059381026314938, + "learning_rate": 4.872823730844966e-06, + "loss": 0.349, + "step": 3728 + }, + { + "epoch": 0.5230014025245442, + "grad_norm": 2.0848585903231633, + "learning_rate": 4.8705532291082644e-06, + "loss": 0.356, + "step": 3729 + }, + { + "epoch": 0.5231416549789621, + "grad_norm": 1.9999534482306636, + "learning_rate": 4.868282754082e-06, + "loss": 0.3147, + "step": 3730 + }, + { + "epoch": 0.5232819074333801, + "grad_norm": 1.9251763746040882, + "learning_rate": 4.866012306234669e-06, + "loss": 0.3915, + "step": 3731 + }, + { + "epoch": 0.523422159887798, + "grad_norm": 2.9706849869494767, + "learning_rate": 4.863741886034764e-06, + "loss": 0.3318, + "step": 3732 + }, + { + "epoch": 0.523562412342216, + "grad_norm": 1.9825949908056737, + "learning_rate": 4.861471493950767e-06, + "loss": 0.3618, + "step": 3733 + }, + { + "epoch": 0.523702664796634, + "grad_norm": 2.237727391286987, + "learning_rate": 4.859201130451161e-06, + "loss": 0.3849, + "step": 3734 + }, + { + "epoch": 0.5238429172510519, + "grad_norm": 2.0979564032884186, + "learning_rate": 4.856930796004417e-06, + "loss": 0.3926, + "step": 3735 + }, + { + "epoch": 0.5239831697054699, + "grad_norm": 1.9183888023945277, + "learning_rate": 4.854660491079004e-06, + "loss": 0.3871, + "step": 3736 + }, + { + "epoch": 0.5241234221598878, + "grad_norm": 2.1296195139007383, + "learning_rate": 4.852390216143383e-06, + "loss": 0.3588, + "step": 3737 + }, + { + "epoch": 0.5242636746143058, + "grad_norm": 2.303929815369107, + "learning_rate": 4.850119971666009e-06, + "loss": 0.3681, + "step": 3738 + }, + { + "epoch": 0.5244039270687237, + "grad_norm": 2.1408420647601116, + "learning_rate": 4.847849758115333e-06, + "loss": 0.363, + "step": 3739 + }, + { + "epoch": 0.5245441795231417, + "grad_norm": 1.9511784733841133, + "learning_rate": 4.845579575959795e-06, + "loss": 0.392, + "step": 3740 + }, + { + "epoch": 0.5246844319775597, + "grad_norm": 1.954720783636369, + "learning_rate": 4.843309425667834e-06, + "loss": 0.3235, + "step": 3741 + }, + { + "epoch": 0.5248246844319776, + "grad_norm": 5.039376277173541, + "learning_rate": 4.841039307707878e-06, + "loss": 0.3806, + "step": 3742 + }, + { + "epoch": 0.5249649368863956, + "grad_norm": 3.0999156509737253, + "learning_rate": 4.838769222548349e-06, + "loss": 0.3626, + "step": 3743 + }, + { + "epoch": 0.5251051893408135, + "grad_norm": 2.447411559458202, + "learning_rate": 4.8364991706576655e-06, + "loss": 0.3339, + "step": 3744 + }, + { + "epoch": 0.5252454417952315, + "grad_norm": 2.9730291746718036, + "learning_rate": 4.834229152504239e-06, + "loss": 0.4044, + "step": 3745 + }, + { + "epoch": 0.5253856942496493, + "grad_norm": 1.8721455781280327, + "learning_rate": 4.831959168556464e-06, + "loss": 0.3569, + "step": 3746 + }, + { + "epoch": 0.5255259467040673, + "grad_norm": 2.014664650360418, + "learning_rate": 4.829689219282742e-06, + "loss": 0.3708, + "step": 3747 + }, + { + "epoch": 0.5256661991584852, + "grad_norm": 2.043516009173536, + "learning_rate": 4.827419305151461e-06, + "loss": 0.3498, + "step": 3748 + }, + { + "epoch": 0.5258064516129032, + "grad_norm": 6.739501984079254, + "learning_rate": 4.825149426630999e-06, + "loss": 0.3854, + "step": 3749 + }, + { + "epoch": 0.5259467040673211, + "grad_norm": 2.2036201515826037, + "learning_rate": 4.822879584189732e-06, + "loss": 0.3522, + "step": 3750 + }, + { + "epoch": 0.5260869565217391, + "grad_norm": 2.270229002538842, + "learning_rate": 4.820609778296024e-06, + "loss": 0.3502, + "step": 3751 + }, + { + "epoch": 0.526227208976157, + "grad_norm": 1.8278016676624995, + "learning_rate": 4.818340009418237e-06, + "loss": 0.3142, + "step": 3752 + }, + { + "epoch": 0.526367461430575, + "grad_norm": 1.8311794228347154, + "learning_rate": 4.8160702780247184e-06, + "loss": 0.3682, + "step": 3753 + }, + { + "epoch": 0.526507713884993, + "grad_norm": 2.9541725843735263, + "learning_rate": 4.813800584583813e-06, + "loss": 0.3885, + "step": 3754 + }, + { + "epoch": 0.5266479663394109, + "grad_norm": 1.7103564525541748, + "learning_rate": 4.8115309295638566e-06, + "loss": 0.3606, + "step": 3755 + }, + { + "epoch": 0.5267882187938289, + "grad_norm": 1.836085666602667, + "learning_rate": 4.809261313433176e-06, + "loss": 0.3691, + "step": 3756 + }, + { + "epoch": 0.5269284712482468, + "grad_norm": 2.397690647409258, + "learning_rate": 4.806991736660091e-06, + "loss": 0.3507, + "step": 3757 + }, + { + "epoch": 0.5270687237026648, + "grad_norm": 2.132516805531155, + "learning_rate": 4.8047221997129126e-06, + "loss": 0.3898, + "step": 3758 + }, + { + "epoch": 0.5272089761570827, + "grad_norm": 2.2276906150494495, + "learning_rate": 4.802452703059943e-06, + "loss": 0.296, + "step": 3759 + }, + { + "epoch": 0.5273492286115007, + "grad_norm": 2.158573833405449, + "learning_rate": 4.800183247169478e-06, + "loss": 0.3319, + "step": 3760 + }, + { + "epoch": 0.5274894810659186, + "grad_norm": 1.9254663918018025, + "learning_rate": 4.797913832509806e-06, + "loss": 0.3363, + "step": 3761 + }, + { + "epoch": 0.5276297335203366, + "grad_norm": 1.7542979051633234, + "learning_rate": 4.795644459549201e-06, + "loss": 0.3782, + "step": 3762 + }, + { + "epoch": 0.5277699859747546, + "grad_norm": 1.7331262853035079, + "learning_rate": 4.793375128755934e-06, + "loss": 0.3614, + "step": 3763 + }, + { + "epoch": 0.5279102384291725, + "grad_norm": 2.3612281645638133, + "learning_rate": 4.791105840598266e-06, + "loss": 0.3573, + "step": 3764 + }, + { + "epoch": 0.5280504908835905, + "grad_norm": 1.8234160983440382, + "learning_rate": 4.788836595544448e-06, + "loss": 0.3652, + "step": 3765 + }, + { + "epoch": 0.5281907433380084, + "grad_norm": 2.3850019558112803, + "learning_rate": 4.7865673940627255e-06, + "loss": 0.3768, + "step": 3766 + }, + { + "epoch": 0.5283309957924264, + "grad_norm": 1.6687251571349104, + "learning_rate": 4.7842982366213275e-06, + "loss": 0.3159, + "step": 3767 + }, + { + "epoch": 0.5284712482468443, + "grad_norm": 1.7348096148336387, + "learning_rate": 4.782029123688483e-06, + "loss": 0.3346, + "step": 3768 + }, + { + "epoch": 0.5286115007012623, + "grad_norm": 1.6782108647207223, + "learning_rate": 4.779760055732405e-06, + "loss": 0.3338, + "step": 3769 + }, + { + "epoch": 0.5287517531556802, + "grad_norm": 1.6730762337498548, + "learning_rate": 4.7774910332213005e-06, + "loss": 0.3408, + "step": 3770 + }, + { + "epoch": 0.5288920056100982, + "grad_norm": 3.180567514174041, + "learning_rate": 4.775222056623366e-06, + "loss": 0.4107, + "step": 3771 + }, + { + "epoch": 0.5290322580645161, + "grad_norm": 1.9225132820454358, + "learning_rate": 4.77295312640679e-06, + "loss": 0.3425, + "step": 3772 + }, + { + "epoch": 0.5291725105189341, + "grad_norm": 1.8724585793654311, + "learning_rate": 4.770684243039752e-06, + "loss": 0.3514, + "step": 3773 + }, + { + "epoch": 0.5293127629733521, + "grad_norm": 2.3046916379729803, + "learning_rate": 4.768415406990417e-06, + "loss": 0.3288, + "step": 3774 + }, + { + "epoch": 0.52945301542777, + "grad_norm": 1.8607976263537893, + "learning_rate": 4.766146618726944e-06, + "loss": 0.371, + "step": 3775 + }, + { + "epoch": 0.529593267882188, + "grad_norm": 1.736131895152826, + "learning_rate": 4.763877878717484e-06, + "loss": 0.3425, + "step": 3776 + }, + { + "epoch": 0.5297335203366059, + "grad_norm": 2.523536703799325, + "learning_rate": 4.761609187430174e-06, + "loss": 0.392, + "step": 3777 + }, + { + "epoch": 0.5298737727910239, + "grad_norm": 1.9900356863646342, + "learning_rate": 4.759340545333142e-06, + "loss": 0.3454, + "step": 3778 + }, + { + "epoch": 0.5300140252454418, + "grad_norm": 2.703840480777204, + "learning_rate": 4.757071952894506e-06, + "loss": 0.3768, + "step": 3779 + }, + { + "epoch": 0.5301542776998598, + "grad_norm": 2.5751979973973964, + "learning_rate": 4.754803410582376e-06, + "loss": 0.3776, + "step": 3780 + }, + { + "epoch": 0.5302945301542777, + "grad_norm": 2.456608625249978, + "learning_rate": 4.75253491886485e-06, + "loss": 0.335, + "step": 3781 + }, + { + "epoch": 0.5304347826086957, + "grad_norm": 2.0070395558958634, + "learning_rate": 4.750266478210014e-06, + "loss": 0.36, + "step": 3782 + }, + { + "epoch": 0.5305750350631137, + "grad_norm": 2.003743958919221, + "learning_rate": 4.747998089085945e-06, + "loss": 0.3354, + "step": 3783 + }, + { + "epoch": 0.5307152875175316, + "grad_norm": 1.8612221270357825, + "learning_rate": 4.74572975196071e-06, + "loss": 0.3478, + "step": 3784 + }, + { + "epoch": 0.5308555399719496, + "grad_norm": 2.1073672311396283, + "learning_rate": 4.743461467302364e-06, + "loss": 0.3983, + "step": 3785 + }, + { + "epoch": 0.5309957924263674, + "grad_norm": 1.686334914210802, + "learning_rate": 4.741193235578953e-06, + "loss": 0.338, + "step": 3786 + }, + { + "epoch": 0.5311360448807854, + "grad_norm": 2.02845101814661, + "learning_rate": 4.7389250572585104e-06, + "loss": 0.3569, + "step": 3787 + }, + { + "epoch": 0.5312762973352033, + "grad_norm": 2.451489140012877, + "learning_rate": 4.736656932809056e-06, + "loss": 0.3463, + "step": 3788 + }, + { + "epoch": 0.5314165497896213, + "grad_norm": 2.799396933597216, + "learning_rate": 4.734388862698605e-06, + "loss": 0.3703, + "step": 3789 + }, + { + "epoch": 0.5315568022440392, + "grad_norm": 1.8924983807938576, + "learning_rate": 4.732120847395156e-06, + "loss": 0.381, + "step": 3790 + }, + { + "epoch": 0.5316970546984572, + "grad_norm": 2.076633692684222, + "learning_rate": 4.7298528873666985e-06, + "loss": 0.3935, + "step": 3791 + }, + { + "epoch": 0.5318373071528751, + "grad_norm": 1.825428857449898, + "learning_rate": 4.72758498308121e-06, + "loss": 0.3725, + "step": 3792 + }, + { + "epoch": 0.5319775596072931, + "grad_norm": 1.8969901890478373, + "learning_rate": 4.725317135006658e-06, + "loss": 0.351, + "step": 3793 + }, + { + "epoch": 0.532117812061711, + "grad_norm": 1.6724535217441499, + "learning_rate": 4.723049343610996e-06, + "loss": 0.3038, + "step": 3794 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 1.798939456041783, + "learning_rate": 4.720781609362165e-06, + "loss": 0.341, + "step": 3795 + }, + { + "epoch": 0.532398316970547, + "grad_norm": 1.785256547843999, + "learning_rate": 4.7185139327281e-06, + "loss": 0.3394, + "step": 3796 + }, + { + "epoch": 0.5325385694249649, + "grad_norm": 2.4772830721119776, + "learning_rate": 4.716246314176717e-06, + "loss": 0.3354, + "step": 3797 + }, + { + "epoch": 0.5326788218793829, + "grad_norm": 1.8989217007260413, + "learning_rate": 4.713978754175926e-06, + "loss": 0.3443, + "step": 3798 + }, + { + "epoch": 0.5328190743338008, + "grad_norm": 1.7351941073228352, + "learning_rate": 4.71171125319362e-06, + "loss": 0.3366, + "step": 3799 + }, + { + "epoch": 0.5329593267882188, + "grad_norm": 1.936322438884095, + "learning_rate": 4.709443811697683e-06, + "loss": 0.3549, + "step": 3800 + }, + { + "epoch": 0.5330995792426367, + "grad_norm": 1.5804332810257844, + "learning_rate": 4.707176430155986e-06, + "loss": 0.3533, + "step": 3801 + }, + { + "epoch": 0.5332398316970547, + "grad_norm": 1.7651832500566387, + "learning_rate": 4.704909109036387e-06, + "loss": 0.3259, + "step": 3802 + }, + { + "epoch": 0.5333800841514726, + "grad_norm": 1.9921138974225232, + "learning_rate": 4.70264184880673e-06, + "loss": 0.3786, + "step": 3803 + }, + { + "epoch": 0.5335203366058906, + "grad_norm": 2.000472251449842, + "learning_rate": 4.700374649934851e-06, + "loss": 0.345, + "step": 3804 + }, + { + "epoch": 0.5336605890603086, + "grad_norm": 3.065104097524603, + "learning_rate": 4.69810751288857e-06, + "loss": 0.3585, + "step": 3805 + }, + { + "epoch": 0.5338008415147265, + "grad_norm": 2.2290730560707375, + "learning_rate": 4.695840438135693e-06, + "loss": 0.4052, + "step": 3806 + }, + { + "epoch": 0.5339410939691445, + "grad_norm": 2.058200400483346, + "learning_rate": 4.6935734261440195e-06, + "loss": 0.4094, + "step": 3807 + }, + { + "epoch": 0.5340813464235624, + "grad_norm": 1.6804502662624656, + "learning_rate": 4.6913064773813274e-06, + "loss": 0.3973, + "step": 3808 + }, + { + "epoch": 0.5342215988779804, + "grad_norm": 3.615529151905542, + "learning_rate": 4.689039592315387e-06, + "loss": 0.3786, + "step": 3809 + }, + { + "epoch": 0.5343618513323983, + "grad_norm": 1.7978308747451779, + "learning_rate": 4.686772771413954e-06, + "loss": 0.3478, + "step": 3810 + }, + { + "epoch": 0.5345021037868163, + "grad_norm": 2.670788103296928, + "learning_rate": 4.68450601514477e-06, + "loss": 0.3661, + "step": 3811 + }, + { + "epoch": 0.5346423562412342, + "grad_norm": 2.379760288219893, + "learning_rate": 4.682239323975566e-06, + "loss": 0.3571, + "step": 3812 + }, + { + "epoch": 0.5347826086956522, + "grad_norm": 1.9908688238702827, + "learning_rate": 4.679972698374058e-06, + "loss": 0.396, + "step": 3813 + }, + { + "epoch": 0.5349228611500702, + "grad_norm": 1.8402141352111134, + "learning_rate": 4.6777061388079485e-06, + "loss": 0.3276, + "step": 3814 + }, + { + "epoch": 0.5350631136044881, + "grad_norm": 1.895674339027143, + "learning_rate": 4.675439645744924e-06, + "loss": 0.3148, + "step": 3815 + }, + { + "epoch": 0.5352033660589061, + "grad_norm": 1.7269382672036369, + "learning_rate": 4.673173219652662e-06, + "loss": 0.3524, + "step": 3816 + }, + { + "epoch": 0.535343618513324, + "grad_norm": 1.8932986310120272, + "learning_rate": 4.6709068609988225e-06, + "loss": 0.327, + "step": 3817 + }, + { + "epoch": 0.535483870967742, + "grad_norm": 1.8934102625508196, + "learning_rate": 4.668640570251054e-06, + "loss": 0.3831, + "step": 3818 + }, + { + "epoch": 0.5356241234221599, + "grad_norm": 2.4952139818859562, + "learning_rate": 4.666374347876987e-06, + "loss": 0.377, + "step": 3819 + }, + { + "epoch": 0.5357643758765779, + "grad_norm": 2.088910673526141, + "learning_rate": 4.6641081943442425e-06, + "loss": 0.3608, + "step": 3820 + }, + { + "epoch": 0.5359046283309958, + "grad_norm": 1.7012049499870237, + "learning_rate": 4.661842110120426e-06, + "loss": 0.3244, + "step": 3821 + }, + { + "epoch": 0.5360448807854138, + "grad_norm": 2.0095416390005343, + "learning_rate": 4.659576095673127e-06, + "loss": 0.4246, + "step": 3822 + }, + { + "epoch": 0.5361851332398317, + "grad_norm": 1.9607024640876656, + "learning_rate": 4.657310151469924e-06, + "loss": 0.4061, + "step": 3823 + }, + { + "epoch": 0.5363253856942497, + "grad_norm": 2.121843349574675, + "learning_rate": 4.6550442779783755e-06, + "loss": 0.3376, + "step": 3824 + }, + { + "epoch": 0.5364656381486677, + "grad_norm": 2.966589854528299, + "learning_rate": 4.65277847566603e-06, + "loss": 0.3538, + "step": 3825 + }, + { + "epoch": 0.5366058906030855, + "grad_norm": 1.6712579747240142, + "learning_rate": 4.6505127450004216e-06, + "loss": 0.3317, + "step": 3826 + }, + { + "epoch": 0.5367461430575035, + "grad_norm": 2.1949406101820985, + "learning_rate": 4.648247086449064e-06, + "loss": 0.3328, + "step": 3827 + }, + { + "epoch": 0.5368863955119214, + "grad_norm": 2.4053969419291668, + "learning_rate": 4.645981500479466e-06, + "loss": 0.34, + "step": 3828 + }, + { + "epoch": 0.5370266479663394, + "grad_norm": 3.0017739455309354, + "learning_rate": 4.643715987559111e-06, + "loss": 0.3688, + "step": 3829 + }, + { + "epoch": 0.5371669004207573, + "grad_norm": 1.4903857952898907, + "learning_rate": 4.641450548155473e-06, + "loss": 0.3544, + "step": 3830 + }, + { + "epoch": 0.5373071528751753, + "grad_norm": 2.1454245786014106, + "learning_rate": 4.639185182736008e-06, + "loss": 0.3339, + "step": 3831 + }, + { + "epoch": 0.5374474053295932, + "grad_norm": 2.1925558247087147, + "learning_rate": 4.63691989176816e-06, + "loss": 0.3689, + "step": 3832 + }, + { + "epoch": 0.5375876577840112, + "grad_norm": 1.6358050069560464, + "learning_rate": 4.634654675719355e-06, + "loss": 0.3685, + "step": 3833 + }, + { + "epoch": 0.5377279102384291, + "grad_norm": 1.8921292838631343, + "learning_rate": 4.632389535057007e-06, + "loss": 0.3562, + "step": 3834 + }, + { + "epoch": 0.5378681626928471, + "grad_norm": 1.6850069491868522, + "learning_rate": 4.6301244702485084e-06, + "loss": 0.3216, + "step": 3835 + }, + { + "epoch": 0.538008415147265, + "grad_norm": 2.067552792891055, + "learning_rate": 4.627859481761242e-06, + "loss": 0.3768, + "step": 3836 + }, + { + "epoch": 0.538148667601683, + "grad_norm": 1.8790312346765001, + "learning_rate": 4.625594570062571e-06, + "loss": 0.3639, + "step": 3837 + }, + { + "epoch": 0.538288920056101, + "grad_norm": 1.7960907924871665, + "learning_rate": 4.6233297356198446e-06, + "loss": 0.3216, + "step": 3838 + }, + { + "epoch": 0.5384291725105189, + "grad_norm": 2.1672577662996013, + "learning_rate": 4.621064978900397e-06, + "loss": 0.3975, + "step": 3839 + }, + { + "epoch": 0.5385694249649369, + "grad_norm": 2.421871357576646, + "learning_rate": 4.618800300371543e-06, + "loss": 0.3772, + "step": 3840 + }, + { + "epoch": 0.5387096774193548, + "grad_norm": 2.115980641887648, + "learning_rate": 4.616535700500583e-06, + "loss": 0.3289, + "step": 3841 + }, + { + "epoch": 0.5388499298737728, + "grad_norm": 1.9303108474949164, + "learning_rate": 4.614271179754802e-06, + "loss": 0.3653, + "step": 3842 + }, + { + "epoch": 0.5389901823281907, + "grad_norm": 1.9264690416721257, + "learning_rate": 4.612006738601469e-06, + "loss": 0.3521, + "step": 3843 + }, + { + "epoch": 0.5391304347826087, + "grad_norm": 2.4079605562014854, + "learning_rate": 4.609742377507834e-06, + "loss": 0.3744, + "step": 3844 + }, + { + "epoch": 0.5392706872370266, + "grad_norm": 2.2634823895092286, + "learning_rate": 4.607478096941133e-06, + "loss": 0.3491, + "step": 3845 + }, + { + "epoch": 0.5394109396914446, + "grad_norm": 3.1666324680556377, + "learning_rate": 4.605213897368584e-06, + "loss": 0.3527, + "step": 3846 + }, + { + "epoch": 0.5395511921458626, + "grad_norm": 1.93498953330982, + "learning_rate": 4.60294977925739e-06, + "loss": 0.3215, + "step": 3847 + }, + { + "epoch": 0.5396914446002805, + "grad_norm": 1.8191757909755943, + "learning_rate": 4.600685743074736e-06, + "loss": 0.3486, + "step": 3848 + }, + { + "epoch": 0.5398316970546985, + "grad_norm": 2.4975145382691903, + "learning_rate": 4.598421789287787e-06, + "loss": 0.3928, + "step": 3849 + }, + { + "epoch": 0.5399719495091164, + "grad_norm": 2.063299890944582, + "learning_rate": 4.596157918363699e-06, + "loss": 0.3489, + "step": 3850 + }, + { + "epoch": 0.5401122019635344, + "grad_norm": 2.0370839776625704, + "learning_rate": 4.5938941307696004e-06, + "loss": 0.3458, + "step": 3851 + }, + { + "epoch": 0.5402524544179523, + "grad_norm": 1.9805628887728552, + "learning_rate": 4.591630426972611e-06, + "loss": 0.3546, + "step": 3852 + }, + { + "epoch": 0.5403927068723703, + "grad_norm": 1.8024059587514745, + "learning_rate": 4.58936680743983e-06, + "loss": 0.3526, + "step": 3853 + }, + { + "epoch": 0.5405329593267882, + "grad_norm": 2.997080813153648, + "learning_rate": 4.587103272638339e-06, + "loss": 0.3765, + "step": 3854 + }, + { + "epoch": 0.5406732117812062, + "grad_norm": 1.6489810860904184, + "learning_rate": 4.584839823035204e-06, + "loss": 0.3616, + "step": 3855 + }, + { + "epoch": 0.5408134642356242, + "grad_norm": 2.035892053361841, + "learning_rate": 4.58257645909747e-06, + "loss": 0.3377, + "step": 3856 + }, + { + "epoch": 0.5409537166900421, + "grad_norm": 2.7038778678861553, + "learning_rate": 4.580313181292168e-06, + "loss": 0.3654, + "step": 3857 + }, + { + "epoch": 0.5410939691444601, + "grad_norm": 3.0510882077889825, + "learning_rate": 4.578049990086309e-06, + "loss": 0.3724, + "step": 3858 + }, + { + "epoch": 0.541234221598878, + "grad_norm": 2.3639348915286815, + "learning_rate": 4.575786885946886e-06, + "loss": 0.3758, + "step": 3859 + }, + { + "epoch": 0.541374474053296, + "grad_norm": 2.8292038688030243, + "learning_rate": 4.573523869340875e-06, + "loss": 0.3558, + "step": 3860 + }, + { + "epoch": 0.5415147265077139, + "grad_norm": 1.904698412694173, + "learning_rate": 4.571260940735235e-06, + "loss": 0.3687, + "step": 3861 + }, + { + "epoch": 0.5416549789621319, + "grad_norm": 1.8712332719298508, + "learning_rate": 4.568998100596903e-06, + "loss": 0.3761, + "step": 3862 + }, + { + "epoch": 0.5417952314165498, + "grad_norm": 1.8831217025186084, + "learning_rate": 4.566735349392802e-06, + "loss": 0.3501, + "step": 3863 + }, + { + "epoch": 0.5419354838709678, + "grad_norm": 1.6454830360245232, + "learning_rate": 4.564472687589836e-06, + "loss": 0.3509, + "step": 3864 + }, + { + "epoch": 0.5420757363253857, + "grad_norm": 1.8091647274663516, + "learning_rate": 4.562210115654887e-06, + "loss": 0.3747, + "step": 3865 + }, + { + "epoch": 0.5422159887798036, + "grad_norm": 1.6665782269219154, + "learning_rate": 4.5599476340548225e-06, + "loss": 0.3904, + "step": 3866 + }, + { + "epoch": 0.5423562412342215, + "grad_norm": 3.104678340105464, + "learning_rate": 4.5576852432564896e-06, + "loss": 0.3847, + "step": 3867 + }, + { + "epoch": 0.5424964936886395, + "grad_norm": 2.6605894657456677, + "learning_rate": 4.555422943726715e-06, + "loss": 0.402, + "step": 3868 + }, + { + "epoch": 0.5426367461430575, + "grad_norm": 2.0226962230352714, + "learning_rate": 4.5531607359323125e-06, + "loss": 0.3683, + "step": 3869 + }, + { + "epoch": 0.5427769985974754, + "grad_norm": 1.9697688660360277, + "learning_rate": 4.550898620340069e-06, + "loss": 0.3541, + "step": 3870 + }, + { + "epoch": 0.5429172510518934, + "grad_norm": 1.8209327656056056, + "learning_rate": 4.548636597416758e-06, + "loss": 0.3945, + "step": 3871 + }, + { + "epoch": 0.5430575035063113, + "grad_norm": 3.081041532509555, + "learning_rate": 4.546374667629131e-06, + "loss": 0.3717, + "step": 3872 + }, + { + "epoch": 0.5431977559607293, + "grad_norm": 2.605151018389007, + "learning_rate": 4.544112831443921e-06, + "loss": 0.3905, + "step": 3873 + }, + { + "epoch": 0.5433380084151472, + "grad_norm": 1.8229765164677507, + "learning_rate": 4.541851089327844e-06, + "loss": 0.3509, + "step": 3874 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 1.8193377066341037, + "learning_rate": 4.539589441747595e-06, + "loss": 0.2899, + "step": 3875 + }, + { + "epoch": 0.5436185133239831, + "grad_norm": 1.967936393105313, + "learning_rate": 4.537327889169847e-06, + "loss": 0.3223, + "step": 3876 + }, + { + "epoch": 0.5437587657784011, + "grad_norm": 2.2267729007292414, + "learning_rate": 4.535066432061256e-06, + "loss": 0.3587, + "step": 3877 + }, + { + "epoch": 0.543899018232819, + "grad_norm": 1.7616622475911814, + "learning_rate": 4.532805070888459e-06, + "loss": 0.3764, + "step": 3878 + }, + { + "epoch": 0.544039270687237, + "grad_norm": 3.1818069779830456, + "learning_rate": 4.53054380611807e-06, + "loss": 0.3705, + "step": 3879 + }, + { + "epoch": 0.544179523141655, + "grad_norm": 1.835864710886692, + "learning_rate": 4.528282638216689e-06, + "loss": 0.331, + "step": 3880 + }, + { + "epoch": 0.5443197755960729, + "grad_norm": 2.011107001759995, + "learning_rate": 4.526021567650889e-06, + "loss": 0.3621, + "step": 3881 + }, + { + "epoch": 0.5444600280504909, + "grad_norm": 1.8267039457169438, + "learning_rate": 4.523760594887228e-06, + "loss": 0.3491, + "step": 3882 + }, + { + "epoch": 0.5446002805049088, + "grad_norm": 2.0383710017535557, + "learning_rate": 4.5214997203922394e-06, + "loss": 0.4078, + "step": 3883 + }, + { + "epoch": 0.5447405329593268, + "grad_norm": 2.6147167784742043, + "learning_rate": 4.519238944632442e-06, + "loss": 0.3195, + "step": 3884 + }, + { + "epoch": 0.5448807854137447, + "grad_norm": 2.177517265137506, + "learning_rate": 4.516978268074328e-06, + "loss": 0.3893, + "step": 3885 + }, + { + "epoch": 0.5450210378681627, + "grad_norm": 2.8368955733046377, + "learning_rate": 4.5147176911843746e-06, + "loss": 0.3465, + "step": 3886 + }, + { + "epoch": 0.5451612903225806, + "grad_norm": 6.072248977240768, + "learning_rate": 4.5124572144290345e-06, + "loss": 0.3514, + "step": 3887 + }, + { + "epoch": 0.5453015427769986, + "grad_norm": 2.030698261154632, + "learning_rate": 4.510196838274742e-06, + "loss": 0.3747, + "step": 3888 + }, + { + "epoch": 0.5454417952314166, + "grad_norm": 1.682193891047895, + "learning_rate": 4.507936563187911e-06, + "loss": 0.3856, + "step": 3889 + }, + { + "epoch": 0.5455820476858345, + "grad_norm": 2.372250420936067, + "learning_rate": 4.505676389634932e-06, + "loss": 0.3155, + "step": 3890 + }, + { + "epoch": 0.5457223001402525, + "grad_norm": 2.2192439214552913, + "learning_rate": 4.5034163180821775e-06, + "loss": 0.3842, + "step": 3891 + }, + { + "epoch": 0.5458625525946704, + "grad_norm": 2.173165137134506, + "learning_rate": 4.5011563489959945e-06, + "loss": 0.3311, + "step": 3892 + }, + { + "epoch": 0.5460028050490884, + "grad_norm": 2.2312166526717365, + "learning_rate": 4.498896482842715e-06, + "loss": 0.3745, + "step": 3893 + }, + { + "epoch": 0.5461430575035063, + "grad_norm": 1.7927490631684098, + "learning_rate": 4.496636720088643e-06, + "loss": 0.3468, + "step": 3894 + }, + { + "epoch": 0.5462833099579243, + "grad_norm": 2.29388866784975, + "learning_rate": 4.4943770612000686e-06, + "loss": 0.3566, + "step": 3895 + }, + { + "epoch": 0.5464235624123422, + "grad_norm": 1.7380833202359072, + "learning_rate": 4.492117506643256e-06, + "loss": 0.3688, + "step": 3896 + }, + { + "epoch": 0.5465638148667602, + "grad_norm": 1.9628051131871231, + "learning_rate": 4.489858056884446e-06, + "loss": 0.3568, + "step": 3897 + }, + { + "epoch": 0.5467040673211782, + "grad_norm": 1.7493356397316042, + "learning_rate": 4.487598712389862e-06, + "loss": 0.3526, + "step": 3898 + }, + { + "epoch": 0.5468443197755961, + "grad_norm": 1.8857004011869167, + "learning_rate": 4.485339473625704e-06, + "loss": 0.3635, + "step": 3899 + }, + { + "epoch": 0.5469845722300141, + "grad_norm": 2.77714115053065, + "learning_rate": 4.4830803410581506e-06, + "loss": 0.4128, + "step": 3900 + }, + { + "epoch": 0.547124824684432, + "grad_norm": 2.2753753268664916, + "learning_rate": 4.480821315153356e-06, + "loss": 0.3294, + "step": 3901 + }, + { + "epoch": 0.54726507713885, + "grad_norm": 2.0920410013794273, + "learning_rate": 4.478562396377457e-06, + "loss": 0.3432, + "step": 3902 + }, + { + "epoch": 0.5474053295932679, + "grad_norm": 2.90700595007131, + "learning_rate": 4.476303585196563e-06, + "loss": 0.3558, + "step": 3903 + }, + { + "epoch": 0.5475455820476859, + "grad_norm": 3.0075940337665656, + "learning_rate": 4.474044882076766e-06, + "loss": 0.3309, + "step": 3904 + }, + { + "epoch": 0.5476858345021038, + "grad_norm": 2.2447326251155295, + "learning_rate": 4.471786287484134e-06, + "loss": 0.2912, + "step": 3905 + }, + { + "epoch": 0.5478260869565217, + "grad_norm": 2.0240551804796993, + "learning_rate": 4.46952780188471e-06, + "loss": 0.3622, + "step": 3906 + }, + { + "epoch": 0.5479663394109396, + "grad_norm": 2.793983456545589, + "learning_rate": 4.467269425744518e-06, + "loss": 0.3768, + "step": 3907 + }, + { + "epoch": 0.5481065918653576, + "grad_norm": 1.9396775837038343, + "learning_rate": 4.465011159529559e-06, + "loss": 0.3917, + "step": 3908 + }, + { + "epoch": 0.5482468443197756, + "grad_norm": 2.1350374458834924, + "learning_rate": 4.462753003705808e-06, + "loss": 0.3438, + "step": 3909 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 2.0753567262591863, + "learning_rate": 4.460494958739223e-06, + "loss": 0.3568, + "step": 3910 + }, + { + "epoch": 0.5485273492286115, + "grad_norm": 2.484770641372815, + "learning_rate": 4.458237025095733e-06, + "loss": 0.3791, + "step": 3911 + }, + { + "epoch": 0.5486676016830294, + "grad_norm": 1.9310991938205846, + "learning_rate": 4.45597920324125e-06, + "loss": 0.3475, + "step": 3912 + }, + { + "epoch": 0.5488078541374474, + "grad_norm": 2.3812475850250734, + "learning_rate": 4.453721493641655e-06, + "loss": 0.3669, + "step": 3913 + }, + { + "epoch": 0.5489481065918653, + "grad_norm": 2.515563395438916, + "learning_rate": 4.451463896762813e-06, + "loss": 0.3357, + "step": 3914 + }, + { + "epoch": 0.5490883590462833, + "grad_norm": 2.297783289266765, + "learning_rate": 4.449206413070565e-06, + "loss": 0.3753, + "step": 3915 + }, + { + "epoch": 0.5492286115007012, + "grad_norm": 1.8882573963910838, + "learning_rate": 4.446949043030724e-06, + "loss": 0.3422, + "step": 3916 + }, + { + "epoch": 0.5493688639551192, + "grad_norm": 1.9788357775971421, + "learning_rate": 4.444691787109085e-06, + "loss": 0.3183, + "step": 3917 + }, + { + "epoch": 0.5495091164095371, + "grad_norm": 2.0455493856849616, + "learning_rate": 4.442434645771416e-06, + "loss": 0.3287, + "step": 3918 + }, + { + "epoch": 0.5496493688639551, + "grad_norm": 2.2545894863214135, + "learning_rate": 4.4401776194834615e-06, + "loss": 0.3719, + "step": 3919 + }, + { + "epoch": 0.5497896213183731, + "grad_norm": 1.9582421978245121, + "learning_rate": 4.437920708710944e-06, + "loss": 0.3928, + "step": 3920 + }, + { + "epoch": 0.549929873772791, + "grad_norm": 1.843475062674204, + "learning_rate": 4.435663913919563e-06, + "loss": 0.3843, + "step": 3921 + }, + { + "epoch": 0.550070126227209, + "grad_norm": 2.362382601288497, + "learning_rate": 4.433407235574989e-06, + "loss": 0.3627, + "step": 3922 + }, + { + "epoch": 0.5502103786816269, + "grad_norm": 1.682032593736763, + "learning_rate": 4.431150674142874e-06, + "loss": 0.3514, + "step": 3923 + }, + { + "epoch": 0.5503506311360449, + "grad_norm": 1.9547678637431727, + "learning_rate": 4.428894230088842e-06, + "loss": 0.3493, + "step": 3924 + }, + { + "epoch": 0.5504908835904628, + "grad_norm": 2.3956817800160928, + "learning_rate": 4.426637903878498e-06, + "loss": 0.4052, + "step": 3925 + }, + { + "epoch": 0.5506311360448808, + "grad_norm": 1.972846477136798, + "learning_rate": 4.424381695977415e-06, + "loss": 0.3218, + "step": 3926 + }, + { + "epoch": 0.5507713884992987, + "grad_norm": 2.6929020372343873, + "learning_rate": 4.422125606851147e-06, + "loss": 0.3498, + "step": 3927 + }, + { + "epoch": 0.5509116409537167, + "grad_norm": 2.248596177324145, + "learning_rate": 4.419869636965223e-06, + "loss": 0.3458, + "step": 3928 + }, + { + "epoch": 0.5510518934081347, + "grad_norm": 2.1531526738175297, + "learning_rate": 4.417613786785147e-06, + "loss": 0.3578, + "step": 3929 + }, + { + "epoch": 0.5511921458625526, + "grad_norm": 1.8298157266694628, + "learning_rate": 4.415358056776398e-06, + "loss": 0.3121, + "step": 3930 + }, + { + "epoch": 0.5513323983169706, + "grad_norm": 2.1541765720673407, + "learning_rate": 4.413102447404428e-06, + "loss": 0.3728, + "step": 3931 + }, + { + "epoch": 0.5514726507713885, + "grad_norm": 2.0610473603531414, + "learning_rate": 4.410846959134667e-06, + "loss": 0.3419, + "step": 3932 + }, + { + "epoch": 0.5516129032258065, + "grad_norm": 1.6841794298541857, + "learning_rate": 4.4085915924325226e-06, + "loss": 0.3509, + "step": 3933 + }, + { + "epoch": 0.5517531556802244, + "grad_norm": 2.2575638050005766, + "learning_rate": 4.406336347763369e-06, + "loss": 0.3978, + "step": 3934 + }, + { + "epoch": 0.5518934081346424, + "grad_norm": 2.884226382509099, + "learning_rate": 4.404081225592562e-06, + "loss": 0.331, + "step": 3935 + }, + { + "epoch": 0.5520336605890603, + "grad_norm": 2.002456229656637, + "learning_rate": 4.401826226385431e-06, + "loss": 0.3618, + "step": 3936 + }, + { + "epoch": 0.5521739130434783, + "grad_norm": 2.516485410651428, + "learning_rate": 4.399571350607281e-06, + "loss": 0.3783, + "step": 3937 + }, + { + "epoch": 0.5523141654978962, + "grad_norm": 1.9928447761093335, + "learning_rate": 4.397316598723385e-06, + "loss": 0.4104, + "step": 3938 + }, + { + "epoch": 0.5524544179523142, + "grad_norm": 5.061050772746348, + "learning_rate": 4.395061971199e-06, + "loss": 0.3787, + "step": 3939 + }, + { + "epoch": 0.5525946704067322, + "grad_norm": 1.8323935209514903, + "learning_rate": 4.39280746849935e-06, + "loss": 0.3743, + "step": 3940 + }, + { + "epoch": 0.5527349228611501, + "grad_norm": 3.5232849764746055, + "learning_rate": 4.390553091089637e-06, + "loss": 0.3463, + "step": 3941 + }, + { + "epoch": 0.5528751753155681, + "grad_norm": 1.9255368462590554, + "learning_rate": 4.388298839435036e-06, + "loss": 0.3465, + "step": 3942 + }, + { + "epoch": 0.553015427769986, + "grad_norm": 1.9038140413529716, + "learning_rate": 4.386044714000695e-06, + "loss": 0.3391, + "step": 3943 + }, + { + "epoch": 0.553155680224404, + "grad_norm": 1.9011684915400022, + "learning_rate": 4.383790715251739e-06, + "loss": 0.3737, + "step": 3944 + }, + { + "epoch": 0.5532959326788219, + "grad_norm": 2.0758023009347295, + "learning_rate": 4.381536843653262e-06, + "loss": 0.3487, + "step": 3945 + }, + { + "epoch": 0.5534361851332398, + "grad_norm": 2.2002244618089604, + "learning_rate": 4.379283099670338e-06, + "loss": 0.3625, + "step": 3946 + }, + { + "epoch": 0.5535764375876577, + "grad_norm": 2.922614019436026, + "learning_rate": 4.377029483768009e-06, + "loss": 0.3633, + "step": 3947 + }, + { + "epoch": 0.5537166900420757, + "grad_norm": 1.7813143802184654, + "learning_rate": 4.3747759964112936e-06, + "loss": 0.3572, + "step": 3948 + }, + { + "epoch": 0.5538569424964936, + "grad_norm": 2.1892607006278904, + "learning_rate": 4.372522638065183e-06, + "loss": 0.3643, + "step": 3949 + }, + { + "epoch": 0.5539971949509116, + "grad_norm": 2.387352692218868, + "learning_rate": 4.370269409194642e-06, + "loss": 0.3452, + "step": 3950 + }, + { + "epoch": 0.5541374474053296, + "grad_norm": 1.8691123713128777, + "learning_rate": 4.36801631026461e-06, + "loss": 0.3797, + "step": 3951 + }, + { + "epoch": 0.5542776998597475, + "grad_norm": 3.09714791551377, + "learning_rate": 4.365763341739996e-06, + "loss": 0.3598, + "step": 3952 + }, + { + "epoch": 0.5544179523141655, + "grad_norm": 1.8317313853739752, + "learning_rate": 4.363510504085685e-06, + "loss": 0.2957, + "step": 3953 + }, + { + "epoch": 0.5545582047685834, + "grad_norm": 2.040786073178243, + "learning_rate": 4.361257797766537e-06, + "loss": 0.417, + "step": 3954 + }, + { + "epoch": 0.5546984572230014, + "grad_norm": 2.564329285257168, + "learning_rate": 4.359005223247378e-06, + "loss": 0.3398, + "step": 3955 + }, + { + "epoch": 0.5548387096774193, + "grad_norm": 2.1021277365958464, + "learning_rate": 4.356752780993012e-06, + "loss": 0.3701, + "step": 3956 + }, + { + "epoch": 0.5549789621318373, + "grad_norm": 3.10998651826641, + "learning_rate": 4.354500471468217e-06, + "loss": 0.3333, + "step": 3957 + }, + { + "epoch": 0.5551192145862552, + "grad_norm": 2.1287674884895367, + "learning_rate": 4.352248295137739e-06, + "loss": 0.3716, + "step": 3958 + }, + { + "epoch": 0.5552594670406732, + "grad_norm": 1.9927558836088566, + "learning_rate": 4.3499962524662995e-06, + "loss": 0.3757, + "step": 3959 + }, + { + "epoch": 0.5553997194950911, + "grad_norm": 1.9628789645176046, + "learning_rate": 4.347744343918593e-06, + "loss": 0.4037, + "step": 3960 + }, + { + "epoch": 0.5555399719495091, + "grad_norm": 2.5501542773980352, + "learning_rate": 4.345492569959283e-06, + "loss": 0.3686, + "step": 3961 + }, + { + "epoch": 0.5556802244039271, + "grad_norm": 1.980111535716656, + "learning_rate": 4.3432409310530096e-06, + "loss": 0.3881, + "step": 3962 + }, + { + "epoch": 0.555820476858345, + "grad_norm": 1.771262831119859, + "learning_rate": 4.340989427664381e-06, + "loss": 0.3829, + "step": 3963 + }, + { + "epoch": 0.555960729312763, + "grad_norm": 1.5055358776731897, + "learning_rate": 4.338738060257979e-06, + "loss": 0.3202, + "step": 3964 + }, + { + "epoch": 0.5561009817671809, + "grad_norm": 1.5640431222849003, + "learning_rate": 4.336486829298359e-06, + "loss": 0.3532, + "step": 3965 + }, + { + "epoch": 0.5562412342215989, + "grad_norm": 1.8157504302671275, + "learning_rate": 4.334235735250047e-06, + "loss": 0.3442, + "step": 3966 + }, + { + "epoch": 0.5563814866760168, + "grad_norm": 3.753603538986645, + "learning_rate": 4.331984778577539e-06, + "loss": 0.3288, + "step": 3967 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 1.8673389325523846, + "learning_rate": 4.3297339597453046e-06, + "loss": 0.3055, + "step": 3968 + }, + { + "epoch": 0.5566619915848527, + "grad_norm": 2.63599246724721, + "learning_rate": 4.3274832792177845e-06, + "loss": 0.4148, + "step": 3969 + }, + { + "epoch": 0.5568022440392707, + "grad_norm": 2.0529265889952, + "learning_rate": 4.325232737459391e-06, + "loss": 0.3447, + "step": 3970 + }, + { + "epoch": 0.5569424964936887, + "grad_norm": 2.863405316047186, + "learning_rate": 4.322982334934509e-06, + "loss": 0.3627, + "step": 3971 + }, + { + "epoch": 0.5570827489481066, + "grad_norm": 7.75094008127754, + "learning_rate": 4.320732072107491e-06, + "loss": 0.3467, + "step": 3972 + }, + { + "epoch": 0.5572230014025246, + "grad_norm": 2.0167040634891387, + "learning_rate": 4.318481949442665e-06, + "loss": 0.3709, + "step": 3973 + }, + { + "epoch": 0.5573632538569425, + "grad_norm": 2.9767971797352577, + "learning_rate": 4.316231967404326e-06, + "loss": 0.3656, + "step": 3974 + }, + { + "epoch": 0.5575035063113605, + "grad_norm": 2.058524259085654, + "learning_rate": 4.313982126456747e-06, + "loss": 0.3359, + "step": 3975 + }, + { + "epoch": 0.5576437587657784, + "grad_norm": 1.7439150605646927, + "learning_rate": 4.31173242706416e-06, + "loss": 0.3943, + "step": 3976 + }, + { + "epoch": 0.5577840112201964, + "grad_norm": 1.974679826787042, + "learning_rate": 4.309482869690779e-06, + "loss": 0.3596, + "step": 3977 + }, + { + "epoch": 0.5579242636746143, + "grad_norm": 2.6952937471041465, + "learning_rate": 4.307233454800783e-06, + "loss": 0.3331, + "step": 3978 + }, + { + "epoch": 0.5580645161290323, + "grad_norm": 1.5705695250338612, + "learning_rate": 4.304984182858324e-06, + "loss": 0.3316, + "step": 3979 + }, + { + "epoch": 0.5582047685834503, + "grad_norm": 2.4750063847931476, + "learning_rate": 4.302735054327523e-06, + "loss": 0.3354, + "step": 3980 + }, + { + "epoch": 0.5583450210378682, + "grad_norm": 1.8427319059558782, + "learning_rate": 4.300486069672471e-06, + "loss": 0.3303, + "step": 3981 + }, + { + "epoch": 0.5584852734922862, + "grad_norm": 2.2387593609195107, + "learning_rate": 4.298237229357233e-06, + "loss": 0.3516, + "step": 3982 + }, + { + "epoch": 0.5586255259467041, + "grad_norm": 1.9649249869629832, + "learning_rate": 4.2959885338458385e-06, + "loss": 0.3727, + "step": 3983 + }, + { + "epoch": 0.5587657784011221, + "grad_norm": 1.9914933734451215, + "learning_rate": 4.293739983602292e-06, + "loss": 0.3451, + "step": 3984 + }, + { + "epoch": 0.55890603085554, + "grad_norm": 1.8368202846668076, + "learning_rate": 4.291491579090565e-06, + "loss": 0.3055, + "step": 3985 + }, + { + "epoch": 0.5590462833099579, + "grad_norm": 1.9741925894872447, + "learning_rate": 4.289243320774601e-06, + "loss": 0.355, + "step": 3986 + }, + { + "epoch": 0.5591865357643758, + "grad_norm": 2.2796349229822037, + "learning_rate": 4.286995209118313e-06, + "loss": 0.3685, + "step": 3987 + }, + { + "epoch": 0.5593267882187938, + "grad_norm": 2.42933392175206, + "learning_rate": 4.284747244585581e-06, + "loss": 0.3602, + "step": 3988 + }, + { + "epoch": 0.5594670406732117, + "grad_norm": 2.359462509048195, + "learning_rate": 4.282499427640258e-06, + "loss": 0.3937, + "step": 3989 + }, + { + "epoch": 0.5596072931276297, + "grad_norm": 2.4346362188043242, + "learning_rate": 4.280251758746165e-06, + "loss": 0.407, + "step": 3990 + }, + { + "epoch": 0.5597475455820476, + "grad_norm": 1.9350033332736856, + "learning_rate": 4.278004238367093e-06, + "loss": 0.349, + "step": 3991 + }, + { + "epoch": 0.5598877980364656, + "grad_norm": 2.212681490248612, + "learning_rate": 4.275756866966804e-06, + "loss": 0.3662, + "step": 3992 + }, + { + "epoch": 0.5600280504908836, + "grad_norm": 1.782911763616484, + "learning_rate": 4.273509645009023e-06, + "loss": 0.3814, + "step": 3993 + }, + { + "epoch": 0.5601683029453015, + "grad_norm": 2.508265754406132, + "learning_rate": 4.271262572957453e-06, + "loss": 0.3607, + "step": 3994 + }, + { + "epoch": 0.5603085553997195, + "grad_norm": 2.0931657573824434, + "learning_rate": 4.269015651275761e-06, + "loss": 0.3543, + "step": 3995 + }, + { + "epoch": 0.5604488078541374, + "grad_norm": 2.481147829349639, + "learning_rate": 4.26676888042758e-06, + "loss": 0.3574, + "step": 3996 + }, + { + "epoch": 0.5605890603085554, + "grad_norm": 1.5637285361886657, + "learning_rate": 4.264522260876518e-06, + "loss": 0.3703, + "step": 3997 + }, + { + "epoch": 0.5607293127629733, + "grad_norm": 2.0011552811842517, + "learning_rate": 4.262275793086149e-06, + "loss": 0.3532, + "step": 3998 + }, + { + "epoch": 0.5608695652173913, + "grad_norm": 4.584021198789568, + "learning_rate": 4.260029477520016e-06, + "loss": 0.3791, + "step": 3999 + }, + { + "epoch": 0.5610098176718092, + "grad_norm": 2.3692501134016046, + "learning_rate": 4.25778331464163e-06, + "loss": 0.3675, + "step": 4000 + }, + { + "epoch": 0.5611500701262272, + "grad_norm": 1.995452419504221, + "learning_rate": 4.255537304914472e-06, + "loss": 0.3515, + "step": 4001 + }, + { + "epoch": 0.5612903225806452, + "grad_norm": 1.9120007056430488, + "learning_rate": 4.253291448801989e-06, + "loss": 0.3505, + "step": 4002 + }, + { + "epoch": 0.5614305750350631, + "grad_norm": 2.0482409629552576, + "learning_rate": 4.251045746767601e-06, + "loss": 0.3517, + "step": 4003 + }, + { + "epoch": 0.5615708274894811, + "grad_norm": 3.569560766901653, + "learning_rate": 4.248800199274689e-06, + "loss": 0.3418, + "step": 4004 + }, + { + "epoch": 0.561711079943899, + "grad_norm": 1.836702163212503, + "learning_rate": 4.246554806786607e-06, + "loss": 0.3388, + "step": 4005 + }, + { + "epoch": 0.561851332398317, + "grad_norm": 2.824366479916942, + "learning_rate": 4.244309569766677e-06, + "loss": 0.4082, + "step": 4006 + }, + { + "epoch": 0.5619915848527349, + "grad_norm": 3.0586480029910557, + "learning_rate": 4.242064488678188e-06, + "loss": 0.3457, + "step": 4007 + }, + { + "epoch": 0.5621318373071529, + "grad_norm": 2.000085471234275, + "learning_rate": 4.239819563984397e-06, + "loss": 0.3829, + "step": 4008 + }, + { + "epoch": 0.5622720897615708, + "grad_norm": 1.816690139735518, + "learning_rate": 4.237574796148527e-06, + "loss": 0.3454, + "step": 4009 + }, + { + "epoch": 0.5624123422159888, + "grad_norm": 2.294144960747169, + "learning_rate": 4.23533018563377e-06, + "loss": 0.3405, + "step": 4010 + }, + { + "epoch": 0.5625525946704067, + "grad_norm": 2.0517414279376136, + "learning_rate": 4.233085732903288e-06, + "loss": 0.3738, + "step": 4011 + }, + { + "epoch": 0.5626928471248247, + "grad_norm": 2.931601100301933, + "learning_rate": 4.230841438420209e-06, + "loss": 0.3538, + "step": 4012 + }, + { + "epoch": 0.5628330995792427, + "grad_norm": 2.27883365111371, + "learning_rate": 4.228597302647622e-06, + "loss": 0.3612, + "step": 4013 + }, + { + "epoch": 0.5629733520336606, + "grad_norm": 3.1511775888599494, + "learning_rate": 4.226353326048594e-06, + "loss": 0.3501, + "step": 4014 + }, + { + "epoch": 0.5631136044880786, + "grad_norm": 1.6144590350471384, + "learning_rate": 4.224109509086151e-06, + "loss": 0.3443, + "step": 4015 + }, + { + "epoch": 0.5632538569424965, + "grad_norm": 1.890248221693349, + "learning_rate": 4.221865852223293e-06, + "loss": 0.3369, + "step": 4016 + }, + { + "epoch": 0.5633941093969145, + "grad_norm": 2.230133726275793, + "learning_rate": 4.219622355922976e-06, + "loss": 0.3472, + "step": 4017 + }, + { + "epoch": 0.5635343618513324, + "grad_norm": 1.796377760047483, + "learning_rate": 4.217379020648135e-06, + "loss": 0.392, + "step": 4018 + }, + { + "epoch": 0.5636746143057504, + "grad_norm": 1.6747253221218918, + "learning_rate": 4.2151358468616675e-06, + "loss": 0.3612, + "step": 4019 + }, + { + "epoch": 0.5638148667601683, + "grad_norm": 2.7354067354586107, + "learning_rate": 4.212892835026432e-06, + "loss": 0.3715, + "step": 4020 + }, + { + "epoch": 0.5639551192145863, + "grad_norm": 5.220324067619909, + "learning_rate": 4.2106499856052604e-06, + "loss": 0.3983, + "step": 4021 + }, + { + "epoch": 0.5640953716690043, + "grad_norm": 1.9543880193585925, + "learning_rate": 4.2084072990609505e-06, + "loss": 0.3922, + "step": 4022 + }, + { + "epoch": 0.5642356241234222, + "grad_norm": 2.089358949134604, + "learning_rate": 4.206164775856265e-06, + "loss": 0.3637, + "step": 4023 + }, + { + "epoch": 0.5643758765778402, + "grad_norm": 3.107859640018066, + "learning_rate": 4.2039224164539306e-06, + "loss": 0.3743, + "step": 4024 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 2.259329104979652, + "learning_rate": 4.201680221316643e-06, + "loss": 0.359, + "step": 4025 + }, + { + "epoch": 0.564656381486676, + "grad_norm": 2.7861196341806282, + "learning_rate": 4.1994381909070645e-06, + "loss": 0.3375, + "step": 4026 + }, + { + "epoch": 0.5647966339410939, + "grad_norm": 1.9161412997933962, + "learning_rate": 4.1971963256878224e-06, + "loss": 0.3395, + "step": 4027 + }, + { + "epoch": 0.5649368863955119, + "grad_norm": 1.8622376660437714, + "learning_rate": 4.194954626121511e-06, + "loss": 0.3879, + "step": 4028 + }, + { + "epoch": 0.5650771388499298, + "grad_norm": 1.9647463097499447, + "learning_rate": 4.192713092670687e-06, + "loss": 0.3, + "step": 4029 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 2.1680854284832543, + "learning_rate": 4.190471725797875e-06, + "loss": 0.4018, + "step": 4030 + }, + { + "epoch": 0.5653576437587657, + "grad_norm": 2.4319991617264054, + "learning_rate": 4.188230525965567e-06, + "loss": 0.3962, + "step": 4031 + }, + { + "epoch": 0.5654978962131837, + "grad_norm": 2.7505121621128428, + "learning_rate": 4.185989493636219e-06, + "loss": 0.3428, + "step": 4032 + }, + { + "epoch": 0.5656381486676016, + "grad_norm": 2.3025213892105794, + "learning_rate": 4.183748629272254e-06, + "loss": 0.3309, + "step": 4033 + }, + { + "epoch": 0.5657784011220196, + "grad_norm": 2.0003605755972007, + "learning_rate": 4.181507933336054e-06, + "loss": 0.3762, + "step": 4034 + }, + { + "epoch": 0.5659186535764376, + "grad_norm": 2.7179611640159838, + "learning_rate": 4.179267406289974e-06, + "loss": 0.3772, + "step": 4035 + }, + { + "epoch": 0.5660589060308555, + "grad_norm": 2.1792011333630676, + "learning_rate": 4.17702704859633e-06, + "loss": 0.3388, + "step": 4036 + }, + { + "epoch": 0.5661991584852735, + "grad_norm": 2.1503614654088468, + "learning_rate": 4.174786860717408e-06, + "loss": 0.3605, + "step": 4037 + }, + { + "epoch": 0.5663394109396914, + "grad_norm": 1.9815515571488207, + "learning_rate": 4.172546843115449e-06, + "loss": 0.3638, + "step": 4038 + }, + { + "epoch": 0.5664796633941094, + "grad_norm": 2.500379342862865, + "learning_rate": 4.170306996252669e-06, + "loss": 0.3447, + "step": 4039 + }, + { + "epoch": 0.5666199158485273, + "grad_norm": 1.7325283908031537, + "learning_rate": 4.1680673205912425e-06, + "loss": 0.3438, + "step": 4040 + }, + { + "epoch": 0.5667601683029453, + "grad_norm": 2.233380182923048, + "learning_rate": 4.165827816593312e-06, + "loss": 0.3471, + "step": 4041 + }, + { + "epoch": 0.5669004207573632, + "grad_norm": 8.051422789516575, + "learning_rate": 4.163588484720984e-06, + "loss": 0.3638, + "step": 4042 + }, + { + "epoch": 0.5670406732117812, + "grad_norm": 2.3848292086885925, + "learning_rate": 4.161349325436328e-06, + "loss": 0.3537, + "step": 4043 + }, + { + "epoch": 0.5671809256661992, + "grad_norm": 2.030663625737846, + "learning_rate": 4.159110339201381e-06, + "loss": 0.3039, + "step": 4044 + }, + { + "epoch": 0.5673211781206171, + "grad_norm": 2.183869864761213, + "learning_rate": 4.156871526478139e-06, + "loss": 0.3543, + "step": 4045 + }, + { + "epoch": 0.5674614305750351, + "grad_norm": 4.95938898383839, + "learning_rate": 4.1546328877285665e-06, + "loss": 0.3324, + "step": 4046 + }, + { + "epoch": 0.567601683029453, + "grad_norm": 2.3391394470646083, + "learning_rate": 4.152394423414593e-06, + "loss": 0.3408, + "step": 4047 + }, + { + "epoch": 0.567741935483871, + "grad_norm": 1.9948738685450287, + "learning_rate": 4.1501561339981086e-06, + "loss": 0.3632, + "step": 4048 + }, + { + "epoch": 0.5678821879382889, + "grad_norm": 1.9703244165820992, + "learning_rate": 4.147918019940967e-06, + "loss": 0.3855, + "step": 4049 + }, + { + "epoch": 0.5680224403927069, + "grad_norm": 2.760256022451316, + "learning_rate": 4.145680081704989e-06, + "loss": 0.4089, + "step": 4050 + }, + { + "epoch": 0.5681626928471248, + "grad_norm": 2.311451442227457, + "learning_rate": 4.143442319751958e-06, + "loss": 0.3402, + "step": 4051 + }, + { + "epoch": 0.5683029453015428, + "grad_norm": 2.402487451240722, + "learning_rate": 4.14120473454362e-06, + "loss": 0.3526, + "step": 4052 + }, + { + "epoch": 0.5684431977559607, + "grad_norm": 2.042516596340329, + "learning_rate": 4.138967326541685e-06, + "loss": 0.3308, + "step": 4053 + }, + { + "epoch": 0.5685834502103787, + "grad_norm": 2.4331368741545063, + "learning_rate": 4.136730096207827e-06, + "loss": 0.3503, + "step": 4054 + }, + { + "epoch": 0.5687237026647967, + "grad_norm": 2.192411848186068, + "learning_rate": 4.134493044003681e-06, + "loss": 0.349, + "step": 4055 + }, + { + "epoch": 0.5688639551192146, + "grad_norm": 2.7949759573405664, + "learning_rate": 4.132256170390848e-06, + "loss": 0.3555, + "step": 4056 + }, + { + "epoch": 0.5690042075736326, + "grad_norm": 2.2904647333927173, + "learning_rate": 4.1300194758308935e-06, + "loss": 0.4011, + "step": 4057 + }, + { + "epoch": 0.5691444600280505, + "grad_norm": 4.207944921314374, + "learning_rate": 4.127782960785344e-06, + "loss": 0.3558, + "step": 4058 + }, + { + "epoch": 0.5692847124824685, + "grad_norm": 2.1428089000175983, + "learning_rate": 4.125546625715683e-06, + "loss": 0.368, + "step": 4059 + }, + { + "epoch": 0.5694249649368864, + "grad_norm": 1.7858487378320997, + "learning_rate": 4.123310471083368e-06, + "loss": 0.3317, + "step": 4060 + }, + { + "epoch": 0.5695652173913044, + "grad_norm": 2.350428310889083, + "learning_rate": 4.121074497349811e-06, + "loss": 0.3393, + "step": 4061 + }, + { + "epoch": 0.5697054698457223, + "grad_norm": 3.1231477970423382, + "learning_rate": 4.118838704976392e-06, + "loss": 0.3643, + "step": 4062 + }, + { + "epoch": 0.5698457223001403, + "grad_norm": 2.4124420554362342, + "learning_rate": 4.116603094424449e-06, + "loss": 0.3804, + "step": 4063 + }, + { + "epoch": 0.5699859747545583, + "grad_norm": 1.7870402671918004, + "learning_rate": 4.1143676661552876e-06, + "loss": 0.38, + "step": 4064 + }, + { + "epoch": 0.5701262272089762, + "grad_norm": 2.6742602849660737, + "learning_rate": 4.112132420630169e-06, + "loss": 0.325, + "step": 4065 + }, + { + "epoch": 0.570266479663394, + "grad_norm": 2.3125673232717587, + "learning_rate": 4.1098973583103226e-06, + "loss": 0.3332, + "step": 4066 + }, + { + "epoch": 0.570406732117812, + "grad_norm": 2.7874232157579404, + "learning_rate": 4.107662479656937e-06, + "loss": 0.3422, + "step": 4067 + }, + { + "epoch": 0.57054698457223, + "grad_norm": 2.1500779426108623, + "learning_rate": 4.105427785131165e-06, + "loss": 0.3659, + "step": 4068 + }, + { + "epoch": 0.5706872370266479, + "grad_norm": 2.408367760840079, + "learning_rate": 4.10319327519412e-06, + "loss": 0.3454, + "step": 4069 + }, + { + "epoch": 0.5708274894810659, + "grad_norm": 2.678832693528013, + "learning_rate": 4.1009589503068755e-06, + "loss": 0.3558, + "step": 4070 + }, + { + "epoch": 0.5709677419354838, + "grad_norm": 1.825368384191693, + "learning_rate": 4.098724810930472e-06, + "loss": 0.3035, + "step": 4071 + }, + { + "epoch": 0.5711079943899018, + "grad_norm": 2.283628817278272, + "learning_rate": 4.096490857525906e-06, + "loss": 0.3396, + "step": 4072 + }, + { + "epoch": 0.5712482468443197, + "grad_norm": 1.9964474478515783, + "learning_rate": 4.094257090554139e-06, + "loss": 0.3428, + "step": 4073 + }, + { + "epoch": 0.5713884992987377, + "grad_norm": 1.807067622681657, + "learning_rate": 4.092023510476095e-06, + "loss": 0.344, + "step": 4074 + }, + { + "epoch": 0.5715287517531557, + "grad_norm": 2.518016839108204, + "learning_rate": 4.089790117752655e-06, + "loss": 0.3669, + "step": 4075 + }, + { + "epoch": 0.5716690042075736, + "grad_norm": 2.0935175752450963, + "learning_rate": 4.087556912844664e-06, + "loss": 0.3312, + "step": 4076 + }, + { + "epoch": 0.5718092566619916, + "grad_norm": 2.089938894227294, + "learning_rate": 4.08532389621293e-06, + "loss": 0.3378, + "step": 4077 + }, + { + "epoch": 0.5719495091164095, + "grad_norm": 1.8908652791327984, + "learning_rate": 4.08309106831822e-06, + "loss": 0.3357, + "step": 4078 + }, + { + "epoch": 0.5720897615708275, + "grad_norm": 3.4453951248330204, + "learning_rate": 4.080858429621262e-06, + "loss": 0.3927, + "step": 4079 + }, + { + "epoch": 0.5722300140252454, + "grad_norm": 1.8776968317185028, + "learning_rate": 4.078625980582746e-06, + "loss": 0.3829, + "step": 4080 + }, + { + "epoch": 0.5723702664796634, + "grad_norm": 1.9442539015764289, + "learning_rate": 4.076393721663321e-06, + "loss": 0.3639, + "step": 4081 + }, + { + "epoch": 0.5725105189340813, + "grad_norm": 1.8928874326991776, + "learning_rate": 4.0741616533235975e-06, + "loss": 0.3402, + "step": 4082 + }, + { + "epoch": 0.5726507713884993, + "grad_norm": 2.9785241098752095, + "learning_rate": 4.071929776024149e-06, + "loss": 0.3626, + "step": 4083 + }, + { + "epoch": 0.5727910238429172, + "grad_norm": 2.0957183695129373, + "learning_rate": 4.069698090225508e-06, + "loss": 0.3352, + "step": 4084 + }, + { + "epoch": 0.5729312762973352, + "grad_norm": 2.9927657641804326, + "learning_rate": 4.067466596388166e-06, + "loss": 0.3503, + "step": 4085 + }, + { + "epoch": 0.5730715287517532, + "grad_norm": 2.314194032999261, + "learning_rate": 4.065235294972577e-06, + "loss": 0.3065, + "step": 4086 + }, + { + "epoch": 0.5732117812061711, + "grad_norm": 2.37091596713029, + "learning_rate": 4.063004186439153e-06, + "loss": 0.2762, + "step": 4087 + }, + { + "epoch": 0.5733520336605891, + "grad_norm": 2.537302951020786, + "learning_rate": 4.06077327124827e-06, + "loss": 0.3668, + "step": 4088 + }, + { + "epoch": 0.573492286115007, + "grad_norm": 3.281149871751669, + "learning_rate": 4.0585425498602605e-06, + "loss": 0.3309, + "step": 4089 + }, + { + "epoch": 0.573632538569425, + "grad_norm": 1.9202906583284154, + "learning_rate": 4.056312022735417e-06, + "loss": 0.3415, + "step": 4090 + }, + { + "epoch": 0.5737727910238429, + "grad_norm": 2.0862442533986445, + "learning_rate": 4.054081690333995e-06, + "loss": 0.3544, + "step": 4091 + }, + { + "epoch": 0.5739130434782609, + "grad_norm": 1.964025914982669, + "learning_rate": 4.051851553116208e-06, + "loss": 0.3419, + "step": 4092 + }, + { + "epoch": 0.5740532959326788, + "grad_norm": 2.2269258972227144, + "learning_rate": 4.049621611542228e-06, + "loss": 0.3648, + "step": 4093 + }, + { + "epoch": 0.5741935483870968, + "grad_norm": 2.4151050081978958, + "learning_rate": 4.04739186607219e-06, + "loss": 0.3309, + "step": 4094 + }, + { + "epoch": 0.5743338008415148, + "grad_norm": 2.4053281529251667, + "learning_rate": 4.045162317166184e-06, + "loss": 0.4001, + "step": 4095 + }, + { + "epoch": 0.5744740532959327, + "grad_norm": 1.9920034884571645, + "learning_rate": 4.0429329652842625e-06, + "loss": 0.3321, + "step": 4096 + }, + { + "epoch": 0.5746143057503507, + "grad_norm": 3.5364054627696375, + "learning_rate": 4.040703810886437e-06, + "loss": 0.3325, + "step": 4097 + }, + { + "epoch": 0.5747545582047686, + "grad_norm": 1.6961010623951722, + "learning_rate": 4.038474854432679e-06, + "loss": 0.3669, + "step": 4098 + }, + { + "epoch": 0.5748948106591866, + "grad_norm": 2.2859871160789926, + "learning_rate": 4.036246096382916e-06, + "loss": 0.329, + "step": 4099 + }, + { + "epoch": 0.5750350631136045, + "grad_norm": 2.257150519980587, + "learning_rate": 4.03401753719704e-06, + "loss": 0.4075, + "step": 4100 + }, + { + "epoch": 0.5751753155680225, + "grad_norm": 2.83911022593347, + "learning_rate": 4.031789177334895e-06, + "loss": 0.321, + "step": 4101 + }, + { + "epoch": 0.5753155680224404, + "grad_norm": 2.415466295484493, + "learning_rate": 4.029561017256288e-06, + "loss": 0.3603, + "step": 4102 + }, + { + "epoch": 0.5754558204768584, + "grad_norm": 2.6474540932685415, + "learning_rate": 4.027333057420985e-06, + "loss": 0.3543, + "step": 4103 + }, + { + "epoch": 0.5755960729312763, + "grad_norm": 2.710192487292419, + "learning_rate": 4.0251052982887105e-06, + "loss": 0.3019, + "step": 4104 + }, + { + "epoch": 0.5757363253856943, + "grad_norm": 1.9153099510411178, + "learning_rate": 4.022877740319147e-06, + "loss": 0.3537, + "step": 4105 + }, + { + "epoch": 0.5758765778401121, + "grad_norm": 2.413161915316689, + "learning_rate": 4.0206503839719335e-06, + "loss": 0.3565, + "step": 4106 + }, + { + "epoch": 0.5760168302945301, + "grad_norm": 2.1596803882577276, + "learning_rate": 4.018423229706672e-06, + "loss": 0.3405, + "step": 4107 + }, + { + "epoch": 0.5761570827489481, + "grad_norm": 3.2498479954312023, + "learning_rate": 4.016196277982919e-06, + "loss": 0.4166, + "step": 4108 + }, + { + "epoch": 0.576297335203366, + "grad_norm": 5.20678270781313, + "learning_rate": 4.013969529260191e-06, + "loss": 0.3417, + "step": 4109 + }, + { + "epoch": 0.576437587657784, + "grad_norm": 2.3844484810361957, + "learning_rate": 4.011742983997961e-06, + "loss": 0.3696, + "step": 4110 + }, + { + "epoch": 0.5765778401122019, + "grad_norm": 4.76942967362148, + "learning_rate": 4.009516642655662e-06, + "loss": 0.369, + "step": 4111 + }, + { + "epoch": 0.5767180925666199, + "grad_norm": 1.697423789289751, + "learning_rate": 4.007290505692684e-06, + "loss": 0.3553, + "step": 4112 + }, + { + "epoch": 0.5768583450210378, + "grad_norm": 3.95416523511651, + "learning_rate": 4.0050645735683745e-06, + "loss": 0.3909, + "step": 4113 + }, + { + "epoch": 0.5769985974754558, + "grad_norm": 2.139558512522974, + "learning_rate": 4.002838846742039e-06, + "loss": 0.3137, + "step": 4114 + }, + { + "epoch": 0.5771388499298737, + "grad_norm": 2.137558236918246, + "learning_rate": 4.000613325672942e-06, + "loss": 0.3775, + "step": 4115 + }, + { + "epoch": 0.5772791023842917, + "grad_norm": 2.6642570536433197, + "learning_rate": 3.998388010820301e-06, + "loss": 0.3336, + "step": 4116 + }, + { + "epoch": 0.5774193548387097, + "grad_norm": 2.0161749510919544, + "learning_rate": 3.996162902643296e-06, + "loss": 0.3749, + "step": 4117 + }, + { + "epoch": 0.5775596072931276, + "grad_norm": 2.49786867843826, + "learning_rate": 3.993938001601064e-06, + "loss": 0.3729, + "step": 4118 + }, + { + "epoch": 0.5776998597475456, + "grad_norm": 2.08646720418379, + "learning_rate": 3.991713308152696e-06, + "loss": 0.3339, + "step": 4119 + }, + { + "epoch": 0.5778401122019635, + "grad_norm": 2.3520681654984763, + "learning_rate": 3.989488822757244e-06, + "loss": 0.3286, + "step": 4120 + }, + { + "epoch": 0.5779803646563815, + "grad_norm": 1.8700974904111196, + "learning_rate": 3.987264545873712e-06, + "loss": 0.3962, + "step": 4121 + }, + { + "epoch": 0.5781206171107994, + "grad_norm": 2.6825891027379534, + "learning_rate": 3.985040477961066e-06, + "loss": 0.3333, + "step": 4122 + }, + { + "epoch": 0.5782608695652174, + "grad_norm": 2.5669499348304448, + "learning_rate": 3.982816619478225e-06, + "loss": 0.361, + "step": 4123 + }, + { + "epoch": 0.5784011220196353, + "grad_norm": 3.6099286335999854, + "learning_rate": 3.980592970884069e-06, + "loss": 0.3846, + "step": 4124 + }, + { + "epoch": 0.5785413744740533, + "grad_norm": 2.451039490791346, + "learning_rate": 3.97836953263743e-06, + "loss": 0.3346, + "step": 4125 + }, + { + "epoch": 0.5786816269284712, + "grad_norm": 2.0278558396710644, + "learning_rate": 3.976146305197102e-06, + "loss": 0.3454, + "step": 4126 + }, + { + "epoch": 0.5788218793828892, + "grad_norm": 2.512018307491578, + "learning_rate": 3.973923289021829e-06, + "loss": 0.3333, + "step": 4127 + }, + { + "epoch": 0.5789621318373072, + "grad_norm": 3.0108364212973355, + "learning_rate": 3.9717004845703175e-06, + "loss": 0.384, + "step": 4128 + }, + { + "epoch": 0.5791023842917251, + "grad_norm": 1.7171409704761555, + "learning_rate": 3.969477892301227e-06, + "loss": 0.3382, + "step": 4129 + }, + { + "epoch": 0.5792426367461431, + "grad_norm": 1.9232288337275685, + "learning_rate": 3.967255512673174e-06, + "loss": 0.3725, + "step": 4130 + }, + { + "epoch": 0.579382889200561, + "grad_norm": 2.1859354145948102, + "learning_rate": 3.96503334614473e-06, + "loss": 0.3182, + "step": 4131 + }, + { + "epoch": 0.579523141654979, + "grad_norm": 1.6894634268562354, + "learning_rate": 3.962811393174423e-06, + "loss": 0.3196, + "step": 4132 + }, + { + "epoch": 0.5796633941093969, + "grad_norm": 2.491464826984387, + "learning_rate": 3.96058965422074e-06, + "loss": 0.3203, + "step": 4133 + }, + { + "epoch": 0.5798036465638149, + "grad_norm": 1.875576312187306, + "learning_rate": 3.9583681297421194e-06, + "loss": 0.3811, + "step": 4134 + }, + { + "epoch": 0.5799438990182328, + "grad_norm": 2.800764480178831, + "learning_rate": 3.956146820196959e-06, + "loss": 0.3523, + "step": 4135 + }, + { + "epoch": 0.5800841514726508, + "grad_norm": 1.9469945904539534, + "learning_rate": 3.9539257260436085e-06, + "loss": 0.3401, + "step": 4136 + }, + { + "epoch": 0.5802244039270688, + "grad_norm": 1.7060160466028682, + "learning_rate": 3.9517048477403755e-06, + "loss": 0.3508, + "step": 4137 + }, + { + "epoch": 0.5803646563814867, + "grad_norm": 1.7768616410221871, + "learning_rate": 3.949484185745523e-06, + "loss": 0.3836, + "step": 4138 + }, + { + "epoch": 0.5805049088359047, + "grad_norm": 2.009590282150025, + "learning_rate": 3.94726374051727e-06, + "loss": 0.3463, + "step": 4139 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 2.384161995389065, + "learning_rate": 3.94504351251379e-06, + "loss": 0.3441, + "step": 4140 + }, + { + "epoch": 0.5807854137447406, + "grad_norm": 2.250092292588194, + "learning_rate": 3.9428235021932104e-06, + "loss": 0.3215, + "step": 4141 + }, + { + "epoch": 0.5809256661991585, + "grad_norm": 2.057154461756021, + "learning_rate": 3.940603710013615e-06, + "loss": 0.3749, + "step": 4142 + }, + { + "epoch": 0.5810659186535765, + "grad_norm": 2.2480537686213267, + "learning_rate": 3.9383841364330425e-06, + "loss": 0.3554, + "step": 4143 + }, + { + "epoch": 0.5812061711079944, + "grad_norm": 2.984949186521969, + "learning_rate": 3.936164781909485e-06, + "loss": 0.3668, + "step": 4144 + }, + { + "epoch": 0.5813464235624124, + "grad_norm": 1.8679294787865324, + "learning_rate": 3.933945646900893e-06, + "loss": 0.3709, + "step": 4145 + }, + { + "epoch": 0.5814866760168302, + "grad_norm": 1.750664380525044, + "learning_rate": 3.93172673186517e-06, + "loss": 0.3513, + "step": 4146 + }, + { + "epoch": 0.5816269284712482, + "grad_norm": 2.297870238715008, + "learning_rate": 3.92950803726017e-06, + "loss": 0.3583, + "step": 4147 + }, + { + "epoch": 0.5817671809256661, + "grad_norm": 1.8358281529105722, + "learning_rate": 3.927289563543709e-06, + "loss": 0.3554, + "step": 4148 + }, + { + "epoch": 0.5819074333800841, + "grad_norm": 2.239116413029075, + "learning_rate": 3.925071311173551e-06, + "loss": 0.3297, + "step": 4149 + }, + { + "epoch": 0.5820476858345021, + "grad_norm": 2.5718902105389176, + "learning_rate": 3.9228532806074184e-06, + "loss": 0.3594, + "step": 4150 + }, + { + "epoch": 0.58218793828892, + "grad_norm": 2.0238983925506586, + "learning_rate": 3.920635472302986e-06, + "loss": 0.387, + "step": 4151 + }, + { + "epoch": 0.582328190743338, + "grad_norm": 1.830865419905645, + "learning_rate": 3.918417886717884e-06, + "loss": 0.3827, + "step": 4152 + }, + { + "epoch": 0.5824684431977559, + "grad_norm": 1.8999162730792385, + "learning_rate": 3.916200524309693e-06, + "loss": 0.3711, + "step": 4153 + }, + { + "epoch": 0.5826086956521739, + "grad_norm": 1.771343322319912, + "learning_rate": 3.913983385535951e-06, + "loss": 0.3397, + "step": 4154 + }, + { + "epoch": 0.5827489481065918, + "grad_norm": 2.2062049061432676, + "learning_rate": 3.911766470854152e-06, + "loss": 0.4011, + "step": 4155 + }, + { + "epoch": 0.5828892005610098, + "grad_norm": 2.2108300927698354, + "learning_rate": 3.9095497807217375e-06, + "loss": 0.4168, + "step": 4156 + }, + { + "epoch": 0.5830294530154277, + "grad_norm": 2.0882882006437122, + "learning_rate": 3.907333315596107e-06, + "loss": 0.3646, + "step": 4157 + }, + { + "epoch": 0.5831697054698457, + "grad_norm": 2.041904850774565, + "learning_rate": 3.905117075934613e-06, + "loss": 0.3221, + "step": 4158 + }, + { + "epoch": 0.5833099579242637, + "grad_norm": 1.7812451145038197, + "learning_rate": 3.902901062194561e-06, + "loss": 0.3229, + "step": 4159 + }, + { + "epoch": 0.5834502103786816, + "grad_norm": 2.1272066664052933, + "learning_rate": 3.900685274833211e-06, + "loss": 0.3354, + "step": 4160 + }, + { + "epoch": 0.5835904628330996, + "grad_norm": 1.7319853532669596, + "learning_rate": 3.898469714307773e-06, + "loss": 0.3773, + "step": 4161 + }, + { + "epoch": 0.5837307152875175, + "grad_norm": 2.294328692453924, + "learning_rate": 3.896254381075416e-06, + "loss": 0.3692, + "step": 4162 + }, + { + "epoch": 0.5838709677419355, + "grad_norm": 2.3797379466449975, + "learning_rate": 3.894039275593253e-06, + "loss": 0.3706, + "step": 4163 + }, + { + "epoch": 0.5840112201963534, + "grad_norm": 2.2372456496602755, + "learning_rate": 3.891824398318359e-06, + "loss": 0.3712, + "step": 4164 + }, + { + "epoch": 0.5841514726507714, + "grad_norm": 2.671888295636921, + "learning_rate": 3.889609749707759e-06, + "loss": 0.3605, + "step": 4165 + }, + { + "epoch": 0.5842917251051893, + "grad_norm": 2.101199888123125, + "learning_rate": 3.887395330218429e-06, + "loss": 0.4009, + "step": 4166 + }, + { + "epoch": 0.5844319775596073, + "grad_norm": 2.753203173771317, + "learning_rate": 3.8851811403073e-06, + "loss": 0.3443, + "step": 4167 + }, + { + "epoch": 0.5845722300140253, + "grad_norm": 2.624668191193143, + "learning_rate": 3.882967180431253e-06, + "loss": 0.3406, + "step": 4168 + }, + { + "epoch": 0.5847124824684432, + "grad_norm": 2.0941259701356043, + "learning_rate": 3.880753451047124e-06, + "loss": 0.3835, + "step": 4169 + }, + { + "epoch": 0.5848527349228612, + "grad_norm": 2.07792444623775, + "learning_rate": 3.8785399526117e-06, + "loss": 0.4415, + "step": 4170 + }, + { + "epoch": 0.5849929873772791, + "grad_norm": 2.7015436210436, + "learning_rate": 3.876326685581724e-06, + "loss": 0.3334, + "step": 4171 + }, + { + "epoch": 0.5851332398316971, + "grad_norm": 2.4641662730617764, + "learning_rate": 3.874113650413884e-06, + "loss": 0.383, + "step": 4172 + }, + { + "epoch": 0.585273492286115, + "grad_norm": 2.2109716837366777, + "learning_rate": 3.8719008475648265e-06, + "loss": 0.3734, + "step": 4173 + }, + { + "epoch": 0.585413744740533, + "grad_norm": 2.9551476711805598, + "learning_rate": 3.869688277491148e-06, + "loss": 0.3484, + "step": 4174 + }, + { + "epoch": 0.5855539971949509, + "grad_norm": 1.9807678839441782, + "learning_rate": 3.867475940649396e-06, + "loss": 0.358, + "step": 4175 + }, + { + "epoch": 0.5856942496493689, + "grad_norm": 1.93026156515909, + "learning_rate": 3.865263837496072e-06, + "loss": 0.3752, + "step": 4176 + }, + { + "epoch": 0.5858345021037868, + "grad_norm": 2.6938512406939856, + "learning_rate": 3.8630519684876264e-06, + "loss": 0.3334, + "step": 4177 + }, + { + "epoch": 0.5859747545582048, + "grad_norm": 1.9554531412267513, + "learning_rate": 3.860840334080463e-06, + "loss": 0.348, + "step": 4178 + }, + { + "epoch": 0.5861150070126228, + "grad_norm": 1.9979749202357044, + "learning_rate": 3.858628934730939e-06, + "loss": 0.3604, + "step": 4179 + }, + { + "epoch": 0.5862552594670407, + "grad_norm": 2.492117566888369, + "learning_rate": 3.8564177708953595e-06, + "loss": 0.3474, + "step": 4180 + }, + { + "epoch": 0.5863955119214587, + "grad_norm": 1.7499045618463536, + "learning_rate": 3.854206843029985e-06, + "loss": 0.3244, + "step": 4181 + }, + { + "epoch": 0.5865357643758766, + "grad_norm": 2.8248339308816983, + "learning_rate": 3.851996151591022e-06, + "loss": 0.3953, + "step": 4182 + }, + { + "epoch": 0.5866760168302946, + "grad_norm": 2.0510990732146728, + "learning_rate": 3.849785697034634e-06, + "loss": 0.3516, + "step": 4183 + }, + { + "epoch": 0.5868162692847125, + "grad_norm": 2.5842645771463886, + "learning_rate": 3.847575479816929e-06, + "loss": 0.3418, + "step": 4184 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 2.3155732224991037, + "learning_rate": 3.845365500393974e-06, + "loss": 0.3506, + "step": 4185 + }, + { + "epoch": 0.5870967741935483, + "grad_norm": 2.9519515355171384, + "learning_rate": 3.84315575922178e-06, + "loss": 0.3698, + "step": 4186 + }, + { + "epoch": 0.5872370266479663, + "grad_norm": 2.4845983057009717, + "learning_rate": 3.840946256756314e-06, + "loss": 0.4092, + "step": 4187 + }, + { + "epoch": 0.5873772791023842, + "grad_norm": 3.67948339588697, + "learning_rate": 3.838736993453489e-06, + "loss": 0.343, + "step": 4188 + }, + { + "epoch": 0.5875175315568022, + "grad_norm": 2.7732259320437476, + "learning_rate": 3.836527969769172e-06, + "loss": 0.3358, + "step": 4189 + }, + { + "epoch": 0.5876577840112202, + "grad_norm": 2.899068511795467, + "learning_rate": 3.834319186159179e-06, + "loss": 0.3636, + "step": 4190 + }, + { + "epoch": 0.5877980364656381, + "grad_norm": 1.931005051272881, + "learning_rate": 3.83211064307928e-06, + "loss": 0.3908, + "step": 4191 + }, + { + "epoch": 0.5879382889200561, + "grad_norm": 2.257105627580685, + "learning_rate": 3.829902340985189e-06, + "loss": 0.3682, + "step": 4192 + }, + { + "epoch": 0.588078541374474, + "grad_norm": 2.693213488616094, + "learning_rate": 3.827694280332575e-06, + "loss": 0.3605, + "step": 4193 + }, + { + "epoch": 0.588218793828892, + "grad_norm": 2.5910509152814916, + "learning_rate": 3.8254864615770556e-06, + "loss": 0.3623, + "step": 4194 + }, + { + "epoch": 0.5883590462833099, + "grad_norm": 3.725701126577724, + "learning_rate": 3.8232788851742e-06, + "loss": 0.3629, + "step": 4195 + }, + { + "epoch": 0.5884992987377279, + "grad_norm": 1.8032178963940022, + "learning_rate": 3.821071551579525e-06, + "loss": 0.3515, + "step": 4196 + }, + { + "epoch": 0.5886395511921458, + "grad_norm": 3.49545879032298, + "learning_rate": 3.818864461248498e-06, + "loss": 0.382, + "step": 4197 + }, + { + "epoch": 0.5887798036465638, + "grad_norm": 1.9855891440674973, + "learning_rate": 3.816657614636538e-06, + "loss": 0.3487, + "step": 4198 + }, + { + "epoch": 0.5889200561009817, + "grad_norm": 5.877248455523978, + "learning_rate": 3.8144510121990106e-06, + "loss": 0.3456, + "step": 4199 + }, + { + "epoch": 0.5890603085553997, + "grad_norm": 2.6702061845698797, + "learning_rate": 3.812244654391235e-06, + "loss": 0.3363, + "step": 4200 + }, + { + "epoch": 0.5892005610098177, + "grad_norm": 1.8190881106787657, + "learning_rate": 3.810038541668477e-06, + "loss": 0.3409, + "step": 4201 + }, + { + "epoch": 0.5893408134642356, + "grad_norm": 3.192438525964976, + "learning_rate": 3.8078326744859516e-06, + "loss": 0.3674, + "step": 4202 + }, + { + "epoch": 0.5894810659186536, + "grad_norm": 2.1368540942409857, + "learning_rate": 3.805627053298825e-06, + "loss": 0.3639, + "step": 4203 + }, + { + "epoch": 0.5896213183730715, + "grad_norm": 2.170118101571448, + "learning_rate": 3.803421678562213e-06, + "loss": 0.3482, + "step": 4204 + }, + { + "epoch": 0.5897615708274895, + "grad_norm": 2.154160191755568, + "learning_rate": 3.8012165507311756e-06, + "loss": 0.3694, + "step": 4205 + }, + { + "epoch": 0.5899018232819074, + "grad_norm": 1.9468775202702702, + "learning_rate": 3.799011670260727e-06, + "loss": 0.3535, + "step": 4206 + }, + { + "epoch": 0.5900420757363254, + "grad_norm": 2.7769812925517035, + "learning_rate": 3.7968070376058304e-06, + "loss": 0.341, + "step": 4207 + }, + { + "epoch": 0.5901823281907433, + "grad_norm": 2.408261436876393, + "learning_rate": 3.7946026532213965e-06, + "loss": 0.3423, + "step": 4208 + }, + { + "epoch": 0.5903225806451613, + "grad_norm": 2.3279229402864403, + "learning_rate": 3.792398517562282e-06, + "loss": 0.3517, + "step": 4209 + }, + { + "epoch": 0.5904628330995793, + "grad_norm": 2.4121815288423836, + "learning_rate": 3.7901946310832966e-06, + "loss": 0.3555, + "step": 4210 + }, + { + "epoch": 0.5906030855539972, + "grad_norm": 2.052644707693104, + "learning_rate": 3.7879909942391963e-06, + "loss": 0.3513, + "step": 4211 + }, + { + "epoch": 0.5907433380084152, + "grad_norm": 2.631633369987309, + "learning_rate": 3.7857876074846878e-06, + "loss": 0.2984, + "step": 4212 + }, + { + "epoch": 0.5908835904628331, + "grad_norm": 2.072797557789031, + "learning_rate": 3.7835844712744228e-06, + "loss": 0.3642, + "step": 4213 + }, + { + "epoch": 0.5910238429172511, + "grad_norm": 2.409708980068304, + "learning_rate": 3.7813815860630034e-06, + "loss": 0.3531, + "step": 4214 + }, + { + "epoch": 0.591164095371669, + "grad_norm": 2.698151881349362, + "learning_rate": 3.7791789523049793e-06, + "loss": 0.3653, + "step": 4215 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 2.2721115387952597, + "learning_rate": 3.7769765704548494e-06, + "loss": 0.3591, + "step": 4216 + }, + { + "epoch": 0.5914446002805049, + "grad_norm": 1.8682055391452301, + "learning_rate": 3.7747744409670608e-06, + "loss": 0.3683, + "step": 4217 + }, + { + "epoch": 0.5915848527349229, + "grad_norm": 2.0743420810276967, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.3553, + "step": 4218 + }, + { + "epoch": 0.5917251051893408, + "grad_norm": 1.9468333722068214, + "learning_rate": 3.770370940896025e-06, + "loss": 0.3344, + "step": 4219 + }, + { + "epoch": 0.5918653576437588, + "grad_norm": 1.8901671808971614, + "learning_rate": 3.76816957122141e-06, + "loss": 0.3315, + "step": 4220 + }, + { + "epoch": 0.5920056100981768, + "grad_norm": 2.5630499086912333, + "learning_rate": 3.765968455726398e-06, + "loss": 0.3568, + "step": 4221 + }, + { + "epoch": 0.5921458625525947, + "grad_norm": 2.6366259749303174, + "learning_rate": 3.7637675948651754e-06, + "loss": 0.356, + "step": 4222 + }, + { + "epoch": 0.5922861150070127, + "grad_norm": 2.0677242582718445, + "learning_rate": 3.7615669890918706e-06, + "loss": 0.3448, + "step": 4223 + }, + { + "epoch": 0.5924263674614306, + "grad_norm": 2.4100187438615825, + "learning_rate": 3.7593666388605654e-06, + "loss": 0.3103, + "step": 4224 + }, + { + "epoch": 0.5925666199158486, + "grad_norm": 2.843861169004524, + "learning_rate": 3.7571665446252886e-06, + "loss": 0.3897, + "step": 4225 + }, + { + "epoch": 0.5927068723702664, + "grad_norm": 2.3019327543099495, + "learning_rate": 3.7549667068400104e-06, + "loss": 0.3735, + "step": 4226 + }, + { + "epoch": 0.5928471248246844, + "grad_norm": 2.275953545744929, + "learning_rate": 3.7527671259586536e-06, + "loss": 0.3011, + "step": 4227 + }, + { + "epoch": 0.5929873772791023, + "grad_norm": 1.7204963135462241, + "learning_rate": 3.7505678024350874e-06, + "loss": 0.3226, + "step": 4228 + }, + { + "epoch": 0.5931276297335203, + "grad_norm": 3.8261648959864885, + "learning_rate": 3.748368736723125e-06, + "loss": 0.3311, + "step": 4229 + }, + { + "epoch": 0.5932678821879382, + "grad_norm": 4.350846236776729, + "learning_rate": 3.746169929276529e-06, + "loss": 0.347, + "step": 4230 + }, + { + "epoch": 0.5934081346423562, + "grad_norm": 2.436453080929339, + "learning_rate": 3.7439713805490087e-06, + "loss": 0.3529, + "step": 4231 + }, + { + "epoch": 0.5935483870967742, + "grad_norm": 3.4349530581358283, + "learning_rate": 3.7417730909942184e-06, + "loss": 0.3745, + "step": 4232 + }, + { + "epoch": 0.5936886395511921, + "grad_norm": 2.330227510535075, + "learning_rate": 3.739575061065761e-06, + "loss": 0.3097, + "step": 4233 + }, + { + "epoch": 0.5938288920056101, + "grad_norm": 1.7743758353365402, + "learning_rate": 3.7373772912171825e-06, + "loss": 0.3535, + "step": 4234 + }, + { + "epoch": 0.593969144460028, + "grad_norm": 2.196454602927952, + "learning_rate": 3.7351797819019788e-06, + "loss": 0.3359, + "step": 4235 + }, + { + "epoch": 0.594109396914446, + "grad_norm": 2.124044203382685, + "learning_rate": 3.73298253357359e-06, + "loss": 0.347, + "step": 4236 + }, + { + "epoch": 0.5942496493688639, + "grad_norm": 2.5641156197484523, + "learning_rate": 3.7307855466854053e-06, + "loss": 0.3407, + "step": 4237 + }, + { + "epoch": 0.5943899018232819, + "grad_norm": 2.887024983228268, + "learning_rate": 3.728588821690754e-06, + "loss": 0.3302, + "step": 4238 + }, + { + "epoch": 0.5945301542776998, + "grad_norm": 2.1395413516889916, + "learning_rate": 3.726392359042917e-06, + "loss": 0.3748, + "step": 4239 + }, + { + "epoch": 0.5946704067321178, + "grad_norm": 2.0811834559527767, + "learning_rate": 3.7241961591951183e-06, + "loss": 0.3446, + "step": 4240 + }, + { + "epoch": 0.5948106591865358, + "grad_norm": 2.0085811108637004, + "learning_rate": 3.722000222600528e-06, + "loss": 0.3766, + "step": 4241 + }, + { + "epoch": 0.5949509116409537, + "grad_norm": 2.721441230415507, + "learning_rate": 3.7198045497122647e-06, + "loss": 0.3372, + "step": 4242 + }, + { + "epoch": 0.5950911640953717, + "grad_norm": 1.9012920127706099, + "learning_rate": 3.717609140983387e-06, + "loss": 0.3537, + "step": 4243 + }, + { + "epoch": 0.5952314165497896, + "grad_norm": 1.9686418533897232, + "learning_rate": 3.7154139968669043e-06, + "loss": 0.3723, + "step": 4244 + }, + { + "epoch": 0.5953716690042076, + "grad_norm": 2.1407567561456284, + "learning_rate": 3.71321911781577e-06, + "loss": 0.3137, + "step": 4245 + }, + { + "epoch": 0.5955119214586255, + "grad_norm": 10.838285712917159, + "learning_rate": 3.7110245042828786e-06, + "loss": 0.3466, + "step": 4246 + }, + { + "epoch": 0.5956521739130435, + "grad_norm": 2.1101207403918814, + "learning_rate": 3.708830156721075e-06, + "loss": 0.3352, + "step": 4247 + }, + { + "epoch": 0.5957924263674614, + "grad_norm": 2.1787994654036233, + "learning_rate": 3.706636075583148e-06, + "loss": 0.3236, + "step": 4248 + }, + { + "epoch": 0.5959326788218794, + "grad_norm": 1.9806009282352337, + "learning_rate": 3.7044422613218322e-06, + "loss": 0.3916, + "step": 4249 + }, + { + "epoch": 0.5960729312762973, + "grad_norm": 2.203474192981284, + "learning_rate": 3.7022487143898022e-06, + "loss": 0.386, + "step": 4250 + }, + { + "epoch": 0.5962131837307153, + "grad_norm": 2.017799092071447, + "learning_rate": 3.700055435239684e-06, + "loss": 0.3743, + "step": 4251 + }, + { + "epoch": 0.5963534361851333, + "grad_norm": 2.0964838655207134, + "learning_rate": 3.697862424324044e-06, + "loss": 0.3432, + "step": 4252 + }, + { + "epoch": 0.5964936886395512, + "grad_norm": 4.940246783744893, + "learning_rate": 3.695669682095397e-06, + "loss": 0.3148, + "step": 4253 + }, + { + "epoch": 0.5966339410939692, + "grad_norm": 1.8992584086286368, + "learning_rate": 3.6934772090061966e-06, + "loss": 0.3891, + "step": 4254 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 3.6758545433463947, + "learning_rate": 3.691285005508847e-06, + "loss": 0.3574, + "step": 4255 + }, + { + "epoch": 0.5969144460028051, + "grad_norm": 1.8013059302706191, + "learning_rate": 3.689093072055692e-06, + "loss": 0.343, + "step": 4256 + }, + { + "epoch": 0.597054698457223, + "grad_norm": 2.0326691585027965, + "learning_rate": 3.686901409099023e-06, + "loss": 0.3338, + "step": 4257 + }, + { + "epoch": 0.597194950911641, + "grad_norm": 2.0916453929087773, + "learning_rate": 3.6847100170910754e-06, + "loss": 0.3508, + "step": 4258 + }, + { + "epoch": 0.5973352033660589, + "grad_norm": 2.7867650418811993, + "learning_rate": 3.682518896484026e-06, + "loss": 0.3417, + "step": 4259 + }, + { + "epoch": 0.5974754558204769, + "grad_norm": 1.9317979893164554, + "learning_rate": 3.6803280477299975e-06, + "loss": 0.2917, + "step": 4260 + }, + { + "epoch": 0.5976157082748949, + "grad_norm": 1.83661720193116, + "learning_rate": 3.6781374712810558e-06, + "loss": 0.3435, + "step": 4261 + }, + { + "epoch": 0.5977559607293128, + "grad_norm": 2.4077858915350214, + "learning_rate": 3.675947167589212e-06, + "loss": 0.3558, + "step": 4262 + }, + { + "epoch": 0.5978962131837308, + "grad_norm": 1.9965214280474397, + "learning_rate": 3.6737571371064205e-06, + "loss": 0.4022, + "step": 4263 + }, + { + "epoch": 0.5980364656381487, + "grad_norm": 2.1240814692144245, + "learning_rate": 3.6715673802845768e-06, + "loss": 0.3042, + "step": 4264 + }, + { + "epoch": 0.5981767180925667, + "grad_norm": 2.241403156755719, + "learning_rate": 3.6693778975755235e-06, + "loss": 0.3563, + "step": 4265 + }, + { + "epoch": 0.5983169705469845, + "grad_norm": 2.4444567855850954, + "learning_rate": 3.667188689431046e-06, + "loss": 0.379, + "step": 4266 + }, + { + "epoch": 0.5984572230014025, + "grad_norm": 3.785584542655182, + "learning_rate": 3.664999756302869e-06, + "loss": 0.3515, + "step": 4267 + }, + { + "epoch": 0.5985974754558204, + "grad_norm": 2.0121158541243847, + "learning_rate": 3.662811098642665e-06, + "loss": 0.362, + "step": 4268 + }, + { + "epoch": 0.5987377279102384, + "grad_norm": 2.9032270234970223, + "learning_rate": 3.66062271690205e-06, + "loss": 0.3429, + "step": 4269 + }, + { + "epoch": 0.5988779803646563, + "grad_norm": 2.3263629159062664, + "learning_rate": 3.658434611532578e-06, + "loss": 0.342, + "step": 4270 + }, + { + "epoch": 0.5990182328190743, + "grad_norm": 2.053752730049088, + "learning_rate": 3.656246782985751e-06, + "loss": 0.3757, + "step": 4271 + }, + { + "epoch": 0.5991584852734922, + "grad_norm": 1.9144579420513643, + "learning_rate": 3.654059231713013e-06, + "loss": 0.3498, + "step": 4272 + }, + { + "epoch": 0.5992987377279102, + "grad_norm": 1.8557776906659542, + "learning_rate": 3.651871958165748e-06, + "loss": 0.3788, + "step": 4273 + }, + { + "epoch": 0.5994389901823282, + "grad_norm": 2.5721592167863605, + "learning_rate": 3.6496849627952875e-06, + "loss": 0.3271, + "step": 4274 + }, + { + "epoch": 0.5995792426367461, + "grad_norm": 2.0748444096704746, + "learning_rate": 3.6474982460528998e-06, + "loss": 0.3515, + "step": 4275 + }, + { + "epoch": 0.5997194950911641, + "grad_norm": 2.7852095803482406, + "learning_rate": 3.6453118083897988e-06, + "loss": 0.3391, + "step": 4276 + }, + { + "epoch": 0.599859747545582, + "grad_norm": 1.948589154469682, + "learning_rate": 3.6431256502571422e-06, + "loss": 0.3513, + "step": 4277 + }, + { + "epoch": 0.6, + "grad_norm": 1.7279774663215346, + "learning_rate": 3.640939772106029e-06, + "loss": 0.3441, + "step": 4278 + }, + { + "epoch": 0.6001402524544179, + "grad_norm": 2.2718346047778355, + "learning_rate": 3.638754174387498e-06, + "loss": 0.3577, + "step": 4279 + }, + { + "epoch": 0.6002805049088359, + "grad_norm": 2.1540052366981146, + "learning_rate": 3.6365688575525315e-06, + "loss": 0.3495, + "step": 4280 + }, + { + "epoch": 0.6004207573632538, + "grad_norm": 2.2821707761554757, + "learning_rate": 3.634383822052057e-06, + "loss": 0.3826, + "step": 4281 + }, + { + "epoch": 0.6005610098176718, + "grad_norm": 1.888680585271086, + "learning_rate": 3.6321990683369384e-06, + "loss": 0.3532, + "step": 4282 + }, + { + "epoch": 0.6007012622720898, + "grad_norm": 3.642471434638938, + "learning_rate": 3.6300145968579876e-06, + "loss": 0.3721, + "step": 4283 + }, + { + "epoch": 0.6008415147265077, + "grad_norm": 1.529307320316739, + "learning_rate": 3.627830408065952e-06, + "loss": 0.343, + "step": 4284 + }, + { + "epoch": 0.6009817671809257, + "grad_norm": 1.85106565352229, + "learning_rate": 3.625646502411525e-06, + "loss": 0.3351, + "step": 4285 + }, + { + "epoch": 0.6011220196353436, + "grad_norm": 1.6993304336501163, + "learning_rate": 3.623462880345341e-06, + "loss": 0.3167, + "step": 4286 + }, + { + "epoch": 0.6012622720897616, + "grad_norm": 2.039517640512858, + "learning_rate": 3.6212795423179754e-06, + "loss": 0.3277, + "step": 4287 + }, + { + "epoch": 0.6014025245441795, + "grad_norm": 1.6008608886211184, + "learning_rate": 3.6190964887799418e-06, + "loss": 0.4017, + "step": 4288 + }, + { + "epoch": 0.6015427769985975, + "grad_norm": 2.0974937106050717, + "learning_rate": 3.6169137201817007e-06, + "loss": 0.3473, + "step": 4289 + }, + { + "epoch": 0.6016830294530154, + "grad_norm": 1.9228467267984521, + "learning_rate": 3.614731236973651e-06, + "loss": 0.363, + "step": 4290 + }, + { + "epoch": 0.6018232819074334, + "grad_norm": 2.0389393736398005, + "learning_rate": 3.6125490396061315e-06, + "loss": 0.38, + "step": 4291 + }, + { + "epoch": 0.6019635343618513, + "grad_norm": 3.0728297506853153, + "learning_rate": 3.610367128529424e-06, + "loss": 0.3738, + "step": 4292 + }, + { + "epoch": 0.6021037868162693, + "grad_norm": 2.132214361024675, + "learning_rate": 3.6081855041937507e-06, + "loss": 0.3394, + "step": 4293 + }, + { + "epoch": 0.6022440392706873, + "grad_norm": 1.4584039761826493, + "learning_rate": 3.606004167049275e-06, + "loss": 0.3151, + "step": 4294 + }, + { + "epoch": 0.6023842917251052, + "grad_norm": 2.0255274050517613, + "learning_rate": 3.6038231175461004e-06, + "loss": 0.3578, + "step": 4295 + }, + { + "epoch": 0.6025245441795232, + "grad_norm": 2.123655679369667, + "learning_rate": 3.6016423561342707e-06, + "loss": 0.384, + "step": 4296 + }, + { + "epoch": 0.6026647966339411, + "grad_norm": 2.101620471254049, + "learning_rate": 3.5994618832637706e-06, + "loss": 0.3924, + "step": 4297 + }, + { + "epoch": 0.6028050490883591, + "grad_norm": 1.5563425427233322, + "learning_rate": 3.597281699384526e-06, + "loss": 0.3246, + "step": 4298 + }, + { + "epoch": 0.602945301542777, + "grad_norm": 1.8189422296092204, + "learning_rate": 3.595101804946404e-06, + "loss": 0.3488, + "step": 4299 + }, + { + "epoch": 0.603085553997195, + "grad_norm": 2.364861179334206, + "learning_rate": 3.5929222003992083e-06, + "loss": 0.3487, + "step": 4300 + }, + { + "epoch": 0.603225806451613, + "grad_norm": 4.028720268970686, + "learning_rate": 3.5907428861926857e-06, + "loss": 0.3425, + "step": 4301 + }, + { + "epoch": 0.6033660589060309, + "grad_norm": 1.988516683073515, + "learning_rate": 3.5885638627765228e-06, + "loss": 0.3121, + "step": 4302 + }, + { + "epoch": 0.6035063113604489, + "grad_norm": 3.8848709391362175, + "learning_rate": 3.586385130600345e-06, + "loss": 0.365, + "step": 4303 + }, + { + "epoch": 0.6036465638148668, + "grad_norm": 2.1905162180230757, + "learning_rate": 3.584206690113721e-06, + "loss": 0.3611, + "step": 4304 + }, + { + "epoch": 0.6037868162692848, + "grad_norm": 2.459869152869124, + "learning_rate": 3.582028541766154e-06, + "loss": 0.3443, + "step": 4305 + }, + { + "epoch": 0.6039270687237027, + "grad_norm": 2.653502176206626, + "learning_rate": 3.5798506860070904e-06, + "loss": 0.3323, + "step": 4306 + }, + { + "epoch": 0.6040673211781206, + "grad_norm": 1.744194072660565, + "learning_rate": 3.5776731232859156e-06, + "loss": 0.3904, + "step": 4307 + }, + { + "epoch": 0.6042075736325385, + "grad_norm": 2.240101871047314, + "learning_rate": 3.575495854051957e-06, + "loss": 0.3482, + "step": 4308 + }, + { + "epoch": 0.6043478260869565, + "grad_norm": 2.3815332199236896, + "learning_rate": 3.573318878754475e-06, + "loss": 0.3602, + "step": 4309 + }, + { + "epoch": 0.6044880785413744, + "grad_norm": 2.3758423967655764, + "learning_rate": 3.5711421978426746e-06, + "loss": 0.3952, + "step": 4310 + }, + { + "epoch": 0.6046283309957924, + "grad_norm": 1.8251077907553779, + "learning_rate": 3.568965811765699e-06, + "loss": 0.3419, + "step": 4311 + }, + { + "epoch": 0.6047685834502103, + "grad_norm": 1.9703723943744282, + "learning_rate": 3.5667897209726287e-06, + "loss": 0.3436, + "step": 4312 + }, + { + "epoch": 0.6049088359046283, + "grad_norm": 1.9864605134639635, + "learning_rate": 3.564613925912488e-06, + "loss": 0.3852, + "step": 4313 + }, + { + "epoch": 0.6050490883590462, + "grad_norm": 2.0148545796287087, + "learning_rate": 3.562438427034234e-06, + "loss": 0.3714, + "step": 4314 + }, + { + "epoch": 0.6051893408134642, + "grad_norm": 1.9747675408804797, + "learning_rate": 3.5602632247867687e-06, + "loss": 0.3359, + "step": 4315 + }, + { + "epoch": 0.6053295932678822, + "grad_norm": 4.8990725359649945, + "learning_rate": 3.5580883196189265e-06, + "loss": 0.3364, + "step": 4316 + }, + { + "epoch": 0.6054698457223001, + "grad_norm": 3.1115846689784417, + "learning_rate": 3.555913711979486e-06, + "loss": 0.3918, + "step": 4317 + }, + { + "epoch": 0.6056100981767181, + "grad_norm": 1.9817920605144639, + "learning_rate": 3.553739402317162e-06, + "loss": 0.3725, + "step": 4318 + }, + { + "epoch": 0.605750350631136, + "grad_norm": 2.0237731655178086, + "learning_rate": 3.551565391080609e-06, + "loss": 0.3597, + "step": 4319 + }, + { + "epoch": 0.605890603085554, + "grad_norm": 1.9535262039102557, + "learning_rate": 3.549391678718417e-06, + "loss": 0.3575, + "step": 4320 + }, + { + "epoch": 0.6060308555399719, + "grad_norm": 1.8527534614173429, + "learning_rate": 3.5472182656791165e-06, + "loss": 0.3103, + "step": 4321 + }, + { + "epoch": 0.6061711079943899, + "grad_norm": 2.2799961354406966, + "learning_rate": 3.545045152411178e-06, + "loss": 0.3506, + "step": 4322 + }, + { + "epoch": 0.6063113604488078, + "grad_norm": 1.8539091924426185, + "learning_rate": 3.5428723393630067e-06, + "loss": 0.3227, + "step": 4323 + }, + { + "epoch": 0.6064516129032258, + "grad_norm": 1.6580855077360623, + "learning_rate": 3.5406998269829485e-06, + "loss": 0.3362, + "step": 4324 + }, + { + "epoch": 0.6065918653576438, + "grad_norm": 1.9477421551491219, + "learning_rate": 3.538527615719285e-06, + "loss": 0.3435, + "step": 4325 + }, + { + "epoch": 0.6067321178120617, + "grad_norm": 2.2625670744505983, + "learning_rate": 3.5363557060202375e-06, + "loss": 0.3489, + "step": 4326 + }, + { + "epoch": 0.6068723702664797, + "grad_norm": 2.1946981527511826, + "learning_rate": 3.5341840983339636e-06, + "loss": 0.36, + "step": 4327 + }, + { + "epoch": 0.6070126227208976, + "grad_norm": 2.067955200934907, + "learning_rate": 3.532012793108561e-06, + "loss": 0.3321, + "step": 4328 + }, + { + "epoch": 0.6071528751753156, + "grad_norm": 1.7949605279689447, + "learning_rate": 3.5298417907920633e-06, + "loss": 0.324, + "step": 4329 + }, + { + "epoch": 0.6072931276297335, + "grad_norm": 1.772084799965578, + "learning_rate": 3.52767109183244e-06, + "loss": 0.369, + "step": 4330 + }, + { + "epoch": 0.6074333800841515, + "grad_norm": 2.156329830737455, + "learning_rate": 3.5255006966776005e-06, + "loss": 0.4056, + "step": 4331 + }, + { + "epoch": 0.6075736325385694, + "grad_norm": 2.6108693851920672, + "learning_rate": 3.523330605775389e-06, + "loss": 0.3627, + "step": 4332 + }, + { + "epoch": 0.6077138849929874, + "grad_norm": 1.8027850275166566, + "learning_rate": 3.5211608195735914e-06, + "loss": 0.3219, + "step": 4333 + }, + { + "epoch": 0.6078541374474054, + "grad_norm": 2.4882090032560122, + "learning_rate": 3.518991338519926e-06, + "loss": 0.3736, + "step": 4334 + }, + { + "epoch": 0.6079943899018233, + "grad_norm": 2.132108691001789, + "learning_rate": 3.516822163062052e-06, + "loss": 0.3451, + "step": 4335 + }, + { + "epoch": 0.6081346423562413, + "grad_norm": 4.92783075299627, + "learning_rate": 3.514653293647561e-06, + "loss": 0.3489, + "step": 4336 + }, + { + "epoch": 0.6082748948106592, + "grad_norm": 1.9123994414377583, + "learning_rate": 3.5124847307239863e-06, + "loss": 0.3244, + "step": 4337 + }, + { + "epoch": 0.6084151472650772, + "grad_norm": 4.693474789207228, + "learning_rate": 3.510316474738794e-06, + "loss": 0.3431, + "step": 4338 + }, + { + "epoch": 0.6085553997194951, + "grad_norm": 2.2118403340361263, + "learning_rate": 3.5081485261393894e-06, + "loss": 0.3909, + "step": 4339 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 2.076815314363698, + "learning_rate": 3.5059808853731146e-06, + "loss": 0.3753, + "step": 4340 + }, + { + "epoch": 0.608835904628331, + "grad_norm": 1.9349908886677722, + "learning_rate": 3.5038135528872453e-06, + "loss": 0.3487, + "step": 4341 + }, + { + "epoch": 0.608976157082749, + "grad_norm": 1.9679945449986103, + "learning_rate": 3.5016465291289957e-06, + "loss": 0.3382, + "step": 4342 + }, + { + "epoch": 0.609116409537167, + "grad_norm": 1.6875717006568307, + "learning_rate": 3.4994798145455167e-06, + "loss": 0.3484, + "step": 4343 + }, + { + "epoch": 0.6092566619915849, + "grad_norm": 1.7962124473771668, + "learning_rate": 3.4973134095838943e-06, + "loss": 0.3442, + "step": 4344 + }, + { + "epoch": 0.6093969144460029, + "grad_norm": 2.0884620733783263, + "learning_rate": 3.495147314691153e-06, + "loss": 0.273, + "step": 4345 + }, + { + "epoch": 0.6095371669004208, + "grad_norm": 1.8186427632077298, + "learning_rate": 3.4929815303142483e-06, + "loss": 0.3614, + "step": 4346 + }, + { + "epoch": 0.6096774193548387, + "grad_norm": 1.8281769052293504, + "learning_rate": 3.490816056900076e-06, + "loss": 0.3226, + "step": 4347 + }, + { + "epoch": 0.6098176718092566, + "grad_norm": 2.223116701296679, + "learning_rate": 3.4886508948954656e-06, + "loss": 0.3886, + "step": 4348 + }, + { + "epoch": 0.6099579242636746, + "grad_norm": 2.728598110778955, + "learning_rate": 3.486486044747186e-06, + "loss": 0.376, + "step": 4349 + }, + { + "epoch": 0.6100981767180925, + "grad_norm": 1.7482228110659543, + "learning_rate": 3.484321506901936e-06, + "loss": 0.3132, + "step": 4350 + }, + { + "epoch": 0.6102384291725105, + "grad_norm": 2.003599980021922, + "learning_rate": 3.4821572818063544e-06, + "loss": 0.4345, + "step": 4351 + }, + { + "epoch": 0.6103786816269284, + "grad_norm": 1.7644904888758262, + "learning_rate": 3.4799933699070115e-06, + "loss": 0.3953, + "step": 4352 + }, + { + "epoch": 0.6105189340813464, + "grad_norm": 3.39412399676018, + "learning_rate": 3.477829771650417e-06, + "loss": 0.367, + "step": 4353 + }, + { + "epoch": 0.6106591865357643, + "grad_norm": 2.066529585793821, + "learning_rate": 3.4756664874830147e-06, + "loss": 0.3257, + "step": 4354 + }, + { + "epoch": 0.6107994389901823, + "grad_norm": 2.2240134119960433, + "learning_rate": 3.4735035178511832e-06, + "loss": 0.3221, + "step": 4355 + }, + { + "epoch": 0.6109396914446003, + "grad_norm": 2.079361855844014, + "learning_rate": 3.471340863201237e-06, + "loss": 0.3717, + "step": 4356 + }, + { + "epoch": 0.6110799438990182, + "grad_norm": 2.5928811146132063, + "learning_rate": 3.469178523979422e-06, + "loss": 0.3454, + "step": 4357 + }, + { + "epoch": 0.6112201963534362, + "grad_norm": 1.9893070115609928, + "learning_rate": 3.4670165006319236e-06, + "loss": 0.3507, + "step": 4358 + }, + { + "epoch": 0.6113604488078541, + "grad_norm": 2.8649412914072863, + "learning_rate": 3.4648547936048597e-06, + "loss": 0.3581, + "step": 4359 + }, + { + "epoch": 0.6115007012622721, + "grad_norm": 2.324811382231182, + "learning_rate": 3.4626934033442856e-06, + "loss": 0.3689, + "step": 4360 + }, + { + "epoch": 0.61164095371669, + "grad_norm": 2.089206491442504, + "learning_rate": 3.4605323302961857e-06, + "loss": 0.3527, + "step": 4361 + }, + { + "epoch": 0.611781206171108, + "grad_norm": 2.074881065361166, + "learning_rate": 3.458371574906484e-06, + "loss": 0.355, + "step": 4362 + }, + { + "epoch": 0.6119214586255259, + "grad_norm": 3.7156631095606607, + "learning_rate": 3.456211137621037e-06, + "loss": 0.3419, + "step": 4363 + }, + { + "epoch": 0.6120617110799439, + "grad_norm": 2.3329962645891995, + "learning_rate": 3.4540510188856357e-06, + "loss": 0.333, + "step": 4364 + }, + { + "epoch": 0.6122019635343618, + "grad_norm": 2.4543838672560327, + "learning_rate": 3.4518912191460073e-06, + "loss": 0.3611, + "step": 4365 + }, + { + "epoch": 0.6123422159887798, + "grad_norm": 2.2936147790284305, + "learning_rate": 3.449731738847809e-06, + "loss": 0.331, + "step": 4366 + }, + { + "epoch": 0.6124824684431978, + "grad_norm": 1.7890824895683208, + "learning_rate": 3.447572578436635e-06, + "loss": 0.3713, + "step": 4367 + }, + { + "epoch": 0.6126227208976157, + "grad_norm": 2.198428077921574, + "learning_rate": 3.4454137383580135e-06, + "loss": 0.3458, + "step": 4368 + }, + { + "epoch": 0.6127629733520337, + "grad_norm": 1.7246301116215097, + "learning_rate": 3.4432552190574055e-06, + "loss": 0.3272, + "step": 4369 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 2.1581439118008254, + "learning_rate": 3.4410970209802096e-06, + "loss": 0.3944, + "step": 4370 + }, + { + "epoch": 0.6130434782608696, + "grad_norm": 2.3616853227688352, + "learning_rate": 3.438939144571749e-06, + "loss": 0.3519, + "step": 4371 + }, + { + "epoch": 0.6131837307152875, + "grad_norm": 1.915360178230315, + "learning_rate": 3.4367815902772917e-06, + "loss": 0.3695, + "step": 4372 + }, + { + "epoch": 0.6133239831697055, + "grad_norm": 1.9699757711014638, + "learning_rate": 3.4346243585420297e-06, + "loss": 0.3719, + "step": 4373 + }, + { + "epoch": 0.6134642356241234, + "grad_norm": 1.9591654639630485, + "learning_rate": 3.4324674498110956e-06, + "loss": 0.3355, + "step": 4374 + }, + { + "epoch": 0.6136044880785414, + "grad_norm": 1.9766826366960935, + "learning_rate": 3.43031086452955e-06, + "loss": 0.3756, + "step": 4375 + }, + { + "epoch": 0.6137447405329594, + "grad_norm": 2.09786810062667, + "learning_rate": 3.4281546031423933e-06, + "loss": 0.3516, + "step": 4376 + }, + { + "epoch": 0.6138849929873773, + "grad_norm": 1.7469417552442383, + "learning_rate": 3.425998666094551e-06, + "loss": 0.3842, + "step": 4377 + }, + { + "epoch": 0.6140252454417953, + "grad_norm": 2.38398558710209, + "learning_rate": 3.4238430538308876e-06, + "loss": 0.3571, + "step": 4378 + }, + { + "epoch": 0.6141654978962132, + "grad_norm": 2.216296599146189, + "learning_rate": 3.4216877667961975e-06, + "loss": 0.3246, + "step": 4379 + }, + { + "epoch": 0.6143057503506312, + "grad_norm": 1.6360010289606342, + "learning_rate": 3.4195328054352097e-06, + "loss": 0.3561, + "step": 4380 + }, + { + "epoch": 0.6144460028050491, + "grad_norm": 2.0961595022358095, + "learning_rate": 3.417378170192587e-06, + "loss": 0.2913, + "step": 4381 + }, + { + "epoch": 0.6145862552594671, + "grad_norm": 1.8339831399964226, + "learning_rate": 3.4152238615129208e-06, + "loss": 0.3441, + "step": 4382 + }, + { + "epoch": 0.614726507713885, + "grad_norm": 2.756747723748065, + "learning_rate": 3.413069879840738e-06, + "loss": 0.3386, + "step": 4383 + }, + { + "epoch": 0.614866760168303, + "grad_norm": 2.056533970109622, + "learning_rate": 3.4109162256204988e-06, + "loss": 0.3845, + "step": 4384 + }, + { + "epoch": 0.615007012622721, + "grad_norm": 1.9620190559573933, + "learning_rate": 3.4087628992965937e-06, + "loss": 0.3466, + "step": 4385 + }, + { + "epoch": 0.6151472650771389, + "grad_norm": 2.395230314135045, + "learning_rate": 3.406609901313349e-06, + "loss": 0.433, + "step": 4386 + }, + { + "epoch": 0.6152875175315567, + "grad_norm": 4.720998266334918, + "learning_rate": 3.404457232115017e-06, + "loss": 0.3398, + "step": 4387 + }, + { + "epoch": 0.6154277699859747, + "grad_norm": 2.3935329727314674, + "learning_rate": 3.402304892145788e-06, + "loss": 0.3682, + "step": 4388 + }, + { + "epoch": 0.6155680224403927, + "grad_norm": 1.919283538596872, + "learning_rate": 3.4001528818497826e-06, + "loss": 0.362, + "step": 4389 + }, + { + "epoch": 0.6157082748948106, + "grad_norm": 2.507789302730524, + "learning_rate": 3.3980012016710533e-06, + "loss": 0.313, + "step": 4390 + }, + { + "epoch": 0.6158485273492286, + "grad_norm": 1.7911317677420253, + "learning_rate": 3.395849852053584e-06, + "loss": 0.3232, + "step": 4391 + }, + { + "epoch": 0.6159887798036465, + "grad_norm": 2.602774031518904, + "learning_rate": 3.3936988334412895e-06, + "loss": 0.3787, + "step": 4392 + }, + { + "epoch": 0.6161290322580645, + "grad_norm": 2.1162838291769814, + "learning_rate": 3.3915481462780174e-06, + "loss": 0.3558, + "step": 4393 + }, + { + "epoch": 0.6162692847124824, + "grad_norm": 2.020274396331996, + "learning_rate": 3.389397791007548e-06, + "loss": 0.3406, + "step": 4394 + }, + { + "epoch": 0.6164095371669004, + "grad_norm": 2.5636787494670137, + "learning_rate": 3.3872477680735915e-06, + "loss": 0.368, + "step": 4395 + }, + { + "epoch": 0.6165497896213183, + "grad_norm": 1.6640758245230183, + "learning_rate": 3.385098077919791e-06, + "loss": 0.3303, + "step": 4396 + }, + { + "epoch": 0.6166900420757363, + "grad_norm": 2.0837048644341767, + "learning_rate": 3.3829487209897207e-06, + "loss": 0.3385, + "step": 4397 + }, + { + "epoch": 0.6168302945301543, + "grad_norm": 2.166659452964316, + "learning_rate": 3.3807996977268825e-06, + "loss": 0.3254, + "step": 4398 + }, + { + "epoch": 0.6169705469845722, + "grad_norm": 2.14184413080648, + "learning_rate": 3.3786510085747145e-06, + "loss": 0.3696, + "step": 4399 + }, + { + "epoch": 0.6171107994389902, + "grad_norm": 1.788784755153217, + "learning_rate": 3.3765026539765832e-06, + "loss": 0.3462, + "step": 4400 + }, + { + "epoch": 0.6172510518934081, + "grad_norm": 2.244079642135408, + "learning_rate": 3.3743546343757872e-06, + "loss": 0.3661, + "step": 4401 + }, + { + "epoch": 0.6173913043478261, + "grad_norm": 2.5572643291516837, + "learning_rate": 3.3722069502155543e-06, + "loss": 0.3353, + "step": 4402 + }, + { + "epoch": 0.617531556802244, + "grad_norm": 1.7719158753216584, + "learning_rate": 3.370059601939044e-06, + "loss": 0.3331, + "step": 4403 + }, + { + "epoch": 0.617671809256662, + "grad_norm": 2.1734355561763974, + "learning_rate": 3.3679125899893474e-06, + "loss": 0.3341, + "step": 4404 + }, + { + "epoch": 0.6178120617110799, + "grad_norm": 1.8119890216216927, + "learning_rate": 3.3657659148094855e-06, + "loss": 0.3478, + "step": 4405 + }, + { + "epoch": 0.6179523141654979, + "grad_norm": 2.248818405154216, + "learning_rate": 3.36361957684241e-06, + "loss": 0.3737, + "step": 4406 + }, + { + "epoch": 0.6180925666199159, + "grad_norm": 1.8517112833113465, + "learning_rate": 3.3614735765310013e-06, + "loss": 0.3739, + "step": 4407 + }, + { + "epoch": 0.6182328190743338, + "grad_norm": 1.9878941367072047, + "learning_rate": 3.3593279143180723e-06, + "loss": 0.4018, + "step": 4408 + }, + { + "epoch": 0.6183730715287518, + "grad_norm": 2.1501275091285805, + "learning_rate": 3.357182590646366e-06, + "loss": 0.3807, + "step": 4409 + }, + { + "epoch": 0.6185133239831697, + "grad_norm": 2.0735349515825794, + "learning_rate": 3.355037605958554e-06, + "loss": 0.3493, + "step": 4410 + }, + { + "epoch": 0.6186535764375877, + "grad_norm": 1.8811474160648354, + "learning_rate": 3.3528929606972407e-06, + "loss": 0.3096, + "step": 4411 + }, + { + "epoch": 0.6187938288920056, + "grad_norm": 2.3151646289003147, + "learning_rate": 3.3507486553049572e-06, + "loss": 0.3784, + "step": 4412 + }, + { + "epoch": 0.6189340813464236, + "grad_norm": 3.573511072359696, + "learning_rate": 3.3486046902241663e-06, + "loss": 0.3747, + "step": 4413 + }, + { + "epoch": 0.6190743338008415, + "grad_norm": 2.037821190915984, + "learning_rate": 3.3464610658972584e-06, + "loss": 0.3391, + "step": 4414 + }, + { + "epoch": 0.6192145862552595, + "grad_norm": 1.9200223351212615, + "learning_rate": 3.344317782766558e-06, + "loss": 0.3862, + "step": 4415 + }, + { + "epoch": 0.6193548387096774, + "grad_norm": 2.054567050666888, + "learning_rate": 3.342174841274315e-06, + "loss": 0.3239, + "step": 4416 + }, + { + "epoch": 0.6194950911640954, + "grad_norm": 3.476778790892976, + "learning_rate": 3.3400322418627117e-06, + "loss": 0.3166, + "step": 4417 + }, + { + "epoch": 0.6196353436185134, + "grad_norm": 2.1363802960851173, + "learning_rate": 3.337889984973858e-06, + "loss": 0.308, + "step": 4418 + }, + { + "epoch": 0.6197755960729313, + "grad_norm": 2.1849234529612915, + "learning_rate": 3.3357480710497925e-06, + "loss": 0.3627, + "step": 4419 + }, + { + "epoch": 0.6199158485273493, + "grad_norm": 2.1862200670935885, + "learning_rate": 3.3336065005324847e-06, + "loss": 0.3147, + "step": 4420 + }, + { + "epoch": 0.6200561009817672, + "grad_norm": 1.937681558777636, + "learning_rate": 3.331465273863834e-06, + "loss": 0.3547, + "step": 4421 + }, + { + "epoch": 0.6201963534361852, + "grad_norm": 2.0081232327721494, + "learning_rate": 3.3293243914856676e-06, + "loss": 0.3449, + "step": 4422 + }, + { + "epoch": 0.6203366058906031, + "grad_norm": 1.85053936338573, + "learning_rate": 3.32718385383974e-06, + "loss": 0.3721, + "step": 4423 + }, + { + "epoch": 0.6204768583450211, + "grad_norm": 5.283646260008643, + "learning_rate": 3.3250436613677366e-06, + "loss": 0.3561, + "step": 4424 + }, + { + "epoch": 0.620617110799439, + "grad_norm": 1.7050215634187804, + "learning_rate": 3.3229038145112713e-06, + "loss": 0.3319, + "step": 4425 + }, + { + "epoch": 0.620757363253857, + "grad_norm": 2.1540374461313196, + "learning_rate": 3.3207643137118872e-06, + "loss": 0.3501, + "step": 4426 + }, + { + "epoch": 0.6208976157082748, + "grad_norm": 2.086014535460115, + "learning_rate": 3.318625159411056e-06, + "loss": 0.3396, + "step": 4427 + }, + { + "epoch": 0.6210378681626928, + "grad_norm": 1.6800643143377814, + "learning_rate": 3.3164863520501744e-06, + "loss": 0.3346, + "step": 4428 + }, + { + "epoch": 0.6211781206171108, + "grad_norm": 2.128774544377825, + "learning_rate": 3.314347892070573e-06, + "loss": 0.3444, + "step": 4429 + }, + { + "epoch": 0.6213183730715287, + "grad_norm": 2.2955677698127803, + "learning_rate": 3.3122097799135066e-06, + "loss": 0.3269, + "step": 4430 + }, + { + "epoch": 0.6214586255259467, + "grad_norm": 1.8260493436290384, + "learning_rate": 3.3100720160201615e-06, + "loss": 0.3489, + "step": 4431 + }, + { + "epoch": 0.6215988779803646, + "grad_norm": 2.193172837543114, + "learning_rate": 3.307934600831648e-06, + "loss": 0.3836, + "step": 4432 + }, + { + "epoch": 0.6217391304347826, + "grad_norm": 2.0765223238741863, + "learning_rate": 3.30579753478901e-06, + "loss": 0.366, + "step": 4433 + }, + { + "epoch": 0.6218793828892005, + "grad_norm": 1.8817112024655251, + "learning_rate": 3.303660818333212e-06, + "loss": 0.3176, + "step": 4434 + }, + { + "epoch": 0.6220196353436185, + "grad_norm": 2.409517225132294, + "learning_rate": 3.3015244519051525e-06, + "loss": 0.4114, + "step": 4435 + }, + { + "epoch": 0.6221598877980364, + "grad_norm": 1.944579143248559, + "learning_rate": 3.2993884359456557e-06, + "loss": 0.3663, + "step": 4436 + }, + { + "epoch": 0.6223001402524544, + "grad_norm": 2.218416108273978, + "learning_rate": 3.2972527708954737e-06, + "loss": 0.3758, + "step": 4437 + }, + { + "epoch": 0.6224403927068723, + "grad_norm": 2.124198762519748, + "learning_rate": 3.295117457195288e-06, + "loss": 0.3691, + "step": 4438 + }, + { + "epoch": 0.6225806451612903, + "grad_norm": 1.8243388559146436, + "learning_rate": 3.2929824952857014e-06, + "loss": 0.386, + "step": 4439 + }, + { + "epoch": 0.6227208976157083, + "grad_norm": 2.110256886686457, + "learning_rate": 3.2908478856072518e-06, + "loss": 0.3775, + "step": 4440 + }, + { + "epoch": 0.6228611500701262, + "grad_norm": 1.850135671950415, + "learning_rate": 3.2887136286003997e-06, + "loss": 0.3078, + "step": 4441 + }, + { + "epoch": 0.6230014025245442, + "grad_norm": 2.3813354917383327, + "learning_rate": 3.2865797247055354e-06, + "loss": 0.3784, + "step": 4442 + }, + { + "epoch": 0.6231416549789621, + "grad_norm": 1.7585870689288905, + "learning_rate": 3.2844461743629725e-06, + "loss": 0.3873, + "step": 4443 + }, + { + "epoch": 0.6232819074333801, + "grad_norm": 2.2404897821930203, + "learning_rate": 3.282312978012956e-06, + "loss": 0.3758, + "step": 4444 + }, + { + "epoch": 0.623422159887798, + "grad_norm": 2.061052103160456, + "learning_rate": 3.2801801360956557e-06, + "loss": 0.3572, + "step": 4445 + }, + { + "epoch": 0.623562412342216, + "grad_norm": 1.8882111832699573, + "learning_rate": 3.2780476490511694e-06, + "loss": 0.3954, + "step": 4446 + }, + { + "epoch": 0.6237026647966339, + "grad_norm": 2.0889731046370543, + "learning_rate": 3.27591551731952e-06, + "loss": 0.3562, + "step": 4447 + }, + { + "epoch": 0.6238429172510519, + "grad_norm": 2.263854920406114, + "learning_rate": 3.273783741340658e-06, + "loss": 0.3687, + "step": 4448 + }, + { + "epoch": 0.6239831697054699, + "grad_norm": 2.3011147948687434, + "learning_rate": 3.2716523215544602e-06, + "loss": 0.3351, + "step": 4449 + }, + { + "epoch": 0.6241234221598878, + "grad_norm": 1.9925843685002225, + "learning_rate": 3.269521258400731e-06, + "loss": 0.3504, + "step": 4450 + }, + { + "epoch": 0.6242636746143058, + "grad_norm": 2.0951980593704027, + "learning_rate": 3.2673905523192e-06, + "loss": 0.3716, + "step": 4451 + }, + { + "epoch": 0.6244039270687237, + "grad_norm": 3.050645422262411, + "learning_rate": 3.2652602037495247e-06, + "loss": 0.3699, + "step": 4452 + }, + { + "epoch": 0.6245441795231417, + "grad_norm": 3.1397779850314396, + "learning_rate": 3.2631302131312854e-06, + "loss": 0.3832, + "step": 4453 + }, + { + "epoch": 0.6246844319775596, + "grad_norm": 1.8468113070121392, + "learning_rate": 3.2610005809039936e-06, + "loss": 0.3768, + "step": 4454 + }, + { + "epoch": 0.6248246844319776, + "grad_norm": 1.8401996891880095, + "learning_rate": 3.258871307507081e-06, + "loss": 0.3362, + "step": 4455 + }, + { + "epoch": 0.6249649368863955, + "grad_norm": 2.162123158866688, + "learning_rate": 3.256742393379909e-06, + "loss": 0.3345, + "step": 4456 + }, + { + "epoch": 0.6251051893408135, + "grad_norm": 1.9774089704508973, + "learning_rate": 3.254613838961765e-06, + "loss": 0.311, + "step": 4457 + }, + { + "epoch": 0.6252454417952314, + "grad_norm": 1.6822473226869763, + "learning_rate": 3.252485644691862e-06, + "loss": 0.2772, + "step": 4458 + }, + { + "epoch": 0.6253856942496494, + "grad_norm": 3.0043092613574953, + "learning_rate": 3.2503578110093358e-06, + "loss": 0.3381, + "step": 4459 + }, + { + "epoch": 0.6255259467040674, + "grad_norm": 1.9601816491206723, + "learning_rate": 3.248230338353252e-06, + "loss": 0.3463, + "step": 4460 + }, + { + "epoch": 0.6256661991584853, + "grad_norm": 1.893437272238824, + "learning_rate": 3.2461032271625982e-06, + "loss": 0.3776, + "step": 4461 + }, + { + "epoch": 0.6258064516129033, + "grad_norm": 1.8747348597935514, + "learning_rate": 3.2439764778762906e-06, + "loss": 0.3542, + "step": 4462 + }, + { + "epoch": 0.6259467040673212, + "grad_norm": 2.12824685620318, + "learning_rate": 3.2418500909331684e-06, + "loss": 0.3022, + "step": 4463 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 1.7486535069291265, + "learning_rate": 3.2397240667719963e-06, + "loss": 0.3005, + "step": 4464 + }, + { + "epoch": 0.6262272089761571, + "grad_norm": 1.9781555872476282, + "learning_rate": 3.2375984058314647e-06, + "loss": 0.3385, + "step": 4465 + }, + { + "epoch": 0.6263674614305751, + "grad_norm": 2.805830103803709, + "learning_rate": 3.235473108550189e-06, + "loss": 0.3415, + "step": 4466 + }, + { + "epoch": 0.6265077138849929, + "grad_norm": 2.2315230942406568, + "learning_rate": 3.233348175366709e-06, + "loss": 0.4034, + "step": 4467 + }, + { + "epoch": 0.6266479663394109, + "grad_norm": 1.9608690470263896, + "learning_rate": 3.2312236067194913e-06, + "loss": 0.3344, + "step": 4468 + }, + { + "epoch": 0.6267882187938288, + "grad_norm": 1.8807564426987695, + "learning_rate": 3.2290994030469237e-06, + "loss": 0.3529, + "step": 4469 + }, + { + "epoch": 0.6269284712482468, + "grad_norm": 2.721551876399985, + "learning_rate": 3.226975564787322e-06, + "loss": 0.3569, + "step": 4470 + }, + { + "epoch": 0.6270687237026648, + "grad_norm": 6.783111176223187, + "learning_rate": 3.224852092378925e-06, + "loss": 0.3592, + "step": 4471 + }, + { + "epoch": 0.6272089761570827, + "grad_norm": 2.198620350600676, + "learning_rate": 3.2227289862598976e-06, + "loss": 0.3864, + "step": 4472 + }, + { + "epoch": 0.6273492286115007, + "grad_norm": 2.0113101877674753, + "learning_rate": 3.220606246868326e-06, + "loss": 0.3373, + "step": 4473 + }, + { + "epoch": 0.6274894810659186, + "grad_norm": 1.9854114859300918, + "learning_rate": 3.2184838746422233e-06, + "loss": 0.3461, + "step": 4474 + }, + { + "epoch": 0.6276297335203366, + "grad_norm": 2.0358610195921947, + "learning_rate": 3.2163618700195285e-06, + "loss": 0.3876, + "step": 4475 + }, + { + "epoch": 0.6277699859747545, + "grad_norm": 1.7221775211494363, + "learning_rate": 3.2142402334380984e-06, + "loss": 0.3518, + "step": 4476 + }, + { + "epoch": 0.6279102384291725, + "grad_norm": 1.9751394222639023, + "learning_rate": 3.21211896533572e-06, + "loss": 0.37, + "step": 4477 + }, + { + "epoch": 0.6280504908835904, + "grad_norm": 1.8196223837672574, + "learning_rate": 3.2099980661501016e-06, + "loss": 0.3333, + "step": 4478 + }, + { + "epoch": 0.6281907433380084, + "grad_norm": 1.7610174504487206, + "learning_rate": 3.2078775363188775e-06, + "loss": 0.3595, + "step": 4479 + }, + { + "epoch": 0.6283309957924264, + "grad_norm": 2.2911030394129006, + "learning_rate": 3.205757376279602e-06, + "loss": 0.3077, + "step": 4480 + }, + { + "epoch": 0.6284712482468443, + "grad_norm": 1.5870498739961734, + "learning_rate": 3.203637586469756e-06, + "loss": 0.3054, + "step": 4481 + }, + { + "epoch": 0.6286115007012623, + "grad_norm": 1.878691218834135, + "learning_rate": 3.2015181673267435e-06, + "loss": 0.3691, + "step": 4482 + }, + { + "epoch": 0.6287517531556802, + "grad_norm": 1.8826255230640434, + "learning_rate": 3.199399119287894e-06, + "loss": 0.3468, + "step": 4483 + }, + { + "epoch": 0.6288920056100982, + "grad_norm": 2.501134043021219, + "learning_rate": 3.197280442790455e-06, + "loss": 0.3495, + "step": 4484 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 1.8727212410010685, + "learning_rate": 3.1951621382716015e-06, + "loss": 0.3651, + "step": 4485 + }, + { + "epoch": 0.6291725105189341, + "grad_norm": 2.2809491939012343, + "learning_rate": 3.1930442061684306e-06, + "loss": 0.3981, + "step": 4486 + }, + { + "epoch": 0.629312762973352, + "grad_norm": 2.972648550417098, + "learning_rate": 3.1909266469179644e-06, + "loss": 0.3392, + "step": 4487 + }, + { + "epoch": 0.62945301542777, + "grad_norm": 1.940978158461384, + "learning_rate": 3.1888094609571463e-06, + "loss": 0.2972, + "step": 4488 + }, + { + "epoch": 0.629593267882188, + "grad_norm": 1.5125886055710485, + "learning_rate": 3.18669264872284e-06, + "loss": 0.3233, + "step": 4489 + }, + { + "epoch": 0.6297335203366059, + "grad_norm": 2.116794000635274, + "learning_rate": 3.1845762106518374e-06, + "loss": 0.308, + "step": 4490 + }, + { + "epoch": 0.6298737727910239, + "grad_norm": 1.9100346126715593, + "learning_rate": 3.1824601471808504e-06, + "loss": 0.321, + "step": 4491 + }, + { + "epoch": 0.6300140252454418, + "grad_norm": 1.8993772841474652, + "learning_rate": 3.180344458746514e-06, + "loss": 0.3533, + "step": 4492 + }, + { + "epoch": 0.6301542776998598, + "grad_norm": 1.843474674680991, + "learning_rate": 3.178229145785386e-06, + "loss": 0.3413, + "step": 4493 + }, + { + "epoch": 0.6302945301542777, + "grad_norm": 2.7419767380276525, + "learning_rate": 3.1761142087339446e-06, + "loss": 0.3849, + "step": 4494 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 2.6959122584806106, + "learning_rate": 3.1739996480285963e-06, + "loss": 0.344, + "step": 4495 + }, + { + "epoch": 0.6305750350631136, + "grad_norm": 1.8567263547501958, + "learning_rate": 3.171885464105661e-06, + "loss": 0.3539, + "step": 4496 + }, + { + "epoch": 0.6307152875175316, + "grad_norm": 2.214904461386831, + "learning_rate": 3.169771657401387e-06, + "loss": 0.3491, + "step": 4497 + }, + { + "epoch": 0.6308555399719495, + "grad_norm": 2.3015393496175847, + "learning_rate": 3.1676582283519454e-06, + "loss": 0.3171, + "step": 4498 + }, + { + "epoch": 0.6309957924263675, + "grad_norm": 2.2839814646848975, + "learning_rate": 3.165545177393427e-06, + "loss": 0.3529, + "step": 4499 + }, + { + "epoch": 0.6311360448807855, + "grad_norm": 2.0924276474780243, + "learning_rate": 3.1634325049618443e-06, + "loss": 0.3688, + "step": 4500 + }, + { + "epoch": 0.6312762973352034, + "grad_norm": 1.7791027042704783, + "learning_rate": 3.161320211493133e-06, + "loss": 0.316, + "step": 4501 + }, + { + "epoch": 0.6314165497896214, + "grad_norm": 2.182453437824107, + "learning_rate": 3.15920829742315e-06, + "loss": 0.3754, + "step": 4502 + }, + { + "epoch": 0.6315568022440393, + "grad_norm": 2.2554687161737963, + "learning_rate": 3.1570967631876733e-06, + "loss": 0.4206, + "step": 4503 + }, + { + "epoch": 0.6316970546984573, + "grad_norm": 2.0332195416904475, + "learning_rate": 3.154985609222405e-06, + "loss": 0.3274, + "step": 4504 + }, + { + "epoch": 0.6318373071528752, + "grad_norm": 1.6553149192811103, + "learning_rate": 3.1528748359629657e-06, + "loss": 0.3651, + "step": 4505 + }, + { + "epoch": 0.6319775596072932, + "grad_norm": 2.0617873376652684, + "learning_rate": 3.1507644438448987e-06, + "loss": 0.4181, + "step": 4506 + }, + { + "epoch": 0.632117812061711, + "grad_norm": 2.6060304508084267, + "learning_rate": 3.1486544333036687e-06, + "loss": 0.3579, + "step": 4507 + }, + { + "epoch": 0.632258064516129, + "grad_norm": 1.8247594910048301, + "learning_rate": 3.1465448047746626e-06, + "loss": 0.3772, + "step": 4508 + }, + { + "epoch": 0.6323983169705469, + "grad_norm": 1.9582347709797019, + "learning_rate": 3.1444355586931876e-06, + "loss": 0.3817, + "step": 4509 + }, + { + "epoch": 0.6325385694249649, + "grad_norm": 2.7868882367542005, + "learning_rate": 3.1423266954944694e-06, + "loss": 0.3441, + "step": 4510 + }, + { + "epoch": 0.6326788218793828, + "grad_norm": 3.0028583896515637, + "learning_rate": 3.1402182156136586e-06, + "loss": 0.3575, + "step": 4511 + }, + { + "epoch": 0.6328190743338008, + "grad_norm": 2.1012603656029767, + "learning_rate": 3.1381101194858264e-06, + "loss": 0.3496, + "step": 4512 + }, + { + "epoch": 0.6329593267882188, + "grad_norm": 4.551077143009234, + "learning_rate": 3.136002407545964e-06, + "loss": 0.3565, + "step": 4513 + }, + { + "epoch": 0.6330995792426367, + "grad_norm": 1.6054086789961206, + "learning_rate": 3.1338950802289802e-06, + "loss": 0.3479, + "step": 4514 + }, + { + "epoch": 0.6332398316970547, + "grad_norm": 2.6545066947055016, + "learning_rate": 3.131788137969709e-06, + "loss": 0.3499, + "step": 4515 + }, + { + "epoch": 0.6333800841514726, + "grad_norm": 2.0053540329617285, + "learning_rate": 3.1296815812029058e-06, + "loss": 0.338, + "step": 4516 + }, + { + "epoch": 0.6335203366058906, + "grad_norm": 1.8970723836599421, + "learning_rate": 3.1275754103632385e-06, + "loss": 0.3494, + "step": 4517 + }, + { + "epoch": 0.6336605890603085, + "grad_norm": 2.002465278429241, + "learning_rate": 3.1254696258853034e-06, + "loss": 0.2897, + "step": 4518 + }, + { + "epoch": 0.6338008415147265, + "grad_norm": 1.9478185972390356, + "learning_rate": 3.1233642282036147e-06, + "loss": 0.362, + "step": 4519 + }, + { + "epoch": 0.6339410939691444, + "grad_norm": 2.0847051173194684, + "learning_rate": 3.121259217752608e-06, + "loss": 0.4214, + "step": 4520 + }, + { + "epoch": 0.6340813464235624, + "grad_norm": 1.709321186118227, + "learning_rate": 3.119154594966634e-06, + "loss": 0.3926, + "step": 4521 + }, + { + "epoch": 0.6342215988779804, + "grad_norm": 3.474513034868019, + "learning_rate": 3.1170503602799695e-06, + "loss": 0.3445, + "step": 4522 + }, + { + "epoch": 0.6343618513323983, + "grad_norm": 1.7033494573950991, + "learning_rate": 3.114946514126807e-06, + "loss": 0.3354, + "step": 4523 + }, + { + "epoch": 0.6345021037868163, + "grad_norm": 1.723414939532117, + "learning_rate": 3.112843056941263e-06, + "loss": 0.355, + "step": 4524 + }, + { + "epoch": 0.6346423562412342, + "grad_norm": 2.4963643340783115, + "learning_rate": 3.1107399891573675e-06, + "loss": 0.3273, + "step": 4525 + }, + { + "epoch": 0.6347826086956522, + "grad_norm": 1.7502858745863457, + "learning_rate": 3.1086373112090762e-06, + "loss": 0.3301, + "step": 4526 + }, + { + "epoch": 0.6349228611500701, + "grad_norm": 2.12727357746543, + "learning_rate": 3.106535023530262e-06, + "loss": 0.3441, + "step": 4527 + }, + { + "epoch": 0.6350631136044881, + "grad_norm": 1.96508957403895, + "learning_rate": 3.1044331265547168e-06, + "loss": 0.3411, + "step": 4528 + }, + { + "epoch": 0.635203366058906, + "grad_norm": 1.8779178168704709, + "learning_rate": 3.1023316207161535e-06, + "loss": 0.3713, + "step": 4529 + }, + { + "epoch": 0.635343618513324, + "grad_norm": 2.896004449925244, + "learning_rate": 3.1002305064482006e-06, + "loss": 0.3896, + "step": 4530 + }, + { + "epoch": 0.635483870967742, + "grad_norm": 1.8324450450530976, + "learning_rate": 3.0981297841844106e-06, + "loss": 0.3034, + "step": 4531 + }, + { + "epoch": 0.6356241234221599, + "grad_norm": 1.9464898893464622, + "learning_rate": 3.0960294543582513e-06, + "loss": 0.32, + "step": 4532 + }, + { + "epoch": 0.6357643758765779, + "grad_norm": 2.455072686698432, + "learning_rate": 3.0939295174031127e-06, + "loss": 0.3627, + "step": 4533 + }, + { + "epoch": 0.6359046283309958, + "grad_norm": 2.055774473727265, + "learning_rate": 3.0918299737523016e-06, + "loss": 0.369, + "step": 4534 + }, + { + "epoch": 0.6360448807854138, + "grad_norm": 3.01562088882087, + "learning_rate": 3.0897308238390432e-06, + "loss": 0.3974, + "step": 4535 + }, + { + "epoch": 0.6361851332398317, + "grad_norm": 2.2624573761523665, + "learning_rate": 3.087632068096483e-06, + "loss": 0.3867, + "step": 4536 + }, + { + "epoch": 0.6363253856942497, + "grad_norm": 2.323906782995709, + "learning_rate": 3.0855337069576872e-06, + "loss": 0.3613, + "step": 4537 + }, + { + "epoch": 0.6364656381486676, + "grad_norm": 2.8973748657484584, + "learning_rate": 3.0834357408556333e-06, + "loss": 0.3726, + "step": 4538 + }, + { + "epoch": 0.6366058906030856, + "grad_norm": 2.0203821392994072, + "learning_rate": 3.0813381702232235e-06, + "loss": 0.3743, + "step": 4539 + }, + { + "epoch": 0.6367461430575035, + "grad_norm": 1.671030062781153, + "learning_rate": 3.079240995493279e-06, + "loss": 0.347, + "step": 4540 + }, + { + "epoch": 0.6368863955119215, + "grad_norm": 2.2229207556942265, + "learning_rate": 3.0771442170985344e-06, + "loss": 0.4049, + "step": 4541 + }, + { + "epoch": 0.6370266479663395, + "grad_norm": 2.2035882341080124, + "learning_rate": 3.0750478354716463e-06, + "loss": 0.3522, + "step": 4542 + }, + { + "epoch": 0.6371669004207574, + "grad_norm": 1.7015928323527512, + "learning_rate": 3.0729518510451888e-06, + "loss": 0.2752, + "step": 4543 + }, + { + "epoch": 0.6373071528751754, + "grad_norm": 2.2296046839774006, + "learning_rate": 3.0708562642516538e-06, + "loss": 0.3118, + "step": 4544 + }, + { + "epoch": 0.6374474053295933, + "grad_norm": 1.9898353004351053, + "learning_rate": 3.068761075523451e-06, + "loss": 0.41, + "step": 4545 + }, + { + "epoch": 0.6375876577840113, + "grad_norm": 1.9692703649289445, + "learning_rate": 3.0666662852929063e-06, + "loss": 0.3181, + "step": 4546 + }, + { + "epoch": 0.6377279102384291, + "grad_norm": 1.9840463080731596, + "learning_rate": 3.0645718939922668e-06, + "loss": 0.3099, + "step": 4547 + }, + { + "epoch": 0.6378681626928471, + "grad_norm": 2.123101901382688, + "learning_rate": 3.0624779020536955e-06, + "loss": 0.3696, + "step": 4548 + }, + { + "epoch": 0.638008415147265, + "grad_norm": 2.1057792250647633, + "learning_rate": 3.0603843099092713e-06, + "loss": 0.3504, + "step": 4549 + }, + { + "epoch": 0.638148667601683, + "grad_norm": 2.5099752731929046, + "learning_rate": 3.058291117990996e-06, + "loss": 0.3683, + "step": 4550 + }, + { + "epoch": 0.6382889200561009, + "grad_norm": 2.002760412687919, + "learning_rate": 3.0561983267307803e-06, + "loss": 0.3304, + "step": 4551 + }, + { + "epoch": 0.6384291725105189, + "grad_norm": 1.9658643980052655, + "learning_rate": 3.0541059365604597e-06, + "loss": 0.3564, + "step": 4552 + }, + { + "epoch": 0.6385694249649368, + "grad_norm": 2.0715207586774484, + "learning_rate": 3.0520139479117844e-06, + "loss": 0.3732, + "step": 4553 + }, + { + "epoch": 0.6387096774193548, + "grad_norm": 2.3694462339278464, + "learning_rate": 3.049922361216422e-06, + "loss": 0.3856, + "step": 4554 + }, + { + "epoch": 0.6388499298737728, + "grad_norm": 4.295252712218037, + "learning_rate": 3.0478311769059554e-06, + "loss": 0.351, + "step": 4555 + }, + { + "epoch": 0.6389901823281907, + "grad_norm": 2.049159747723904, + "learning_rate": 3.045740395411886e-06, + "loss": 0.3388, + "step": 4556 + }, + { + "epoch": 0.6391304347826087, + "grad_norm": 2.5809980625114277, + "learning_rate": 3.0436500171656327e-06, + "loss": 0.3509, + "step": 4557 + }, + { + "epoch": 0.6392706872370266, + "grad_norm": 2.0725467933369535, + "learning_rate": 3.041560042598532e-06, + "loss": 0.3394, + "step": 4558 + }, + { + "epoch": 0.6394109396914446, + "grad_norm": 2.1502446013304013, + "learning_rate": 3.039470472141832e-06, + "loss": 0.3715, + "step": 4559 + }, + { + "epoch": 0.6395511921458625, + "grad_norm": 1.933816792888678, + "learning_rate": 3.0373813062267025e-06, + "loss": 0.3517, + "step": 4560 + }, + { + "epoch": 0.6396914446002805, + "grad_norm": 2.4159210852688426, + "learning_rate": 3.03529254528423e-06, + "loss": 0.3431, + "step": 4561 + }, + { + "epoch": 0.6398316970546984, + "grad_norm": 1.8660322629790949, + "learning_rate": 3.033204189745413e-06, + "loss": 0.3974, + "step": 4562 + }, + { + "epoch": 0.6399719495091164, + "grad_norm": 1.8247432893984266, + "learning_rate": 3.0311162400411697e-06, + "loss": 0.3155, + "step": 4563 + }, + { + "epoch": 0.6401122019635344, + "grad_norm": 1.883526298742388, + "learning_rate": 3.0290286966023353e-06, + "loss": 0.3252, + "step": 4564 + }, + { + "epoch": 0.6402524544179523, + "grad_norm": 2.004238168097906, + "learning_rate": 3.0269415598596604e-06, + "loss": 0.3367, + "step": 4565 + }, + { + "epoch": 0.6403927068723703, + "grad_norm": 1.7507196036792376, + "learning_rate": 3.024854830243808e-06, + "loss": 0.3485, + "step": 4566 + }, + { + "epoch": 0.6405329593267882, + "grad_norm": 1.8850351723948737, + "learning_rate": 3.022768508185362e-06, + "loss": 0.3939, + "step": 4567 + }, + { + "epoch": 0.6406732117812062, + "grad_norm": 2.5476508333814794, + "learning_rate": 3.0206825941148203e-06, + "loss": 0.3769, + "step": 4568 + }, + { + "epoch": 0.6408134642356241, + "grad_norm": 1.986979899941672, + "learning_rate": 3.018597088462597e-06, + "loss": 0.3244, + "step": 4569 + }, + { + "epoch": 0.6409537166900421, + "grad_norm": 3.0521013869097735, + "learning_rate": 3.0165119916590224e-06, + "loss": 0.3706, + "step": 4570 + }, + { + "epoch": 0.64109396914446, + "grad_norm": 1.7491560672162012, + "learning_rate": 3.0144273041343393e-06, + "loss": 0.3718, + "step": 4571 + }, + { + "epoch": 0.641234221598878, + "grad_norm": 2.184723536200053, + "learning_rate": 3.0123430263187092e-06, + "loss": 0.2994, + "step": 4572 + }, + { + "epoch": 0.641374474053296, + "grad_norm": 1.7168160395154741, + "learning_rate": 3.01025915864221e-06, + "loss": 0.3525, + "step": 4573 + }, + { + "epoch": 0.6415147265077139, + "grad_norm": 1.5907500961665788, + "learning_rate": 3.008175701534831e-06, + "loss": 0.3468, + "step": 4574 + }, + { + "epoch": 0.6416549789621319, + "grad_norm": 2.0082021847188276, + "learning_rate": 3.006092655426481e-06, + "loss": 0.3338, + "step": 4575 + }, + { + "epoch": 0.6417952314165498, + "grad_norm": 4.189800256961816, + "learning_rate": 3.00401002074698e-06, + "loss": 0.3742, + "step": 4576 + }, + { + "epoch": 0.6419354838709678, + "grad_norm": 2.153385194246446, + "learning_rate": 3.001927797926067e-06, + "loss": 0.3752, + "step": 4577 + }, + { + "epoch": 0.6420757363253857, + "grad_norm": 2.1230894363474717, + "learning_rate": 2.9998459873933927e-06, + "loss": 0.3738, + "step": 4578 + }, + { + "epoch": 0.6422159887798037, + "grad_norm": 1.9055915070390135, + "learning_rate": 2.997764589578527e-06, + "loss": 0.3081, + "step": 4579 + }, + { + "epoch": 0.6423562412342216, + "grad_norm": 1.891160305944515, + "learning_rate": 2.995683604910947e-06, + "loss": 0.36, + "step": 4580 + }, + { + "epoch": 0.6424964936886396, + "grad_norm": 2.1911503443495084, + "learning_rate": 2.9936030338200527e-06, + "loss": 0.3469, + "step": 4581 + }, + { + "epoch": 0.6426367461430575, + "grad_norm": 1.6665888847638703, + "learning_rate": 2.991522876735154e-06, + "loss": 0.3441, + "step": 4582 + }, + { + "epoch": 0.6427769985974755, + "grad_norm": 2.2880495714072806, + "learning_rate": 2.989443134085477e-06, + "loss": 0.3461, + "step": 4583 + }, + { + "epoch": 0.6429172510518935, + "grad_norm": 1.7869778944333592, + "learning_rate": 2.9873638063001633e-06, + "loss": 0.3367, + "step": 4584 + }, + { + "epoch": 0.6430575035063114, + "grad_norm": 2.4827918040407484, + "learning_rate": 2.9852848938082657e-06, + "loss": 0.3684, + "step": 4585 + }, + { + "epoch": 0.6431977559607294, + "grad_norm": 1.9377946321979833, + "learning_rate": 2.983206397038756e-06, + "loss": 0.3512, + "step": 4586 + }, + { + "epoch": 0.6433380084151472, + "grad_norm": 2.1198192722731886, + "learning_rate": 2.981128316420515e-06, + "loss": 0.3193, + "step": 4587 + }, + { + "epoch": 0.6434782608695652, + "grad_norm": 1.8526557881567933, + "learning_rate": 2.97905065238234e-06, + "loss": 0.328, + "step": 4588 + }, + { + "epoch": 0.6436185133239831, + "grad_norm": 2.0941407707284037, + "learning_rate": 2.9769734053529443e-06, + "loss": 0.3047, + "step": 4589 + }, + { + "epoch": 0.6437587657784011, + "grad_norm": 1.9440640652700383, + "learning_rate": 2.974896575760952e-06, + "loss": 0.3708, + "step": 4590 + }, + { + "epoch": 0.643899018232819, + "grad_norm": 2.494536820192507, + "learning_rate": 2.972820164034904e-06, + "loss": 0.3809, + "step": 4591 + }, + { + "epoch": 0.644039270687237, + "grad_norm": 1.8840684935638463, + "learning_rate": 2.9707441706032515e-06, + "loss": 0.3424, + "step": 4592 + }, + { + "epoch": 0.6441795231416549, + "grad_norm": 1.9816701281137397, + "learning_rate": 2.968668595894361e-06, + "loss": 0.329, + "step": 4593 + }, + { + "epoch": 0.6443197755960729, + "grad_norm": 1.8141631684147626, + "learning_rate": 2.9665934403365148e-06, + "loss": 0.3041, + "step": 4594 + }, + { + "epoch": 0.6444600280504909, + "grad_norm": 1.7478385247036152, + "learning_rate": 2.964518704357906e-06, + "loss": 0.3424, + "step": 4595 + }, + { + "epoch": 0.6446002805049088, + "grad_norm": 2.006157337556306, + "learning_rate": 2.9624443883866403e-06, + "loss": 0.3469, + "step": 4596 + }, + { + "epoch": 0.6447405329593268, + "grad_norm": 2.3234138697927795, + "learning_rate": 2.9603704928507406e-06, + "loss": 0.3881, + "step": 4597 + }, + { + "epoch": 0.6448807854137447, + "grad_norm": 2.5939210582535885, + "learning_rate": 2.958297018178139e-06, + "loss": 0.3583, + "step": 4598 + }, + { + "epoch": 0.6450210378681627, + "grad_norm": 1.9126333078037303, + "learning_rate": 2.956223964796685e-06, + "loss": 0.3154, + "step": 4599 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 2.36846486491677, + "learning_rate": 2.9541513331341353e-06, + "loss": 0.3476, + "step": 4600 + }, + { + "epoch": 0.6453015427769986, + "grad_norm": 1.992982174818043, + "learning_rate": 2.9520791236181645e-06, + "loss": 0.3524, + "step": 4601 + }, + { + "epoch": 0.6454417952314165, + "grad_norm": 1.7403118488778395, + "learning_rate": 2.9500073366763593e-06, + "loss": 0.3335, + "step": 4602 + }, + { + "epoch": 0.6455820476858345, + "grad_norm": 2.1247540499900732, + "learning_rate": 2.947935972736217e-06, + "loss": 0.3488, + "step": 4603 + }, + { + "epoch": 0.6457223001402524, + "grad_norm": 2.2549819891436615, + "learning_rate": 2.9458650322251505e-06, + "loss": 0.371, + "step": 4604 + }, + { + "epoch": 0.6458625525946704, + "grad_norm": 2.4169141598831962, + "learning_rate": 2.943794515570483e-06, + "loss": 0.3189, + "step": 4605 + }, + { + "epoch": 0.6460028050490884, + "grad_norm": 2.3737820965935676, + "learning_rate": 2.941724423199451e-06, + "loss": 0.3507, + "step": 4606 + }, + { + "epoch": 0.6461430575035063, + "grad_norm": 2.5483753489125798, + "learning_rate": 2.9396547555392054e-06, + "loss": 0.3329, + "step": 4607 + }, + { + "epoch": 0.6462833099579243, + "grad_norm": 2.061467345688524, + "learning_rate": 2.9375855130168046e-06, + "loss": 0.321, + "step": 4608 + }, + { + "epoch": 0.6464235624123422, + "grad_norm": 1.9846043491739127, + "learning_rate": 2.9355166960592242e-06, + "loss": 0.3931, + "step": 4609 + }, + { + "epoch": 0.6465638148667602, + "grad_norm": 1.7651755512023444, + "learning_rate": 2.9334483050933506e-06, + "loss": 0.3799, + "step": 4610 + }, + { + "epoch": 0.6467040673211781, + "grad_norm": 2.0617031523247142, + "learning_rate": 2.9313803405459816e-06, + "loss": 0.3735, + "step": 4611 + }, + { + "epoch": 0.6468443197755961, + "grad_norm": 2.5427692294040187, + "learning_rate": 2.929312802843826e-06, + "loss": 0.3287, + "step": 4612 + }, + { + "epoch": 0.646984572230014, + "grad_norm": 1.8859294243286475, + "learning_rate": 2.927245692413507e-06, + "loss": 0.3637, + "step": 4613 + }, + { + "epoch": 0.647124824684432, + "grad_norm": 2.110236437056754, + "learning_rate": 2.925179009681557e-06, + "loss": 0.3319, + "step": 4614 + }, + { + "epoch": 0.64726507713885, + "grad_norm": 2.1491777843923896, + "learning_rate": 2.923112755074423e-06, + "loss": 0.3268, + "step": 4615 + }, + { + "epoch": 0.6474053295932679, + "grad_norm": 2.699141549581143, + "learning_rate": 2.9210469290184627e-06, + "loss": 0.3449, + "step": 4616 + }, + { + "epoch": 0.6475455820476859, + "grad_norm": 2.2763101054283967, + "learning_rate": 2.9189815319399422e-06, + "loss": 0.3362, + "step": 4617 + }, + { + "epoch": 0.6476858345021038, + "grad_norm": 3.4246626694784466, + "learning_rate": 2.9169165642650467e-06, + "loss": 0.3431, + "step": 4618 + }, + { + "epoch": 0.6478260869565218, + "grad_norm": 2.0543354153395357, + "learning_rate": 2.914852026419862e-06, + "loss": 0.3756, + "step": 4619 + }, + { + "epoch": 0.6479663394109397, + "grad_norm": 1.6626460534668486, + "learning_rate": 2.9127879188303954e-06, + "loss": 0.3101, + "step": 4620 + }, + { + "epoch": 0.6481065918653577, + "grad_norm": 3.074064103033329, + "learning_rate": 2.910724241922558e-06, + "loss": 0.3574, + "step": 4621 + }, + { + "epoch": 0.6482468443197756, + "grad_norm": 1.8684943186913718, + "learning_rate": 2.9086609961221758e-06, + "loss": 0.3604, + "step": 4622 + }, + { + "epoch": 0.6483870967741936, + "grad_norm": 2.2664296792997876, + "learning_rate": 2.906598181854986e-06, + "loss": 0.349, + "step": 4623 + }, + { + "epoch": 0.6485273492286115, + "grad_norm": 2.326094490769146, + "learning_rate": 2.904535799546636e-06, + "loss": 0.3347, + "step": 4624 + }, + { + "epoch": 0.6486676016830295, + "grad_norm": 1.6826762024664168, + "learning_rate": 2.902473849622683e-06, + "loss": 0.3217, + "step": 4625 + }, + { + "epoch": 0.6488078541374475, + "grad_norm": 7.8489513401404505, + "learning_rate": 2.9004123325085976e-06, + "loss": 0.3536, + "step": 4626 + }, + { + "epoch": 0.6489481065918653, + "grad_norm": 1.899992380628868, + "learning_rate": 2.8983512486297582e-06, + "loss": 0.3405, + "step": 4627 + }, + { + "epoch": 0.6490883590462833, + "grad_norm": 1.7764575132082756, + "learning_rate": 2.8962905984114553e-06, + "loss": 0.3673, + "step": 4628 + }, + { + "epoch": 0.6492286115007012, + "grad_norm": 3.2582564017213116, + "learning_rate": 2.8942303822788916e-06, + "loss": 0.3746, + "step": 4629 + }, + { + "epoch": 0.6493688639551192, + "grad_norm": 1.8176649101193338, + "learning_rate": 2.8921706006571744e-06, + "loss": 0.3473, + "step": 4630 + }, + { + "epoch": 0.6495091164095371, + "grad_norm": 2.4060758057002025, + "learning_rate": 2.890111253971327e-06, + "loss": 0.3359, + "step": 4631 + }, + { + "epoch": 0.6496493688639551, + "grad_norm": 3.275301090988934, + "learning_rate": 2.8880523426462824e-06, + "loss": 0.359, + "step": 4632 + }, + { + "epoch": 0.649789621318373, + "grad_norm": 2.3130684231319947, + "learning_rate": 2.885993867106881e-06, + "loss": 0.3601, + "step": 4633 + }, + { + "epoch": 0.649929873772791, + "grad_norm": 2.5167180879236417, + "learning_rate": 2.8839358277778758e-06, + "loss": 0.3677, + "step": 4634 + }, + { + "epoch": 0.6500701262272089, + "grad_norm": 3.0954326330959225, + "learning_rate": 2.8818782250839282e-06, + "loss": 0.3736, + "step": 4635 + }, + { + "epoch": 0.6502103786816269, + "grad_norm": 2.5139943399904277, + "learning_rate": 2.879821059449611e-06, + "loss": 0.3666, + "step": 4636 + }, + { + "epoch": 0.6503506311360449, + "grad_norm": 1.7542489785328284, + "learning_rate": 2.8777643312994046e-06, + "loss": 0.311, + "step": 4637 + }, + { + "epoch": 0.6504908835904628, + "grad_norm": 2.3791102931018484, + "learning_rate": 2.8757080410577042e-06, + "loss": 0.3578, + "step": 4638 + }, + { + "epoch": 0.6506311360448808, + "grad_norm": 2.3984617548905907, + "learning_rate": 2.8736521891488057e-06, + "loss": 0.3409, + "step": 4639 + }, + { + "epoch": 0.6507713884992987, + "grad_norm": 3.3016425639824027, + "learning_rate": 2.8715967759969222e-06, + "loss": 0.3811, + "step": 4640 + }, + { + "epoch": 0.6509116409537167, + "grad_norm": 1.7915204084566343, + "learning_rate": 2.8695418020261755e-06, + "loss": 0.3916, + "step": 4641 + }, + { + "epoch": 0.6510518934081346, + "grad_norm": 2.8164749031323475, + "learning_rate": 2.8674872676605914e-06, + "loss": 0.3928, + "step": 4642 + }, + { + "epoch": 0.6511921458625526, + "grad_norm": 1.8285579331785564, + "learning_rate": 2.8654331733241113e-06, + "loss": 0.315, + "step": 4643 + }, + { + "epoch": 0.6513323983169705, + "grad_norm": 3.862429626218797, + "learning_rate": 2.8633795194405824e-06, + "loss": 0.3694, + "step": 4644 + }, + { + "epoch": 0.6514726507713885, + "grad_norm": 1.9473305770770066, + "learning_rate": 2.8613263064337617e-06, + "loss": 0.3355, + "step": 4645 + }, + { + "epoch": 0.6516129032258065, + "grad_norm": 1.9278602133881784, + "learning_rate": 2.859273534727316e-06, + "loss": 0.3497, + "step": 4646 + }, + { + "epoch": 0.6517531556802244, + "grad_norm": 2.398239475487615, + "learning_rate": 2.8572212047448196e-06, + "loss": 0.3661, + "step": 4647 + }, + { + "epoch": 0.6518934081346424, + "grad_norm": 2.635364868700427, + "learning_rate": 2.8551693169097573e-06, + "loss": 0.291, + "step": 4648 + }, + { + "epoch": 0.6520336605890603, + "grad_norm": 2.59978582527105, + "learning_rate": 2.8531178716455217e-06, + "loss": 0.3385, + "step": 4649 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 1.8331195388749015, + "learning_rate": 2.8510668693754157e-06, + "loss": 0.3514, + "step": 4650 + }, + { + "epoch": 0.6523141654978962, + "grad_norm": 1.8445192445706031, + "learning_rate": 2.8490163105226454e-06, + "loss": 0.3045, + "step": 4651 + }, + { + "epoch": 0.6524544179523142, + "grad_norm": 1.6860395398871482, + "learning_rate": 2.846966195510332e-06, + "loss": 0.3466, + "step": 4652 + }, + { + "epoch": 0.6525946704067321, + "grad_norm": 1.9178165289923703, + "learning_rate": 2.844916524761502e-06, + "loss": 0.3136, + "step": 4653 + }, + { + "epoch": 0.6527349228611501, + "grad_norm": 2.1315857429842233, + "learning_rate": 2.8428672986990894e-06, + "loss": 0.3923, + "step": 4654 + }, + { + "epoch": 0.652875175315568, + "grad_norm": 2.0212318446960578, + "learning_rate": 2.84081851774594e-06, + "loss": 0.3653, + "step": 4655 + }, + { + "epoch": 0.653015427769986, + "grad_norm": 3.0373076640957057, + "learning_rate": 2.8387701823248035e-06, + "loss": 0.3499, + "step": 4656 + }, + { + "epoch": 0.653155680224404, + "grad_norm": 2.061589126588437, + "learning_rate": 2.8367222928583403e-06, + "loss": 0.3396, + "step": 4657 + }, + { + "epoch": 0.6532959326788219, + "grad_norm": 2.2320376889112916, + "learning_rate": 2.834674849769119e-06, + "loss": 0.353, + "step": 4658 + }, + { + "epoch": 0.6534361851332399, + "grad_norm": 2.42074292704061, + "learning_rate": 2.8326278534796154e-06, + "loss": 0.2906, + "step": 4659 + }, + { + "epoch": 0.6535764375876578, + "grad_norm": 2.0843648518511158, + "learning_rate": 2.83058130441221e-06, + "loss": 0.3169, + "step": 4660 + }, + { + "epoch": 0.6537166900420758, + "grad_norm": 1.9394999610800565, + "learning_rate": 2.8285352029891957e-06, + "loss": 0.3225, + "step": 4661 + }, + { + "epoch": 0.6538569424964937, + "grad_norm": 2.0162747540878865, + "learning_rate": 2.826489549632773e-06, + "loss": 0.3499, + "step": 4662 + }, + { + "epoch": 0.6539971949509117, + "grad_norm": 2.064376901827244, + "learning_rate": 2.8244443447650448e-06, + "loss": 0.3106, + "step": 4663 + }, + { + "epoch": 0.6541374474053296, + "grad_norm": 2.305044269426271, + "learning_rate": 2.8223995888080263e-06, + "loss": 0.3198, + "step": 4664 + }, + { + "epoch": 0.6542776998597476, + "grad_norm": 1.5590281251691258, + "learning_rate": 2.8203552821836388e-06, + "loss": 0.297, + "step": 4665 + }, + { + "epoch": 0.6544179523141656, + "grad_norm": 1.9711392754594148, + "learning_rate": 2.81831142531371e-06, + "loss": 0.3417, + "step": 4666 + }, + { + "epoch": 0.6545582047685834, + "grad_norm": 2.334647852872854, + "learning_rate": 2.816268018619977e-06, + "loss": 0.3249, + "step": 4667 + }, + { + "epoch": 0.6546984572230014, + "grad_norm": 2.3746166923800525, + "learning_rate": 2.8142250625240806e-06, + "loss": 0.3302, + "step": 4668 + }, + { + "epoch": 0.6548387096774193, + "grad_norm": 2.3418481867293406, + "learning_rate": 2.8121825574475727e-06, + "loss": 0.3391, + "step": 4669 + }, + { + "epoch": 0.6549789621318373, + "grad_norm": 2.4825000525529286, + "learning_rate": 2.81014050381191e-06, + "loss": 0.3246, + "step": 4670 + }, + { + "epoch": 0.6551192145862552, + "grad_norm": 1.8045615280244405, + "learning_rate": 2.808098902038453e-06, + "loss": 0.302, + "step": 4671 + }, + { + "epoch": 0.6552594670406732, + "grad_norm": 2.0861615121404977, + "learning_rate": 2.8060577525484735e-06, + "loss": 0.3293, + "step": 4672 + }, + { + "epoch": 0.6553997194950911, + "grad_norm": 2.5970633611753158, + "learning_rate": 2.804017055763149e-06, + "loss": 0.3702, + "step": 4673 + }, + { + "epoch": 0.6555399719495091, + "grad_norm": 2.614142625199795, + "learning_rate": 2.8019768121035627e-06, + "loss": 0.4004, + "step": 4674 + }, + { + "epoch": 0.655680224403927, + "grad_norm": 2.2539857952817783, + "learning_rate": 2.799937021990704e-06, + "loss": 0.3261, + "step": 4675 + }, + { + "epoch": 0.655820476858345, + "grad_norm": 2.5801004875190037, + "learning_rate": 2.797897685845471e-06, + "loss": 0.3352, + "step": 4676 + }, + { + "epoch": 0.655960729312763, + "grad_norm": 1.7454343229038092, + "learning_rate": 2.7958588040886647e-06, + "loss": 0.3847, + "step": 4677 + }, + { + "epoch": 0.6561009817671809, + "grad_norm": 2.012056962998581, + "learning_rate": 2.7938203771409945e-06, + "loss": 0.3486, + "step": 4678 + }, + { + "epoch": 0.6562412342215989, + "grad_norm": 1.981845715612273, + "learning_rate": 2.7917824054230787e-06, + "loss": 0.3521, + "step": 4679 + }, + { + "epoch": 0.6563814866760168, + "grad_norm": 1.912802830574912, + "learning_rate": 2.7897448893554335e-06, + "loss": 0.3773, + "step": 4680 + }, + { + "epoch": 0.6565217391304348, + "grad_norm": 1.8860624763399523, + "learning_rate": 2.787707829358488e-06, + "loss": 0.3298, + "step": 4681 + }, + { + "epoch": 0.6566619915848527, + "grad_norm": 2.11202361925963, + "learning_rate": 2.7856712258525755e-06, + "loss": 0.3467, + "step": 4682 + }, + { + "epoch": 0.6568022440392707, + "grad_norm": 2.4087430251316984, + "learning_rate": 2.783635079257937e-06, + "loss": 0.3528, + "step": 4683 + }, + { + "epoch": 0.6569424964936886, + "grad_norm": 1.9470473064755167, + "learning_rate": 2.7815993899947135e-06, + "loss": 0.3713, + "step": 4684 + }, + { + "epoch": 0.6570827489481066, + "grad_norm": 2.473218905054572, + "learning_rate": 2.779564158482957e-06, + "loss": 0.3633, + "step": 4685 + }, + { + "epoch": 0.6572230014025245, + "grad_norm": 2.5197186067693758, + "learning_rate": 2.7775293851426233e-06, + "loss": 0.3765, + "step": 4686 + }, + { + "epoch": 0.6573632538569425, + "grad_norm": 2.4310388272321974, + "learning_rate": 2.7754950703935735e-06, + "loss": 0.324, + "step": 4687 + }, + { + "epoch": 0.6575035063113605, + "grad_norm": 1.8321771949922003, + "learning_rate": 2.7734612146555738e-06, + "loss": 0.3663, + "step": 4688 + }, + { + "epoch": 0.6576437587657784, + "grad_norm": 2.2703847412881637, + "learning_rate": 2.7714278183482967e-06, + "loss": 0.3459, + "step": 4689 + }, + { + "epoch": 0.6577840112201964, + "grad_norm": 1.8240195580153358, + "learning_rate": 2.7693948818913197e-06, + "loss": 0.3, + "step": 4690 + }, + { + "epoch": 0.6579242636746143, + "grad_norm": 2.76302400685283, + "learning_rate": 2.767362405704126e-06, + "loss": 0.3025, + "step": 4691 + }, + { + "epoch": 0.6580645161290323, + "grad_norm": 1.637770212606875, + "learning_rate": 2.7653303902061e-06, + "loss": 0.3043, + "step": 4692 + }, + { + "epoch": 0.6582047685834502, + "grad_norm": 1.752883034938037, + "learning_rate": 2.763298835816535e-06, + "loss": 0.3988, + "step": 4693 + }, + { + "epoch": 0.6583450210378682, + "grad_norm": 2.000782932577204, + "learning_rate": 2.761267742954629e-06, + "loss": 0.3642, + "step": 4694 + }, + { + "epoch": 0.6584852734922861, + "grad_norm": 2.5979672812206878, + "learning_rate": 2.7592371120394825e-06, + "loss": 0.3408, + "step": 4695 + }, + { + "epoch": 0.6586255259467041, + "grad_norm": 2.2892040430254896, + "learning_rate": 2.757206943490103e-06, + "loss": 0.3333, + "step": 4696 + }, + { + "epoch": 0.658765778401122, + "grad_norm": 1.7572594344780899, + "learning_rate": 2.7551772377254018e-06, + "loss": 0.3216, + "step": 4697 + }, + { + "epoch": 0.65890603085554, + "grad_norm": 2.191060030373805, + "learning_rate": 2.7531479951641928e-06, + "loss": 0.3497, + "step": 4698 + }, + { + "epoch": 0.659046283309958, + "grad_norm": 1.9372309221033999, + "learning_rate": 2.751119216225198e-06, + "loss": 0.3376, + "step": 4699 + }, + { + "epoch": 0.6591865357643759, + "grad_norm": 2.025164953036077, + "learning_rate": 2.749090901327043e-06, + "loss": 0.3649, + "step": 4700 + }, + { + "epoch": 0.6593267882187939, + "grad_norm": 2.416887525353876, + "learning_rate": 2.7470630508882525e-06, + "loss": 0.3229, + "step": 4701 + }, + { + "epoch": 0.6594670406732118, + "grad_norm": 2.5935240497757364, + "learning_rate": 2.7450356653272614e-06, + "loss": 0.3489, + "step": 4702 + }, + { + "epoch": 0.6596072931276298, + "grad_norm": 1.8588028997347341, + "learning_rate": 2.7430087450624053e-06, + "loss": 0.344, + "step": 4703 + }, + { + "epoch": 0.6597475455820477, + "grad_norm": 2.1908006018158073, + "learning_rate": 2.740982290511929e-06, + "loss": 0.3292, + "step": 4704 + }, + { + "epoch": 0.6598877980364657, + "grad_norm": 2.266288502319294, + "learning_rate": 2.7389563020939724e-06, + "loss": 0.3525, + "step": 4705 + }, + { + "epoch": 0.6600280504908836, + "grad_norm": 2.1414814370450985, + "learning_rate": 2.7369307802265854e-06, + "loss": 0.3371, + "step": 4706 + }, + { + "epoch": 0.6601683029453015, + "grad_norm": 1.9773840120542923, + "learning_rate": 2.734905725327721e-06, + "loss": 0.3522, + "step": 4707 + }, + { + "epoch": 0.6603085553997194, + "grad_norm": 2.7644149805671248, + "learning_rate": 2.7328811378152355e-06, + "loss": 0.3436, + "step": 4708 + }, + { + "epoch": 0.6604488078541374, + "grad_norm": 1.6660650677516469, + "learning_rate": 2.7308570181068872e-06, + "loss": 0.3189, + "step": 4709 + }, + { + "epoch": 0.6605890603085554, + "grad_norm": 2.2498889471834387, + "learning_rate": 2.72883336662034e-06, + "loss": 0.338, + "step": 4710 + }, + { + "epoch": 0.6607293127629733, + "grad_norm": 2.072071983213755, + "learning_rate": 2.726810183773162e-06, + "loss": 0.315, + "step": 4711 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 1.8845091333934638, + "learning_rate": 2.7247874699828186e-06, + "loss": 0.3219, + "step": 4712 + }, + { + "epoch": 0.6610098176718092, + "grad_norm": 5.763176788504465, + "learning_rate": 2.7227652256666848e-06, + "loss": 0.3359, + "step": 4713 + }, + { + "epoch": 0.6611500701262272, + "grad_norm": 1.8985609477529983, + "learning_rate": 2.7207434512420374e-06, + "loss": 0.3844, + "step": 4714 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 2.5145070689935056, + "learning_rate": 2.718722147126054e-06, + "loss": 0.3305, + "step": 4715 + }, + { + "epoch": 0.6614305750350631, + "grad_norm": 1.9832996849069426, + "learning_rate": 2.7167013137358173e-06, + "loss": 0.325, + "step": 4716 + }, + { + "epoch": 0.661570827489481, + "grad_norm": 2.008781941224616, + "learning_rate": 2.714680951488312e-06, + "loss": 0.3624, + "step": 4717 + }, + { + "epoch": 0.661711079943899, + "grad_norm": 1.9941882091378325, + "learning_rate": 2.7126610608004263e-06, + "loss": 0.34, + "step": 4718 + }, + { + "epoch": 0.661851332398317, + "grad_norm": 2.385054837122267, + "learning_rate": 2.71064164208895e-06, + "loss": 0.3251, + "step": 4719 + }, + { + "epoch": 0.6619915848527349, + "grad_norm": 2.4934164144406368, + "learning_rate": 2.7086226957705773e-06, + "loss": 0.3786, + "step": 4720 + }, + { + "epoch": 0.6621318373071529, + "grad_norm": 2.09854067550914, + "learning_rate": 2.7066042222619017e-06, + "loss": 0.3854, + "step": 4721 + }, + { + "epoch": 0.6622720897615708, + "grad_norm": 1.8302577692895277, + "learning_rate": 2.704586221979422e-06, + "loss": 0.3195, + "step": 4722 + }, + { + "epoch": 0.6624123422159888, + "grad_norm": 3.1317948284252393, + "learning_rate": 2.7025686953395368e-06, + "loss": 0.3644, + "step": 4723 + }, + { + "epoch": 0.6625525946704067, + "grad_norm": 1.7526700223327374, + "learning_rate": 2.7005516427585537e-06, + "loss": 0.3281, + "step": 4724 + }, + { + "epoch": 0.6626928471248247, + "grad_norm": 1.7663058428295533, + "learning_rate": 2.6985350646526713e-06, + "loss": 0.3556, + "step": 4725 + }, + { + "epoch": 0.6628330995792426, + "grad_norm": 2.7559095522761896, + "learning_rate": 2.6965189614379995e-06, + "loss": 0.3546, + "step": 4726 + }, + { + "epoch": 0.6629733520336606, + "grad_norm": 1.6614809742244903, + "learning_rate": 2.6945033335305458e-06, + "loss": 0.3072, + "step": 4727 + }, + { + "epoch": 0.6631136044880785, + "grad_norm": 2.2053696040192516, + "learning_rate": 2.6924881813462225e-06, + "loss": 0.365, + "step": 4728 + }, + { + "epoch": 0.6632538569424965, + "grad_norm": 1.6586938498105346, + "learning_rate": 2.6904735053008405e-06, + "loss": 0.3457, + "step": 4729 + }, + { + "epoch": 0.6633941093969145, + "grad_norm": 2.810974385484437, + "learning_rate": 2.688459305810116e-06, + "loss": 0.3483, + "step": 4730 + }, + { + "epoch": 0.6635343618513324, + "grad_norm": 2.446048711059328, + "learning_rate": 2.6864455832896633e-06, + "loss": 0.3705, + "step": 4731 + }, + { + "epoch": 0.6636746143057504, + "grad_norm": 1.9848479660963572, + "learning_rate": 2.684432338155003e-06, + "loss": 0.3039, + "step": 4732 + }, + { + "epoch": 0.6638148667601683, + "grad_norm": 1.9624996574061393, + "learning_rate": 2.6824195708215504e-06, + "loss": 0.2931, + "step": 4733 + }, + { + "epoch": 0.6639551192145863, + "grad_norm": 2.279642453183277, + "learning_rate": 2.6804072817046266e-06, + "loss": 0.3284, + "step": 4734 + }, + { + "epoch": 0.6640953716690042, + "grad_norm": 2.1523510609137406, + "learning_rate": 2.678395471219455e-06, + "loss": 0.3261, + "step": 4735 + }, + { + "epoch": 0.6642356241234222, + "grad_norm": 3.236647540061164, + "learning_rate": 2.6763841397811576e-06, + "loss": 0.3104, + "step": 4736 + }, + { + "epoch": 0.6643758765778401, + "grad_norm": 2.3175174820291, + "learning_rate": 2.674373287804759e-06, + "loss": 0.3223, + "step": 4737 + }, + { + "epoch": 0.6645161290322581, + "grad_norm": 2.044567059154848, + "learning_rate": 2.6723629157051844e-06, + "loss": 0.3573, + "step": 4738 + }, + { + "epoch": 0.664656381486676, + "grad_norm": 1.8697812249220616, + "learning_rate": 2.6703530238972597e-06, + "loss": 0.3498, + "step": 4739 + }, + { + "epoch": 0.664796633941094, + "grad_norm": 1.8125449207921789, + "learning_rate": 2.6683436127957122e-06, + "loss": 0.3434, + "step": 4740 + }, + { + "epoch": 0.664936886395512, + "grad_norm": 2.433619026065066, + "learning_rate": 2.6663346828151727e-06, + "loss": 0.3327, + "step": 4741 + }, + { + "epoch": 0.6650771388499299, + "grad_norm": 2.9677482119382854, + "learning_rate": 2.664326234370164e-06, + "loss": 0.3943, + "step": 4742 + }, + { + "epoch": 0.6652173913043479, + "grad_norm": 3.028960946577812, + "learning_rate": 2.662318267875119e-06, + "loss": 0.3211, + "step": 4743 + }, + { + "epoch": 0.6653576437587658, + "grad_norm": 1.8940958368249898, + "learning_rate": 2.6603107837443675e-06, + "loss": 0.318, + "step": 4744 + }, + { + "epoch": 0.6654978962131838, + "grad_norm": 1.8530857564492274, + "learning_rate": 2.658303782392141e-06, + "loss": 0.3508, + "step": 4745 + }, + { + "epoch": 0.6656381486676017, + "grad_norm": 1.537619984604868, + "learning_rate": 2.656297264232567e-06, + "loss": 0.3237, + "step": 4746 + }, + { + "epoch": 0.6657784011220196, + "grad_norm": 1.6202378552862653, + "learning_rate": 2.654291229679678e-06, + "loss": 0.3291, + "step": 4747 + }, + { + "epoch": 0.6659186535764375, + "grad_norm": 1.8886748415461125, + "learning_rate": 2.652285679147405e-06, + "loss": 0.357, + "step": 4748 + }, + { + "epoch": 0.6660589060308555, + "grad_norm": 2.0224770877327765, + "learning_rate": 2.65028061304958e-06, + "loss": 0.3423, + "step": 4749 + }, + { + "epoch": 0.6661991584852734, + "grad_norm": 2.8815385851867266, + "learning_rate": 2.6482760317999338e-06, + "loss": 0.3389, + "step": 4750 + }, + { + "epoch": 0.6663394109396914, + "grad_norm": 2.3082438298947254, + "learning_rate": 2.6462719358120983e-06, + "loss": 0.3756, + "step": 4751 + }, + { + "epoch": 0.6664796633941094, + "grad_norm": 2.0251263378904842, + "learning_rate": 2.644268325499606e-06, + "loss": 0.3481, + "step": 4752 + }, + { + "epoch": 0.6666199158485273, + "grad_norm": 2.493588523130223, + "learning_rate": 2.642265201275885e-06, + "loss": 0.3424, + "step": 4753 + }, + { + "epoch": 0.6667601683029453, + "grad_norm": 1.9319181946757449, + "learning_rate": 2.640262563554267e-06, + "loss": 0.3397, + "step": 4754 + }, + { + "epoch": 0.6669004207573632, + "grad_norm": 2.442674670499213, + "learning_rate": 2.6382604127479815e-06, + "loss": 0.3937, + "step": 4755 + }, + { + "epoch": 0.6670406732117812, + "grad_norm": 2.0225662064060237, + "learning_rate": 2.636258749270161e-06, + "loss": 0.3022, + "step": 4756 + }, + { + "epoch": 0.6671809256661991, + "grad_norm": 2.3421753743718603, + "learning_rate": 2.634257573533833e-06, + "loss": 0.3411, + "step": 4757 + }, + { + "epoch": 0.6673211781206171, + "grad_norm": 2.2014546656951817, + "learning_rate": 2.632256885951925e-06, + "loss": 0.3402, + "step": 4758 + }, + { + "epoch": 0.667461430575035, + "grad_norm": 2.219863826939898, + "learning_rate": 2.630256686937267e-06, + "loss": 0.3245, + "step": 4759 + }, + { + "epoch": 0.667601683029453, + "grad_norm": 2.4774018318443582, + "learning_rate": 2.6282569769025857e-06, + "loss": 0.3313, + "step": 4760 + }, + { + "epoch": 0.667741935483871, + "grad_norm": 1.5628846267321943, + "learning_rate": 2.6262577562605086e-06, + "loss": 0.3064, + "step": 4761 + }, + { + "epoch": 0.6678821879382889, + "grad_norm": 4.691355022878102, + "learning_rate": 2.6242590254235566e-06, + "loss": 0.3282, + "step": 4762 + }, + { + "epoch": 0.6680224403927069, + "grad_norm": 1.9757068576284977, + "learning_rate": 2.622260784804157e-06, + "loss": 0.3239, + "step": 4763 + }, + { + "epoch": 0.6681626928471248, + "grad_norm": 55.64012917498822, + "learning_rate": 2.6202630348146323e-06, + "loss": 0.3356, + "step": 4764 + }, + { + "epoch": 0.6683029453015428, + "grad_norm": 2.218992300638231, + "learning_rate": 2.6182657758672046e-06, + "loss": 0.356, + "step": 4765 + }, + { + "epoch": 0.6684431977559607, + "grad_norm": 2.0142887624661494, + "learning_rate": 2.616269008373995e-06, + "loss": 0.3301, + "step": 4766 + }, + { + "epoch": 0.6685834502103787, + "grad_norm": 2.291814302254986, + "learning_rate": 2.6142727327470203e-06, + "loss": 0.3951, + "step": 4767 + }, + { + "epoch": 0.6687237026647966, + "grad_norm": 3.155596523809966, + "learning_rate": 2.612276949398199e-06, + "loss": 0.3386, + "step": 4768 + }, + { + "epoch": 0.6688639551192146, + "grad_norm": 2.0918613854869323, + "learning_rate": 2.610281658739347e-06, + "loss": 0.3455, + "step": 4769 + }, + { + "epoch": 0.6690042075736325, + "grad_norm": 1.7799757029986059, + "learning_rate": 2.6082868611821787e-06, + "loss": 0.3454, + "step": 4770 + }, + { + "epoch": 0.6691444600280505, + "grad_norm": 2.3196245708560768, + "learning_rate": 2.606292557138307e-06, + "loss": 0.3836, + "step": 4771 + }, + { + "epoch": 0.6692847124824685, + "grad_norm": 1.8635379766346316, + "learning_rate": 2.6042987470192425e-06, + "loss": 0.3536, + "step": 4772 + }, + { + "epoch": 0.6694249649368864, + "grad_norm": 2.2895219881993363, + "learning_rate": 2.602305431236396e-06, + "loss": 0.3439, + "step": 4773 + }, + { + "epoch": 0.6695652173913044, + "grad_norm": 1.811038908205031, + "learning_rate": 2.6003126102010696e-06, + "loss": 0.3139, + "step": 4774 + }, + { + "epoch": 0.6697054698457223, + "grad_norm": 2.1723970293547663, + "learning_rate": 2.598320284324471e-06, + "loss": 0.3565, + "step": 4775 + }, + { + "epoch": 0.6698457223001403, + "grad_norm": 1.9545669725858683, + "learning_rate": 2.596328454017702e-06, + "loss": 0.3639, + "step": 4776 + }, + { + "epoch": 0.6699859747545582, + "grad_norm": 1.9651293689181057, + "learning_rate": 2.5943371196917633e-06, + "loss": 0.383, + "step": 4777 + }, + { + "epoch": 0.6701262272089762, + "grad_norm": 2.184369490238603, + "learning_rate": 2.592346281757552e-06, + "loss": 0.3377, + "step": 4778 + }, + { + "epoch": 0.6702664796633941, + "grad_norm": 2.2520091305440917, + "learning_rate": 2.590355940625865e-06, + "loss": 0.356, + "step": 4779 + }, + { + "epoch": 0.6704067321178121, + "grad_norm": 1.6715399861027498, + "learning_rate": 2.5883660967073944e-06, + "loss": 0.3716, + "step": 4780 + }, + { + "epoch": 0.67054698457223, + "grad_norm": 2.390283398126288, + "learning_rate": 2.5863767504127313e-06, + "loss": 0.3208, + "step": 4781 + }, + { + "epoch": 0.670687237026648, + "grad_norm": 2.179289969559659, + "learning_rate": 2.5843879021523636e-06, + "loss": 0.3361, + "step": 4782 + }, + { + "epoch": 0.670827489481066, + "grad_norm": 1.8004717447086647, + "learning_rate": 2.582399552336674e-06, + "loss": 0.3465, + "step": 4783 + }, + { + "epoch": 0.6709677419354839, + "grad_norm": 2.6576084029444367, + "learning_rate": 2.5804117013759466e-06, + "loss": 0.3136, + "step": 4784 + }, + { + "epoch": 0.6711079943899019, + "grad_norm": 2.2337519704066096, + "learning_rate": 2.5784243496803596e-06, + "loss": 0.3612, + "step": 4785 + }, + { + "epoch": 0.6712482468443198, + "grad_norm": 3.4833930988447315, + "learning_rate": 2.5764374976599894e-06, + "loss": 0.3155, + "step": 4786 + }, + { + "epoch": 0.6713884992987377, + "grad_norm": 2.3751951438640884, + "learning_rate": 2.574451145724812e-06, + "loss": 0.3162, + "step": 4787 + }, + { + "epoch": 0.6715287517531556, + "grad_norm": 2.1865011932209284, + "learning_rate": 2.5724652942846916e-06, + "loss": 0.3525, + "step": 4788 + }, + { + "epoch": 0.6716690042075736, + "grad_norm": 1.989145626899199, + "learning_rate": 2.5704799437493976e-06, + "loss": 0.3378, + "step": 4789 + }, + { + "epoch": 0.6718092566619915, + "grad_norm": 2.4700663473508038, + "learning_rate": 2.5684950945285937e-06, + "loss": 0.3413, + "step": 4790 + }, + { + "epoch": 0.6719495091164095, + "grad_norm": 1.7890071943638448, + "learning_rate": 2.5665107470318396e-06, + "loss": 0.2885, + "step": 4791 + }, + { + "epoch": 0.6720897615708274, + "grad_norm": 9.053408778479733, + "learning_rate": 2.5645269016685905e-06, + "loss": 0.3846, + "step": 4792 + }, + { + "epoch": 0.6722300140252454, + "grad_norm": 1.8155621936328632, + "learning_rate": 2.5625435588482017e-06, + "loss": 0.39, + "step": 4793 + }, + { + "epoch": 0.6723702664796634, + "grad_norm": 1.5217388366319835, + "learning_rate": 2.5605607189799177e-06, + "loss": 0.3252, + "step": 4794 + }, + { + "epoch": 0.6725105189340813, + "grad_norm": 2.169840456639448, + "learning_rate": 2.558578382472887e-06, + "loss": 0.3354, + "step": 4795 + }, + { + "epoch": 0.6726507713884993, + "grad_norm": 1.7608257322543013, + "learning_rate": 2.5565965497361494e-06, + "loss": 0.3393, + "step": 4796 + }, + { + "epoch": 0.6727910238429172, + "grad_norm": 1.9172696395228257, + "learning_rate": 2.5546152211786428e-06, + "loss": 0.3055, + "step": 4797 + }, + { + "epoch": 0.6729312762973352, + "grad_norm": 2.3876126327709613, + "learning_rate": 2.5526343972092003e-06, + "loss": 0.3662, + "step": 4798 + }, + { + "epoch": 0.6730715287517531, + "grad_norm": 1.622893802480131, + "learning_rate": 2.550654078236552e-06, + "loss": 0.324, + "step": 4799 + }, + { + "epoch": 0.6732117812061711, + "grad_norm": 2.191883382006201, + "learning_rate": 2.5486742646693217e-06, + "loss": 0.3955, + "step": 4800 + }, + { + "epoch": 0.673352033660589, + "grad_norm": 2.545905180155053, + "learning_rate": 2.5466949569160306e-06, + "loss": 0.3654, + "step": 4801 + }, + { + "epoch": 0.673492286115007, + "grad_norm": 2.8056506360081626, + "learning_rate": 2.5447161553850974e-06, + "loss": 0.3629, + "step": 4802 + }, + { + "epoch": 0.673632538569425, + "grad_norm": 2.1400262593948836, + "learning_rate": 2.5427378604848285e-06, + "loss": 0.3579, + "step": 4803 + }, + { + "epoch": 0.6737727910238429, + "grad_norm": 1.7256024483461303, + "learning_rate": 2.5407600726234356e-06, + "loss": 0.333, + "step": 4804 + }, + { + "epoch": 0.6739130434782609, + "grad_norm": 1.8363989067596194, + "learning_rate": 2.538782792209019e-06, + "loss": 0.3468, + "step": 4805 + }, + { + "epoch": 0.6740532959326788, + "grad_norm": 2.321343292514414, + "learning_rate": 2.5368060196495785e-06, + "loss": 0.3451, + "step": 4806 + }, + { + "epoch": 0.6741935483870968, + "grad_norm": 1.6191084505388524, + "learning_rate": 2.5348297553530064e-06, + "loss": 0.2886, + "step": 4807 + }, + { + "epoch": 0.6743338008415147, + "grad_norm": 1.8480164255469123, + "learning_rate": 2.5328539997270927e-06, + "loss": 0.3027, + "step": 4808 + }, + { + "epoch": 0.6744740532959327, + "grad_norm": 1.9105289150307192, + "learning_rate": 2.5308787531795186e-06, + "loss": 0.3497, + "step": 4809 + }, + { + "epoch": 0.6746143057503506, + "grad_norm": 2.1354458938707426, + "learning_rate": 2.5289040161178623e-06, + "loss": 0.3392, + "step": 4810 + }, + { + "epoch": 0.6747545582047686, + "grad_norm": 1.6922082563344254, + "learning_rate": 2.526929788949598e-06, + "loss": 0.3395, + "step": 4811 + }, + { + "epoch": 0.6748948106591866, + "grad_norm": 2.5258509193579277, + "learning_rate": 2.524956072082093e-06, + "loss": 0.4006, + "step": 4812 + }, + { + "epoch": 0.6750350631136045, + "grad_norm": 2.5034864909952486, + "learning_rate": 2.5229828659226114e-06, + "loss": 0.3507, + "step": 4813 + }, + { + "epoch": 0.6751753155680225, + "grad_norm": 3.8638668715721702, + "learning_rate": 2.521010170878311e-06, + "loss": 0.3521, + "step": 4814 + }, + { + "epoch": 0.6753155680224404, + "grad_norm": 2.4862366425014915, + "learning_rate": 2.5190379873562402e-06, + "loss": 0.3639, + "step": 4815 + }, + { + "epoch": 0.6754558204768584, + "grad_norm": 2.6345516991776132, + "learning_rate": 2.517066315763348e-06, + "loss": 0.3882, + "step": 4816 + }, + { + "epoch": 0.6755960729312763, + "grad_norm": 2.0635241653288183, + "learning_rate": 2.5150951565064737e-06, + "loss": 0.3373, + "step": 4817 + }, + { + "epoch": 0.6757363253856943, + "grad_norm": 2.216093272666554, + "learning_rate": 2.513124509992353e-06, + "loss": 0.3393, + "step": 4818 + }, + { + "epoch": 0.6758765778401122, + "grad_norm": 3.0399091860106435, + "learning_rate": 2.511154376627615e-06, + "loss": 0.356, + "step": 4819 + }, + { + "epoch": 0.6760168302945302, + "grad_norm": 3.8105693994400913, + "learning_rate": 2.5091847568187834e-06, + "loss": 0.3533, + "step": 4820 + }, + { + "epoch": 0.6761570827489481, + "grad_norm": 2.979912742258916, + "learning_rate": 2.5072156509722745e-06, + "loss": 0.3542, + "step": 4821 + }, + { + "epoch": 0.6762973352033661, + "grad_norm": 1.9496235704220481, + "learning_rate": 2.5052470594944e-06, + "loss": 0.3708, + "step": 4822 + }, + { + "epoch": 0.6764375876577841, + "grad_norm": 1.9776677596046714, + "learning_rate": 2.5032789827913672e-06, + "loss": 0.3302, + "step": 4823 + }, + { + "epoch": 0.676577840112202, + "grad_norm": 1.8552918038572037, + "learning_rate": 2.5013114212692713e-06, + "loss": 0.3384, + "step": 4824 + }, + { + "epoch": 0.67671809256662, + "grad_norm": 3.066828724131287, + "learning_rate": 2.499344375334106e-06, + "loss": 0.3313, + "step": 4825 + }, + { + "epoch": 0.6768583450210379, + "grad_norm": 2.0423257373817774, + "learning_rate": 2.4973778453917574e-06, + "loss": 0.3407, + "step": 4826 + }, + { + "epoch": 0.6769985974754558, + "grad_norm": 1.9275461895426618, + "learning_rate": 2.4954118318480063e-06, + "loss": 0.3369, + "step": 4827 + }, + { + "epoch": 0.6771388499298737, + "grad_norm": 1.6136146921272896, + "learning_rate": 2.4934463351085254e-06, + "loss": 0.3356, + "step": 4828 + }, + { + "epoch": 0.6772791023842917, + "grad_norm": 2.023826532265707, + "learning_rate": 2.4914813555788827e-06, + "loss": 0.3857, + "step": 4829 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 2.6130023345586464, + "learning_rate": 2.489516893664535e-06, + "loss": 0.333, + "step": 4830 + }, + { + "epoch": 0.6775596072931276, + "grad_norm": 1.9802990601805766, + "learning_rate": 2.4875529497708356e-06, + "loss": 0.3525, + "step": 4831 + }, + { + "epoch": 0.6776998597475455, + "grad_norm": 1.7728034411030296, + "learning_rate": 2.4855895243030325e-06, + "loss": 0.3657, + "step": 4832 + }, + { + "epoch": 0.6778401122019635, + "grad_norm": 2.1376590647266593, + "learning_rate": 2.483626617666264e-06, + "loss": 0.3367, + "step": 4833 + }, + { + "epoch": 0.6779803646563815, + "grad_norm": 1.9989144240091217, + "learning_rate": 2.4816642302655634e-06, + "loss": 0.3564, + "step": 4834 + }, + { + "epoch": 0.6781206171107994, + "grad_norm": 1.6249887025880574, + "learning_rate": 2.479702362505853e-06, + "loss": 0.3611, + "step": 4835 + }, + { + "epoch": 0.6782608695652174, + "grad_norm": 3.828242459733366, + "learning_rate": 2.4777410147919516e-06, + "loss": 0.3107, + "step": 4836 + }, + { + "epoch": 0.6784011220196353, + "grad_norm": 1.7606868724628917, + "learning_rate": 2.4757801875285705e-06, + "loss": 0.3754, + "step": 4837 + }, + { + "epoch": 0.6785413744740533, + "grad_norm": 1.6891278080484637, + "learning_rate": 2.4738198811203112e-06, + "loss": 0.3604, + "step": 4838 + }, + { + "epoch": 0.6786816269284712, + "grad_norm": 1.6802695042290212, + "learning_rate": 2.471860095971671e-06, + "loss": 0.3568, + "step": 4839 + }, + { + "epoch": 0.6788218793828892, + "grad_norm": 2.112436404202052, + "learning_rate": 2.4699008324870366e-06, + "loss": 0.3469, + "step": 4840 + }, + { + "epoch": 0.6789621318373071, + "grad_norm": 2.258230731599385, + "learning_rate": 2.4679420910706887e-06, + "loss": 0.3676, + "step": 4841 + }, + { + "epoch": 0.6791023842917251, + "grad_norm": 1.7790280588134102, + "learning_rate": 2.4659838721268005e-06, + "loss": 0.2956, + "step": 4842 + }, + { + "epoch": 0.679242636746143, + "grad_norm": 8.145004295315204, + "learning_rate": 2.4640261760594377e-06, + "loss": 0.3464, + "step": 4843 + }, + { + "epoch": 0.679382889200561, + "grad_norm": 4.064096929220273, + "learning_rate": 2.4620690032725536e-06, + "loss": 0.3414, + "step": 4844 + }, + { + "epoch": 0.679523141654979, + "grad_norm": 2.1768670103905094, + "learning_rate": 2.4601123541699996e-06, + "loss": 0.3786, + "step": 4845 + }, + { + "epoch": 0.6796633941093969, + "grad_norm": 2.837693188514416, + "learning_rate": 2.458156229155516e-06, + "loss": 0.3681, + "step": 4846 + }, + { + "epoch": 0.6798036465638149, + "grad_norm": 2.0115092048026675, + "learning_rate": 2.456200628632736e-06, + "loss": 0.3524, + "step": 4847 + }, + { + "epoch": 0.6799438990182328, + "grad_norm": 1.803622505403581, + "learning_rate": 2.454245553005184e-06, + "loss": 0.3421, + "step": 4848 + }, + { + "epoch": 0.6800841514726508, + "grad_norm": 1.9715777479916263, + "learning_rate": 2.452291002676278e-06, + "loss": 0.3438, + "step": 4849 + }, + { + "epoch": 0.6802244039270687, + "grad_norm": 2.4935131792324867, + "learning_rate": 2.450336978049322e-06, + "loss": 0.3397, + "step": 4850 + }, + { + "epoch": 0.6803646563814867, + "grad_norm": 1.737553476292829, + "learning_rate": 2.448383479527517e-06, + "loss": 0.3432, + "step": 4851 + }, + { + "epoch": 0.6805049088359046, + "grad_norm": 1.8921600289402105, + "learning_rate": 2.446430507513954e-06, + "loss": 0.3505, + "step": 4852 + }, + { + "epoch": 0.6806451612903226, + "grad_norm": 2.602933871612999, + "learning_rate": 2.4444780624116147e-06, + "loss": 0.3539, + "step": 4853 + }, + { + "epoch": 0.6807854137447406, + "grad_norm": 1.524525415335669, + "learning_rate": 2.4425261446233738e-06, + "loss": 0.3193, + "step": 4854 + }, + { + "epoch": 0.6809256661991585, + "grad_norm": 1.9725900166789263, + "learning_rate": 2.4405747545519966e-06, + "loss": 0.3038, + "step": 4855 + }, + { + "epoch": 0.6810659186535765, + "grad_norm": 3.4232317181456127, + "learning_rate": 2.4386238926001352e-06, + "loss": 0.369, + "step": 4856 + }, + { + "epoch": 0.6812061711079944, + "grad_norm": 2.1701234849171183, + "learning_rate": 2.436673559170339e-06, + "loss": 0.3556, + "step": 4857 + }, + { + "epoch": 0.6813464235624124, + "grad_norm": 1.9295519576558962, + "learning_rate": 2.4347237546650443e-06, + "loss": 0.3173, + "step": 4858 + }, + { + "epoch": 0.6814866760168303, + "grad_norm": 1.9591650380341858, + "learning_rate": 2.4327744794865803e-06, + "loss": 0.3185, + "step": 4859 + }, + { + "epoch": 0.6816269284712483, + "grad_norm": 1.7390676027070593, + "learning_rate": 2.430825734037167e-06, + "loss": 0.315, + "step": 4860 + }, + { + "epoch": 0.6817671809256662, + "grad_norm": 3.183563344611323, + "learning_rate": 2.4288775187189134e-06, + "loss": 0.3528, + "step": 4861 + }, + { + "epoch": 0.6819074333800842, + "grad_norm": 1.8374877877381035, + "learning_rate": 2.4269298339338205e-06, + "loss": 0.3497, + "step": 4862 + }, + { + "epoch": 0.6820476858345021, + "grad_norm": 1.823141494692052, + "learning_rate": 2.42498268008378e-06, + "loss": 0.335, + "step": 4863 + }, + { + "epoch": 0.6821879382889201, + "grad_norm": 2.5986848329242163, + "learning_rate": 2.4230360575705743e-06, + "loss": 0.3326, + "step": 4864 + }, + { + "epoch": 0.6823281907433381, + "grad_norm": 2.1540750785359655, + "learning_rate": 2.421089966795873e-06, + "loss": 0.3142, + "step": 4865 + }, + { + "epoch": 0.682468443197756, + "grad_norm": 1.8623304840365607, + "learning_rate": 2.4191444081612382e-06, + "loss": 0.3304, + "step": 4866 + }, + { + "epoch": 0.6826086956521739, + "grad_norm": 2.2993695473094347, + "learning_rate": 2.417199382068124e-06, + "loss": 0.3736, + "step": 4867 + }, + { + "epoch": 0.6827489481065918, + "grad_norm": 1.9145809794824167, + "learning_rate": 2.4152548889178722e-06, + "loss": 0.3813, + "step": 4868 + }, + { + "epoch": 0.6828892005610098, + "grad_norm": 1.5904168822831621, + "learning_rate": 2.4133109291117156e-06, + "loss": 0.3336, + "step": 4869 + }, + { + "epoch": 0.6830294530154277, + "grad_norm": 1.565604140091316, + "learning_rate": 2.4113675030507786e-06, + "loss": 0.3482, + "step": 4870 + }, + { + "epoch": 0.6831697054698457, + "grad_norm": 1.4806693150082852, + "learning_rate": 2.40942461113607e-06, + "loss": 0.3241, + "step": 4871 + }, + { + "epoch": 0.6833099579242636, + "grad_norm": 1.7401013393049916, + "learning_rate": 2.4074822537684945e-06, + "loss": 0.3274, + "step": 4872 + }, + { + "epoch": 0.6834502103786816, + "grad_norm": 2.2783320328941996, + "learning_rate": 2.4055404313488424e-06, + "loss": 0.3537, + "step": 4873 + }, + { + "epoch": 0.6835904628330995, + "grad_norm": 2.029054129715597, + "learning_rate": 2.4035991442777963e-06, + "loss": 0.3373, + "step": 4874 + }, + { + "epoch": 0.6837307152875175, + "grad_norm": 2.043027216875199, + "learning_rate": 2.401658392955928e-06, + "loss": 0.3117, + "step": 4875 + }, + { + "epoch": 0.6838709677419355, + "grad_norm": 1.8483148090728783, + "learning_rate": 2.3997181777836955e-06, + "loss": 0.2996, + "step": 4876 + }, + { + "epoch": 0.6840112201963534, + "grad_norm": 1.9384667230543733, + "learning_rate": 2.39777849916145e-06, + "loss": 0.352, + "step": 4877 + }, + { + "epoch": 0.6841514726507714, + "grad_norm": 1.869122639632318, + "learning_rate": 2.395839357489431e-06, + "loss": 0.3871, + "step": 4878 + }, + { + "epoch": 0.6842917251051893, + "grad_norm": 1.8982037058851864, + "learning_rate": 2.3939007531677656e-06, + "loss": 0.3518, + "step": 4879 + }, + { + "epoch": 0.6844319775596073, + "grad_norm": 1.6800525357342693, + "learning_rate": 2.391962686596473e-06, + "loss": 0.3167, + "step": 4880 + }, + { + "epoch": 0.6845722300140252, + "grad_norm": 1.574222536295462, + "learning_rate": 2.390025158175458e-06, + "loss": 0.3311, + "step": 4881 + }, + { + "epoch": 0.6847124824684432, + "grad_norm": 1.8750896432428326, + "learning_rate": 2.3880881683045176e-06, + "loss": 0.3711, + "step": 4882 + }, + { + "epoch": 0.6848527349228611, + "grad_norm": 2.0688174049483172, + "learning_rate": 2.3861517173833347e-06, + "loss": 0.3775, + "step": 4883 + }, + { + "epoch": 0.6849929873772791, + "grad_norm": 2.175492053442408, + "learning_rate": 2.3842158058114855e-06, + "loss": 0.346, + "step": 4884 + }, + { + "epoch": 0.685133239831697, + "grad_norm": 1.4958595510893002, + "learning_rate": 2.3822804339884283e-06, + "loss": 0.3717, + "step": 4885 + }, + { + "epoch": 0.685273492286115, + "grad_norm": 2.0031945226926537, + "learning_rate": 2.3803456023135135e-06, + "loss": 0.3666, + "step": 4886 + }, + { + "epoch": 0.685413744740533, + "grad_norm": 1.765770965214067, + "learning_rate": 2.3784113111859818e-06, + "loss": 0.3755, + "step": 4887 + }, + { + "epoch": 0.6855539971949509, + "grad_norm": 1.7711917589259518, + "learning_rate": 2.37647756100496e-06, + "loss": 0.3461, + "step": 4888 + }, + { + "epoch": 0.6856942496493689, + "grad_norm": 2.6354565121097484, + "learning_rate": 2.3745443521694644e-06, + "loss": 0.3195, + "step": 4889 + }, + { + "epoch": 0.6858345021037868, + "grad_norm": 2.8539677757149495, + "learning_rate": 2.3726116850783987e-06, + "loss": 0.3525, + "step": 4890 + }, + { + "epoch": 0.6859747545582048, + "grad_norm": 1.7553092255917502, + "learning_rate": 2.370679560130557e-06, + "loss": 0.3057, + "step": 4891 + }, + { + "epoch": 0.6861150070126227, + "grad_norm": 2.0170381308995338, + "learning_rate": 2.3687479777246165e-06, + "loss": 0.3685, + "step": 4892 + }, + { + "epoch": 0.6862552594670407, + "grad_norm": 1.9162794081290784, + "learning_rate": 2.366816938259148e-06, + "loss": 0.3561, + "step": 4893 + }, + { + "epoch": 0.6863955119214586, + "grad_norm": 2.0485275237134055, + "learning_rate": 2.364886442132606e-06, + "loss": 0.3203, + "step": 4894 + }, + { + "epoch": 0.6865357643758766, + "grad_norm": 3.18272640840319, + "learning_rate": 2.3629564897433376e-06, + "loss": 0.3668, + "step": 4895 + }, + { + "epoch": 0.6866760168302946, + "grad_norm": 2.0510066607891155, + "learning_rate": 2.361027081489575e-06, + "loss": 0.3449, + "step": 4896 + }, + { + "epoch": 0.6868162692847125, + "grad_norm": 2.3276141329438356, + "learning_rate": 2.3590982177694348e-06, + "loss": 0.332, + "step": 4897 + }, + { + "epoch": 0.6869565217391305, + "grad_norm": 2.7521228399528583, + "learning_rate": 2.357169898980927e-06, + "loss": 0.3469, + "step": 4898 + }, + { + "epoch": 0.6870967741935484, + "grad_norm": 2.2696359843316927, + "learning_rate": 2.3552421255219465e-06, + "loss": 0.2938, + "step": 4899 + }, + { + "epoch": 0.6872370266479664, + "grad_norm": 2.272610438551627, + "learning_rate": 2.3533148977902755e-06, + "loss": 0.2979, + "step": 4900 + }, + { + "epoch": 0.6873772791023843, + "grad_norm": 1.659763639812041, + "learning_rate": 2.3513882161835835e-06, + "loss": 0.3288, + "step": 4901 + }, + { + "epoch": 0.6875175315568023, + "grad_norm": 1.682054351350338, + "learning_rate": 2.349462081099429e-06, + "loss": 0.3532, + "step": 4902 + }, + { + "epoch": 0.6876577840112202, + "grad_norm": 2.0667674680963497, + "learning_rate": 2.3475364929352554e-06, + "loss": 0.3756, + "step": 4903 + }, + { + "epoch": 0.6877980364656382, + "grad_norm": 2.3794944800120286, + "learning_rate": 2.3456114520883956e-06, + "loss": 0.3078, + "step": 4904 + }, + { + "epoch": 0.6879382889200562, + "grad_norm": 2.102733073339568, + "learning_rate": 2.343686958956069e-06, + "loss": 0.372, + "step": 4905 + }, + { + "epoch": 0.6880785413744741, + "grad_norm": 1.8473964774689269, + "learning_rate": 2.3417630139353782e-06, + "loss": 0.3488, + "step": 4906 + }, + { + "epoch": 0.688218793828892, + "grad_norm": 1.811526859413884, + "learning_rate": 2.339839617423318e-06, + "loss": 0.3572, + "step": 4907 + }, + { + "epoch": 0.6883590462833099, + "grad_norm": 1.9006112922292053, + "learning_rate": 2.3379167698167666e-06, + "loss": 0.3284, + "step": 4908 + }, + { + "epoch": 0.6884992987377279, + "grad_norm": 6.836867403269759, + "learning_rate": 2.3359944715124915e-06, + "loss": 0.3425, + "step": 4909 + }, + { + "epoch": 0.6886395511921458, + "grad_norm": 1.930044411459565, + "learning_rate": 2.3340727229071445e-06, + "loss": 0.3266, + "step": 4910 + }, + { + "epoch": 0.6887798036465638, + "grad_norm": 1.8514138978968704, + "learning_rate": 2.3321515243972663e-06, + "loss": 0.3412, + "step": 4911 + }, + { + "epoch": 0.6889200561009817, + "grad_norm": 1.923306064158969, + "learning_rate": 2.330230876379283e-06, + "loss": 0.3572, + "step": 4912 + }, + { + "epoch": 0.6890603085553997, + "grad_norm": 1.7974390222767227, + "learning_rate": 2.3283107792495046e-06, + "loss": 0.3788, + "step": 4913 + }, + { + "epoch": 0.6892005610098176, + "grad_norm": 2.0320310998173725, + "learning_rate": 2.326391233404131e-06, + "loss": 0.3269, + "step": 4914 + }, + { + "epoch": 0.6893408134642356, + "grad_norm": 1.9156506858709157, + "learning_rate": 2.3244722392392467e-06, + "loss": 0.3512, + "step": 4915 + }, + { + "epoch": 0.6894810659186535, + "grad_norm": 6.679958476320143, + "learning_rate": 2.322553797150825e-06, + "loss": 0.3371, + "step": 4916 + }, + { + "epoch": 0.6896213183730715, + "grad_norm": 2.275004472832946, + "learning_rate": 2.3206359075347194e-06, + "loss": 0.3048, + "step": 4917 + }, + { + "epoch": 0.6897615708274895, + "grad_norm": 2.4697752163801874, + "learning_rate": 2.318718570786675e-06, + "loss": 0.3442, + "step": 4918 + }, + { + "epoch": 0.6899018232819074, + "grad_norm": 1.95949498346129, + "learning_rate": 2.3168017873023203e-06, + "loss": 0.3365, + "step": 4919 + }, + { + "epoch": 0.6900420757363254, + "grad_norm": 1.561890368742812, + "learning_rate": 2.3148855574771706e-06, + "loss": 0.3206, + "step": 4920 + }, + { + "epoch": 0.6901823281907433, + "grad_norm": 1.9940354098100093, + "learning_rate": 2.3129698817066267e-06, + "loss": 0.3189, + "step": 4921 + }, + { + "epoch": 0.6903225806451613, + "grad_norm": 2.8386582283562336, + "learning_rate": 2.311054760385974e-06, + "loss": 0.3585, + "step": 4922 + }, + { + "epoch": 0.6904628330995792, + "grad_norm": 1.6717226903826001, + "learning_rate": 2.309140193910385e-06, + "loss": 0.3252, + "step": 4923 + }, + { + "epoch": 0.6906030855539972, + "grad_norm": 1.9455905087529684, + "learning_rate": 2.307226182674918e-06, + "loss": 0.3729, + "step": 4924 + }, + { + "epoch": 0.6907433380084151, + "grad_norm": 2.100293992671468, + "learning_rate": 2.3053127270745163e-06, + "loss": 0.3224, + "step": 4925 + }, + { + "epoch": 0.6908835904628331, + "grad_norm": 2.144882757010462, + "learning_rate": 2.3033998275040047e-06, + "loss": 0.3883, + "step": 4926 + }, + { + "epoch": 0.691023842917251, + "grad_norm": 1.8407416288928191, + "learning_rate": 2.301487484358099e-06, + "loss": 0.3537, + "step": 4927 + }, + { + "epoch": 0.691164095371669, + "grad_norm": 1.8411367614896692, + "learning_rate": 2.2995756980313984e-06, + "loss": 0.3351, + "step": 4928 + }, + { + "epoch": 0.691304347826087, + "grad_norm": 4.384000838091693, + "learning_rate": 2.2976644689183848e-06, + "loss": 0.3624, + "step": 4929 + }, + { + "epoch": 0.6914446002805049, + "grad_norm": 2.053349164224674, + "learning_rate": 2.295753797413428e-06, + "loss": 0.3051, + "step": 4930 + }, + { + "epoch": 0.6915848527349229, + "grad_norm": 1.8416763541310615, + "learning_rate": 2.2938436839107825e-06, + "loss": 0.314, + "step": 4931 + }, + { + "epoch": 0.6917251051893408, + "grad_norm": 2.9839517183360793, + "learning_rate": 2.2919341288045853e-06, + "loss": 0.3345, + "step": 4932 + }, + { + "epoch": 0.6918653576437588, + "grad_norm": 2.4474906604810753, + "learning_rate": 2.2900251324888627e-06, + "loss": 0.3304, + "step": 4933 + }, + { + "epoch": 0.6920056100981767, + "grad_norm": 2.0332256392845682, + "learning_rate": 2.288116695357519e-06, + "loss": 0.3946, + "step": 4934 + }, + { + "epoch": 0.6921458625525947, + "grad_norm": 2.425418550217218, + "learning_rate": 2.2862088178043483e-06, + "loss": 0.3636, + "step": 4935 + }, + { + "epoch": 0.6922861150070126, + "grad_norm": 2.319435133569523, + "learning_rate": 2.2843015002230283e-06, + "loss": 0.3636, + "step": 4936 + }, + { + "epoch": 0.6924263674614306, + "grad_norm": 1.695037397574108, + "learning_rate": 2.282394743007122e-06, + "loss": 0.3128, + "step": 4937 + }, + { + "epoch": 0.6925666199158486, + "grad_norm": 1.674633139674579, + "learning_rate": 2.280488546550072e-06, + "loss": 0.3229, + "step": 4938 + }, + { + "epoch": 0.6927068723702665, + "grad_norm": 2.4116542618473575, + "learning_rate": 2.27858291124521e-06, + "loss": 0.3729, + "step": 4939 + }, + { + "epoch": 0.6928471248246845, + "grad_norm": 1.710757938525467, + "learning_rate": 2.276677837485752e-06, + "loss": 0.3182, + "step": 4940 + }, + { + "epoch": 0.6929873772791024, + "grad_norm": 2.1489788691146137, + "learning_rate": 2.2747733256647946e-06, + "loss": 0.3468, + "step": 4941 + }, + { + "epoch": 0.6931276297335204, + "grad_norm": 1.7799265445597243, + "learning_rate": 2.2728693761753216e-06, + "loss": 0.3844, + "step": 4942 + }, + { + "epoch": 0.6932678821879383, + "grad_norm": 1.915242419224993, + "learning_rate": 2.2709659894102e-06, + "loss": 0.3649, + "step": 4943 + }, + { + "epoch": 0.6934081346423563, + "grad_norm": 1.7433303027129712, + "learning_rate": 2.26906316576218e-06, + "loss": 0.339, + "step": 4944 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 2.1112188122141378, + "learning_rate": 2.2671609056238953e-06, + "loss": 0.3906, + "step": 4945 + }, + { + "epoch": 0.6936886395511922, + "grad_norm": 2.231766785685112, + "learning_rate": 2.265259209387867e-06, + "loss": 0.3461, + "step": 4946 + }, + { + "epoch": 0.69382889200561, + "grad_norm": 1.5217532506725975, + "learning_rate": 2.263358077446492e-06, + "loss": 0.3758, + "step": 4947 + }, + { + "epoch": 0.693969144460028, + "grad_norm": 2.2807142072664544, + "learning_rate": 2.2614575101920585e-06, + "loss": 0.3365, + "step": 4948 + }, + { + "epoch": 0.694109396914446, + "grad_norm": 1.737891884389821, + "learning_rate": 2.2595575080167348e-06, + "loss": 0.3469, + "step": 4949 + }, + { + "epoch": 0.6942496493688639, + "grad_norm": 1.6898442692079068, + "learning_rate": 2.257658071312573e-06, + "loss": 0.3348, + "step": 4950 + }, + { + "epoch": 0.6943899018232819, + "grad_norm": 3.685137234206689, + "learning_rate": 2.2557592004715084e-06, + "loss": 0.3366, + "step": 4951 + }, + { + "epoch": 0.6945301542776998, + "grad_norm": 2.268858923280868, + "learning_rate": 2.25386089588536e-06, + "loss": 0.3405, + "step": 4952 + }, + { + "epoch": 0.6946704067321178, + "grad_norm": 4.717015231747, + "learning_rate": 2.25196315794583e-06, + "loss": 0.3553, + "step": 4953 + }, + { + "epoch": 0.6948106591865357, + "grad_norm": 1.750220625457748, + "learning_rate": 2.250065987044505e-06, + "loss": 0.3487, + "step": 4954 + }, + { + "epoch": 0.6949509116409537, + "grad_norm": 1.773653462574046, + "learning_rate": 2.248169383572849e-06, + "loss": 0.3293, + "step": 4955 + }, + { + "epoch": 0.6950911640953716, + "grad_norm": 1.7211895362356864, + "learning_rate": 2.2462733479222147e-06, + "loss": 0.352, + "step": 4956 + }, + { + "epoch": 0.6952314165497896, + "grad_norm": 1.991094611092209, + "learning_rate": 2.244377880483838e-06, + "loss": 0.3665, + "step": 4957 + }, + { + "epoch": 0.6953716690042075, + "grad_norm": 1.7630594569548512, + "learning_rate": 2.242482981648831e-06, + "loss": 0.3318, + "step": 4958 + }, + { + "epoch": 0.6955119214586255, + "grad_norm": 1.8962335548366729, + "learning_rate": 2.2405886518081967e-06, + "loss": 0.2876, + "step": 4959 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 1.7127470436944867, + "learning_rate": 2.238694891352814e-06, + "loss": 0.384, + "step": 4960 + }, + { + "epoch": 0.6957924263674614, + "grad_norm": 1.8407797730532787, + "learning_rate": 2.236801700673449e-06, + "loss": 0.3004, + "step": 4961 + }, + { + "epoch": 0.6959326788218794, + "grad_norm": 1.6073013575672346, + "learning_rate": 2.2349090801607477e-06, + "loss": 0.307, + "step": 4962 + }, + { + "epoch": 0.6960729312762973, + "grad_norm": 2.234191246746476, + "learning_rate": 2.233017030205239e-06, + "loss": 0.3429, + "step": 4963 + }, + { + "epoch": 0.6962131837307153, + "grad_norm": 2.1695388194890683, + "learning_rate": 2.2311255511973347e-06, + "loss": 0.3327, + "step": 4964 + }, + { + "epoch": 0.6963534361851332, + "grad_norm": 2.319662086443192, + "learning_rate": 2.2292346435273277e-06, + "loss": 0.3384, + "step": 4965 + }, + { + "epoch": 0.6964936886395512, + "grad_norm": 2.435123948344073, + "learning_rate": 2.227344307585396e-06, + "loss": 0.3099, + "step": 4966 + }, + { + "epoch": 0.6966339410939691, + "grad_norm": 1.9782460397145347, + "learning_rate": 2.2254545437615932e-06, + "loss": 0.354, + "step": 4967 + }, + { + "epoch": 0.6967741935483871, + "grad_norm": 1.855506077441144, + "learning_rate": 2.223565352445861e-06, + "loss": 0.3603, + "step": 4968 + }, + { + "epoch": 0.696914446002805, + "grad_norm": 2.5492210582239876, + "learning_rate": 2.2216767340280206e-06, + "loss": 0.335, + "step": 4969 + }, + { + "epoch": 0.697054698457223, + "grad_norm": 4.32902060376448, + "learning_rate": 2.219788688897775e-06, + "loss": 0.3546, + "step": 4970 + }, + { + "epoch": 0.697194950911641, + "grad_norm": 1.9668548181478436, + "learning_rate": 2.2179012174447097e-06, + "loss": 0.3423, + "step": 4971 + }, + { + "epoch": 0.6973352033660589, + "grad_norm": 1.9644566300581427, + "learning_rate": 2.2160143200582906e-06, + "loss": 0.3427, + "step": 4972 + }, + { + "epoch": 0.6974754558204769, + "grad_norm": 2.3470263151717936, + "learning_rate": 2.2141279971278663e-06, + "loss": 0.295, + "step": 4973 + }, + { + "epoch": 0.6976157082748948, + "grad_norm": 2.1396924514456934, + "learning_rate": 2.2122422490426676e-06, + "loss": 0.3017, + "step": 4974 + }, + { + "epoch": 0.6977559607293128, + "grad_norm": 2.338386502176925, + "learning_rate": 2.2103570761918023e-06, + "loss": 0.3375, + "step": 4975 + }, + { + "epoch": 0.6978962131837307, + "grad_norm": 1.5508060020950503, + "learning_rate": 2.208472478964265e-06, + "loss": 0.3048, + "step": 4976 + }, + { + "epoch": 0.6980364656381487, + "grad_norm": 1.6475311969819226, + "learning_rate": 2.2065884577489276e-06, + "loss": 0.3641, + "step": 4977 + }, + { + "epoch": 0.6981767180925667, + "grad_norm": 1.866884917318121, + "learning_rate": 2.2047050129345478e-06, + "loss": 0.3832, + "step": 4978 + }, + { + "epoch": 0.6983169705469846, + "grad_norm": 2.3844679788082015, + "learning_rate": 2.202822144909757e-06, + "loss": 0.3286, + "step": 4979 + }, + { + "epoch": 0.6984572230014026, + "grad_norm": 1.8905640269133994, + "learning_rate": 2.2009398540630742e-06, + "loss": 0.3831, + "step": 4980 + }, + { + "epoch": 0.6985974754558205, + "grad_norm": 2.1308290530109444, + "learning_rate": 2.199058140782897e-06, + "loss": 0.3591, + "step": 4981 + }, + { + "epoch": 0.6987377279102385, + "grad_norm": 1.8859487032300772, + "learning_rate": 2.197177005457503e-06, + "loss": 0.3507, + "step": 4982 + }, + { + "epoch": 0.6988779803646564, + "grad_norm": 2.2174662785078785, + "learning_rate": 2.1952964484750527e-06, + "loss": 0.3817, + "step": 4983 + }, + { + "epoch": 0.6990182328190744, + "grad_norm": 1.8149222269550918, + "learning_rate": 2.1934164702235844e-06, + "loss": 0.3741, + "step": 4984 + }, + { + "epoch": 0.6991584852734923, + "grad_norm": 1.7668686934184172, + "learning_rate": 2.1915370710910188e-06, + "loss": 0.3567, + "step": 4985 + }, + { + "epoch": 0.6992987377279103, + "grad_norm": 1.94357320097987, + "learning_rate": 2.1896582514651577e-06, + "loss": 0.3166, + "step": 4986 + }, + { + "epoch": 0.6994389901823281, + "grad_norm": 2.4325619097493045, + "learning_rate": 2.1877800117336835e-06, + "loss": 0.3491, + "step": 4987 + }, + { + "epoch": 0.6995792426367461, + "grad_norm": 3.0002342768427908, + "learning_rate": 2.1859023522841543e-06, + "loss": 0.3291, + "step": 4988 + }, + { + "epoch": 0.699719495091164, + "grad_norm": 2.2086329226938166, + "learning_rate": 2.184025273504014e-06, + "loss": 0.3632, + "step": 4989 + }, + { + "epoch": 0.699859747545582, + "grad_norm": 1.8811335378698089, + "learning_rate": 2.1821487757805843e-06, + "loss": 0.3229, + "step": 4990 + }, + { + "epoch": 0.7, + "grad_norm": 2.146365744702934, + "learning_rate": 2.180272859501068e-06, + "loss": 0.3457, + "step": 4991 + }, + { + "epoch": 0.7001402524544179, + "grad_norm": 2.0006237249540146, + "learning_rate": 2.178397525052546e-06, + "loss": 0.3432, + "step": 4992 + }, + { + "epoch": 0.7002805049088359, + "grad_norm": 1.9857248836246286, + "learning_rate": 2.176522772821983e-06, + "loss": 0.3176, + "step": 4993 + }, + { + "epoch": 0.7004207573632538, + "grad_norm": 1.8293669304964608, + "learning_rate": 2.1746486031962183e-06, + "loss": 0.3725, + "step": 4994 + }, + { + "epoch": 0.7005610098176718, + "grad_norm": 1.9802026212457033, + "learning_rate": 2.172775016561977e-06, + "loss": 0.349, + "step": 4995 + }, + { + "epoch": 0.7007012622720897, + "grad_norm": 1.8544252051078014, + "learning_rate": 2.1709020133058566e-06, + "loss": 0.3084, + "step": 4996 + }, + { + "epoch": 0.7008415147265077, + "grad_norm": 1.720086566387843, + "learning_rate": 2.16902959381434e-06, + "loss": 0.3581, + "step": 4997 + }, + { + "epoch": 0.7009817671809256, + "grad_norm": 4.118131757306003, + "learning_rate": 2.16715775847379e-06, + "loss": 0.3129, + "step": 4998 + }, + { + "epoch": 0.7011220196353436, + "grad_norm": 3.0040660959482373, + "learning_rate": 2.1652865076704432e-06, + "loss": 0.3419, + "step": 4999 + }, + { + "epoch": 0.7012622720897616, + "grad_norm": 2.307826707056283, + "learning_rate": 2.16341584179042e-06, + "loss": 0.3435, + "step": 5000 + }, + { + "epoch": 0.7014025245441795, + "grad_norm": 1.8347030060453506, + "learning_rate": 2.1615457612197206e-06, + "loss": 0.3742, + "step": 5001 + }, + { + "epoch": 0.7015427769985975, + "grad_norm": 1.8641945062342542, + "learning_rate": 2.159676266344222e-06, + "loss": 0.361, + "step": 5002 + }, + { + "epoch": 0.7016830294530154, + "grad_norm": 1.6426599869984482, + "learning_rate": 2.1578073575496814e-06, + "loss": 0.3136, + "step": 5003 + }, + { + "epoch": 0.7018232819074334, + "grad_norm": 1.7516965816470764, + "learning_rate": 2.1559390352217357e-06, + "loss": 0.3369, + "step": 5004 + }, + { + "epoch": 0.7019635343618513, + "grad_norm": 1.9462112740428954, + "learning_rate": 2.1540712997459e-06, + "loss": 0.3154, + "step": 5005 + }, + { + "epoch": 0.7021037868162693, + "grad_norm": 3.2220678913675966, + "learning_rate": 2.1522041515075686e-06, + "loss": 0.3223, + "step": 5006 + }, + { + "epoch": 0.7022440392706872, + "grad_norm": 1.9223240273527498, + "learning_rate": 2.150337590892016e-06, + "loss": 0.3785, + "step": 5007 + }, + { + "epoch": 0.7023842917251052, + "grad_norm": 1.86673581074765, + "learning_rate": 2.14847161828439e-06, + "loss": 0.3279, + "step": 5008 + }, + { + "epoch": 0.7025245441795231, + "grad_norm": 2.3034515687634225, + "learning_rate": 2.1466062340697234e-06, + "loss": 0.3406, + "step": 5009 + }, + { + "epoch": 0.7026647966339411, + "grad_norm": 3.33387553255807, + "learning_rate": 2.144741438632925e-06, + "loss": 0.325, + "step": 5010 + }, + { + "epoch": 0.7028050490883591, + "grad_norm": 2.2296706607345795, + "learning_rate": 2.1428772323587827e-06, + "loss": 0.3782, + "step": 5011 + }, + { + "epoch": 0.702945301542777, + "grad_norm": 1.8868797344136996, + "learning_rate": 2.141013615631962e-06, + "loss": 0.4091, + "step": 5012 + }, + { + "epoch": 0.703085553997195, + "grad_norm": 1.5316438752236938, + "learning_rate": 2.1391505888370067e-06, + "loss": 0.3135, + "step": 5013 + }, + { + "epoch": 0.7032258064516129, + "grad_norm": 1.6077696561897485, + "learning_rate": 2.13728815235834e-06, + "loss": 0.3243, + "step": 5014 + }, + { + "epoch": 0.7033660589060309, + "grad_norm": 3.6524656877205, + "learning_rate": 2.1354263065802627e-06, + "loss": 0.3282, + "step": 5015 + }, + { + "epoch": 0.7035063113604488, + "grad_norm": 1.8402118679055728, + "learning_rate": 2.1335650518869555e-06, + "loss": 0.3384, + "step": 5016 + }, + { + "epoch": 0.7036465638148668, + "grad_norm": 1.870062047877553, + "learning_rate": 2.1317043886624718e-06, + "loss": 0.3474, + "step": 5017 + }, + { + "epoch": 0.7037868162692847, + "grad_norm": 1.588845054129988, + "learning_rate": 2.1298443172907475e-06, + "loss": 0.335, + "step": 5018 + }, + { + "epoch": 0.7039270687237027, + "grad_norm": 3.3327880572331683, + "learning_rate": 2.127984838155598e-06, + "loss": 0.3245, + "step": 5019 + }, + { + "epoch": 0.7040673211781207, + "grad_norm": 1.7506758883297056, + "learning_rate": 2.1261259516407098e-06, + "loss": 0.33, + "step": 5020 + }, + { + "epoch": 0.7042075736325386, + "grad_norm": 1.7420627566441307, + "learning_rate": 2.1242676581296527e-06, + "loss": 0.3738, + "step": 5021 + }, + { + "epoch": 0.7043478260869566, + "grad_norm": 2.1044719084362273, + "learning_rate": 2.1224099580058734e-06, + "loss": 0.3428, + "step": 5022 + }, + { + "epoch": 0.7044880785413745, + "grad_norm": 1.5899683648087246, + "learning_rate": 2.120552851652694e-06, + "loss": 0.3215, + "step": 5023 + }, + { + "epoch": 0.7046283309957925, + "grad_norm": 2.245380533842817, + "learning_rate": 2.1186963394533165e-06, + "loss": 0.361, + "step": 5024 + }, + { + "epoch": 0.7047685834502104, + "grad_norm": 2.143175262725051, + "learning_rate": 2.1168404217908194e-06, + "loss": 0.3255, + "step": 5025 + }, + { + "epoch": 0.7049088359046284, + "grad_norm": 1.9523550728556962, + "learning_rate": 2.114985099048158e-06, + "loss": 0.3606, + "step": 5026 + }, + { + "epoch": 0.7050490883590463, + "grad_norm": 1.81453183170757, + "learning_rate": 2.113130371608165e-06, + "loss": 0.3773, + "step": 5027 + }, + { + "epoch": 0.7051893408134642, + "grad_norm": 1.5598531811656515, + "learning_rate": 2.111276239853552e-06, + "loss": 0.3217, + "step": 5028 + }, + { + "epoch": 0.7053295932678821, + "grad_norm": 1.768485461290956, + "learning_rate": 2.109422704166903e-06, + "loss": 0.3775, + "step": 5029 + }, + { + "epoch": 0.7054698457223001, + "grad_norm": 2.2490331903826744, + "learning_rate": 2.1075697649306838e-06, + "loss": 0.3283, + "step": 5030 + }, + { + "epoch": 0.705610098176718, + "grad_norm": 2.2071010342258375, + "learning_rate": 2.105717422527235e-06, + "loss": 0.2999, + "step": 5031 + }, + { + "epoch": 0.705750350631136, + "grad_norm": 1.97347528054505, + "learning_rate": 2.103865677338776e-06, + "loss": 0.351, + "step": 5032 + }, + { + "epoch": 0.705890603085554, + "grad_norm": 1.8673725754698942, + "learning_rate": 2.1020145297474003e-06, + "loss": 0.3367, + "step": 5033 + }, + { + "epoch": 0.7060308555399719, + "grad_norm": 2.1983279766897668, + "learning_rate": 2.1001639801350793e-06, + "loss": 0.3309, + "step": 5034 + }, + { + "epoch": 0.7061711079943899, + "grad_norm": 3.0904254968510263, + "learning_rate": 2.0983140288836607e-06, + "loss": 0.3457, + "step": 5035 + }, + { + "epoch": 0.7063113604488078, + "grad_norm": 2.3794811537529927, + "learning_rate": 2.0964646763748696e-06, + "loss": 0.37, + "step": 5036 + }, + { + "epoch": 0.7064516129032258, + "grad_norm": 2.3211278028888778, + "learning_rate": 2.094615922990309e-06, + "loss": 0.3297, + "step": 5037 + }, + { + "epoch": 0.7065918653576437, + "grad_norm": 2.467789958238873, + "learning_rate": 2.092767769111452e-06, + "loss": 0.3797, + "step": 5038 + }, + { + "epoch": 0.7067321178120617, + "grad_norm": 1.6758116961650011, + "learning_rate": 2.090920215119657e-06, + "loss": 0.3299, + "step": 5039 + }, + { + "epoch": 0.7068723702664796, + "grad_norm": 1.8454785407138252, + "learning_rate": 2.089073261396148e-06, + "loss": 0.3473, + "step": 5040 + }, + { + "epoch": 0.7070126227208976, + "grad_norm": 2.0545678629692774, + "learning_rate": 2.0872269083220346e-06, + "loss": 0.3501, + "step": 5041 + }, + { + "epoch": 0.7071528751753156, + "grad_norm": 2.129468483133855, + "learning_rate": 2.085381156278299e-06, + "loss": 0.3555, + "step": 5042 + }, + { + "epoch": 0.7072931276297335, + "grad_norm": 1.6214863864681273, + "learning_rate": 2.0835360056457983e-06, + "loss": 0.337, + "step": 5043 + }, + { + "epoch": 0.7074333800841515, + "grad_norm": 2.07729041758089, + "learning_rate": 2.0816914568052664e-06, + "loss": 0.4135, + "step": 5044 + }, + { + "epoch": 0.7075736325385694, + "grad_norm": 1.9305697134887765, + "learning_rate": 2.079847510137314e-06, + "loss": 0.3554, + "step": 5045 + }, + { + "epoch": 0.7077138849929874, + "grad_norm": 1.7000211377793173, + "learning_rate": 2.078004166022426e-06, + "loss": 0.3478, + "step": 5046 + }, + { + "epoch": 0.7078541374474053, + "grad_norm": 2.0439659587839825, + "learning_rate": 2.0761614248409635e-06, + "loss": 0.3086, + "step": 5047 + }, + { + "epoch": 0.7079943899018233, + "grad_norm": 2.127269318532695, + "learning_rate": 2.0743192869731655e-06, + "loss": 0.3327, + "step": 5048 + }, + { + "epoch": 0.7081346423562412, + "grad_norm": 2.3993226446514457, + "learning_rate": 2.07247775279914e-06, + "loss": 0.3253, + "step": 5049 + }, + { + "epoch": 0.7082748948106592, + "grad_norm": 1.73579252583626, + "learning_rate": 2.0706368226988772e-06, + "loss": 0.2972, + "step": 5050 + }, + { + "epoch": 0.7084151472650771, + "grad_norm": 1.946504220204907, + "learning_rate": 2.0687964970522394e-06, + "loss": 0.3529, + "step": 5051 + }, + { + "epoch": 0.7085553997194951, + "grad_norm": 2.1665721897165517, + "learning_rate": 2.066956776238966e-06, + "loss": 0.3016, + "step": 5052 + }, + { + "epoch": 0.7086956521739131, + "grad_norm": 1.818096531146455, + "learning_rate": 2.0651176606386697e-06, + "loss": 0.3423, + "step": 5053 + }, + { + "epoch": 0.708835904628331, + "grad_norm": 1.7765876251564428, + "learning_rate": 2.06327915063084e-06, + "loss": 0.3325, + "step": 5054 + }, + { + "epoch": 0.708976157082749, + "grad_norm": 1.6127443320701487, + "learning_rate": 2.0614412465948392e-06, + "loss": 0.3415, + "step": 5055 + }, + { + "epoch": 0.7091164095371669, + "grad_norm": 1.841172760834422, + "learning_rate": 2.0596039489099066e-06, + "loss": 0.3227, + "step": 5056 + }, + { + "epoch": 0.7092566619915849, + "grad_norm": 2.0288886796956525, + "learning_rate": 2.057767257955157e-06, + "loss": 0.3243, + "step": 5057 + }, + { + "epoch": 0.7093969144460028, + "grad_norm": 2.589526955401833, + "learning_rate": 2.055931174109579e-06, + "loss": 0.2918, + "step": 5058 + }, + { + "epoch": 0.7095371669004208, + "grad_norm": 2.176714219435069, + "learning_rate": 2.054095697752032e-06, + "loss": 0.3591, + "step": 5059 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 4.561776926930897, + "learning_rate": 2.0522608292612583e-06, + "loss": 0.3153, + "step": 5060 + }, + { + "epoch": 0.7098176718092567, + "grad_norm": 1.909942427761125, + "learning_rate": 2.050426569015866e-06, + "loss": 0.3624, + "step": 5061 + }, + { + "epoch": 0.7099579242636747, + "grad_norm": 1.9319457149737589, + "learning_rate": 2.0485929173943436e-06, + "loss": 0.3198, + "step": 5062 + }, + { + "epoch": 0.7100981767180926, + "grad_norm": 1.8323259909579115, + "learning_rate": 2.0467598747750533e-06, + "loss": 0.3358, + "step": 5063 + }, + { + "epoch": 0.7102384291725106, + "grad_norm": 1.9768674713194019, + "learning_rate": 2.044927441536229e-06, + "loss": 0.3771, + "step": 5064 + }, + { + "epoch": 0.7103786816269285, + "grad_norm": 3.681989917324834, + "learning_rate": 2.043095618055982e-06, + "loss": 0.3515, + "step": 5065 + }, + { + "epoch": 0.7105189340813465, + "grad_norm": 2.0974489248102084, + "learning_rate": 2.0412644047122953e-06, + "loss": 0.3675, + "step": 5066 + }, + { + "epoch": 0.7106591865357644, + "grad_norm": 1.506785937683091, + "learning_rate": 2.039433801883027e-06, + "loss": 0.3187, + "step": 5067 + }, + { + "epoch": 0.7107994389901823, + "grad_norm": 1.8397792148813068, + "learning_rate": 2.0376038099459104e-06, + "loss": 0.3792, + "step": 5068 + }, + { + "epoch": 0.7109396914446002, + "grad_norm": 2.7023361025081964, + "learning_rate": 2.035774429278552e-06, + "loss": 0.3168, + "step": 5069 + }, + { + "epoch": 0.7110799438990182, + "grad_norm": 2.156760666679022, + "learning_rate": 2.033945660258429e-06, + "loss": 0.3038, + "step": 5070 + }, + { + "epoch": 0.7112201963534361, + "grad_norm": 2.145730050079679, + "learning_rate": 2.032117503262896e-06, + "loss": 0.3655, + "step": 5071 + }, + { + "epoch": 0.7113604488078541, + "grad_norm": 2.192895610963617, + "learning_rate": 2.030289958669181e-06, + "loss": 0.3168, + "step": 5072 + }, + { + "epoch": 0.711500701262272, + "grad_norm": 2.448345898811741, + "learning_rate": 2.0284630268543853e-06, + "loss": 0.3575, + "step": 5073 + }, + { + "epoch": 0.71164095371669, + "grad_norm": 2.8186177157130663, + "learning_rate": 2.026636708195483e-06, + "loss": 0.3079, + "step": 5074 + }, + { + "epoch": 0.711781206171108, + "grad_norm": 1.4530044013401062, + "learning_rate": 2.0248110030693223e-06, + "loss": 0.3223, + "step": 5075 + }, + { + "epoch": 0.7119214586255259, + "grad_norm": 1.8277664485040477, + "learning_rate": 2.0229859118526244e-06, + "loss": 0.3405, + "step": 5076 + }, + { + "epoch": 0.7120617110799439, + "grad_norm": 2.3654508861731753, + "learning_rate": 2.0211614349219855e-06, + "loss": 0.3208, + "step": 5077 + }, + { + "epoch": 0.7122019635343618, + "grad_norm": 2.0013950965791545, + "learning_rate": 2.0193375726538737e-06, + "loss": 0.3498, + "step": 5078 + }, + { + "epoch": 0.7123422159887798, + "grad_norm": 1.857968800404947, + "learning_rate": 2.0175143254246277e-06, + "loss": 0.3568, + "step": 5079 + }, + { + "epoch": 0.7124824684431977, + "grad_norm": 3.6731879301653625, + "learning_rate": 2.0156916936104654e-06, + "loss": 0.3397, + "step": 5080 + }, + { + "epoch": 0.7126227208976157, + "grad_norm": 1.8642040982308885, + "learning_rate": 2.01386967758747e-06, + "loss": 0.3229, + "step": 5081 + }, + { + "epoch": 0.7127629733520336, + "grad_norm": 2.046725435106983, + "learning_rate": 2.012048277731604e-06, + "loss": 0.3416, + "step": 5082 + }, + { + "epoch": 0.7129032258064516, + "grad_norm": 1.8774035624843246, + "learning_rate": 2.0102274944187005e-06, + "loss": 0.3433, + "step": 5083 + }, + { + "epoch": 0.7130434782608696, + "grad_norm": 2.9286754436828866, + "learning_rate": 2.008407328024465e-06, + "loss": 0.3543, + "step": 5084 + }, + { + "epoch": 0.7131837307152875, + "grad_norm": 2.4573121509180638, + "learning_rate": 2.0065877789244762e-06, + "loss": 0.3134, + "step": 5085 + }, + { + "epoch": 0.7133239831697055, + "grad_norm": 1.9388622294247575, + "learning_rate": 2.004768847494186e-06, + "loss": 0.3316, + "step": 5086 + }, + { + "epoch": 0.7134642356241234, + "grad_norm": 2.3780639359884086, + "learning_rate": 2.0029505341089183e-06, + "loss": 0.3465, + "step": 5087 + }, + { + "epoch": 0.7136044880785414, + "grad_norm": 1.7964356258419587, + "learning_rate": 2.0011328391438685e-06, + "loss": 0.2894, + "step": 5088 + }, + { + "epoch": 0.7137447405329593, + "grad_norm": 1.973605511083435, + "learning_rate": 1.999315762974107e-06, + "loss": 0.354, + "step": 5089 + }, + { + "epoch": 0.7138849929873773, + "grad_norm": 3.554051738519123, + "learning_rate": 1.997499305974572e-06, + "loss": 0.323, + "step": 5090 + }, + { + "epoch": 0.7140252454417952, + "grad_norm": 2.675031704581072, + "learning_rate": 1.9956834685200778e-06, + "loss": 0.3995, + "step": 5091 + }, + { + "epoch": 0.7141654978962132, + "grad_norm": 1.8131502234812635, + "learning_rate": 1.9938682509853097e-06, + "loss": 0.393, + "step": 5092 + }, + { + "epoch": 0.7143057503506312, + "grad_norm": 1.989104874152798, + "learning_rate": 1.992053653744826e-06, + "loss": 0.3895, + "step": 5093 + }, + { + "epoch": 0.7144460028050491, + "grad_norm": 1.628787980943264, + "learning_rate": 1.990239677173056e-06, + "loss": 0.3497, + "step": 5094 + }, + { + "epoch": 0.7145862552594671, + "grad_norm": 2.4860239374753004, + "learning_rate": 1.9884263216443002e-06, + "loss": 0.2967, + "step": 5095 + }, + { + "epoch": 0.714726507713885, + "grad_norm": 1.7492160403531798, + "learning_rate": 1.9866135875327325e-06, + "loss": 0.379, + "step": 5096 + }, + { + "epoch": 0.714866760168303, + "grad_norm": 3.971354673625309, + "learning_rate": 1.9848014752123977e-06, + "loss": 0.3841, + "step": 5097 + }, + { + "epoch": 0.7150070126227209, + "grad_norm": 2.167217172475929, + "learning_rate": 1.982989985057213e-06, + "loss": 0.3422, + "step": 5098 + }, + { + "epoch": 0.7151472650771389, + "grad_norm": 3.3701777445841534, + "learning_rate": 1.9811791174409676e-06, + "loss": 0.3379, + "step": 5099 + }, + { + "epoch": 0.7152875175315568, + "grad_norm": 1.9345081363643923, + "learning_rate": 1.979368872737319e-06, + "loss": 0.3364, + "step": 5100 + }, + { + "epoch": 0.7154277699859748, + "grad_norm": 2.008132493461925, + "learning_rate": 1.9775592513198015e-06, + "loss": 0.334, + "step": 5101 + }, + { + "epoch": 0.7155680224403927, + "grad_norm": 1.938069905786888, + "learning_rate": 1.9757502535618137e-06, + "loss": 0.3427, + "step": 5102 + }, + { + "epoch": 0.7157082748948107, + "grad_norm": 1.7472972434628686, + "learning_rate": 1.973941879836633e-06, + "loss": 0.2977, + "step": 5103 + }, + { + "epoch": 0.7158485273492287, + "grad_norm": 1.900339123928792, + "learning_rate": 1.9721341305174025e-06, + "loss": 0.3047, + "step": 5104 + }, + { + "epoch": 0.7159887798036466, + "grad_norm": 2.1029331885013116, + "learning_rate": 1.9703270059771406e-06, + "loss": 0.3781, + "step": 5105 + }, + { + "epoch": 0.7161290322580646, + "grad_norm": 2.111016433109741, + "learning_rate": 1.9685205065887336e-06, + "loss": 0.356, + "step": 5106 + }, + { + "epoch": 0.7162692847124825, + "grad_norm": 2.310444201576799, + "learning_rate": 1.966714632724941e-06, + "loss": 0.385, + "step": 5107 + }, + { + "epoch": 0.7164095371669004, + "grad_norm": 1.799053298160675, + "learning_rate": 1.964909384758391e-06, + "loss": 0.3372, + "step": 5108 + }, + { + "epoch": 0.7165497896213183, + "grad_norm": 2.0032243962852885, + "learning_rate": 1.963104763061585e-06, + "loss": 0.3212, + "step": 5109 + }, + { + "epoch": 0.7166900420757363, + "grad_norm": 2.4295163600297784, + "learning_rate": 1.9613007680068957e-06, + "loss": 0.3218, + "step": 5110 + }, + { + "epoch": 0.7168302945301542, + "grad_norm": 2.6042331127590748, + "learning_rate": 1.959497399966561e-06, + "loss": 0.329, + "step": 5111 + }, + { + "epoch": 0.7169705469845722, + "grad_norm": 1.9213926749034111, + "learning_rate": 1.957694659312695e-06, + "loss": 0.3484, + "step": 5112 + }, + { + "epoch": 0.7171107994389901, + "grad_norm": 2.014328413900129, + "learning_rate": 1.955892546417281e-06, + "loss": 0.3194, + "step": 5113 + }, + { + "epoch": 0.7172510518934081, + "grad_norm": 1.9335946747748736, + "learning_rate": 1.954091061652172e-06, + "loss": 0.3013, + "step": 5114 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 2.0745085030742567, + "learning_rate": 1.9522902053890925e-06, + "loss": 0.3635, + "step": 5115 + }, + { + "epoch": 0.717531556802244, + "grad_norm": 2.1666974163318717, + "learning_rate": 1.9504899779996354e-06, + "loss": 0.3312, + "step": 5116 + }, + { + "epoch": 0.717671809256662, + "grad_norm": 1.9068403736872883, + "learning_rate": 1.9486903798552665e-06, + "loss": 0.3671, + "step": 5117 + }, + { + "epoch": 0.7178120617110799, + "grad_norm": 1.5146708680489749, + "learning_rate": 1.946891411327319e-06, + "loss": 0.2997, + "step": 5118 + }, + { + "epoch": 0.7179523141654979, + "grad_norm": 1.7557612000240834, + "learning_rate": 1.9450930727870004e-06, + "loss": 0.3106, + "step": 5119 + }, + { + "epoch": 0.7180925666199158, + "grad_norm": 2.1320388005431523, + "learning_rate": 1.943295364605381e-06, + "loss": 0.3366, + "step": 5120 + }, + { + "epoch": 0.7182328190743338, + "grad_norm": 1.6361367731864962, + "learning_rate": 1.941498287153409e-06, + "loss": 0.3276, + "step": 5121 + }, + { + "epoch": 0.7183730715287517, + "grad_norm": 1.800253437002403, + "learning_rate": 1.9397018408018947e-06, + "loss": 0.3651, + "step": 5122 + }, + { + "epoch": 0.7185133239831697, + "grad_norm": 1.830366732741697, + "learning_rate": 1.9379060259215255e-06, + "loss": 0.323, + "step": 5123 + }, + { + "epoch": 0.7186535764375876, + "grad_norm": 1.8870877681025053, + "learning_rate": 1.936110842882854e-06, + "loss": 0.3439, + "step": 5124 + }, + { + "epoch": 0.7187938288920056, + "grad_norm": 1.8852424611354548, + "learning_rate": 1.934316292056304e-06, + "loss": 0.3496, + "step": 5125 + }, + { + "epoch": 0.7189340813464236, + "grad_norm": 1.6054238269024805, + "learning_rate": 1.9325223738121685e-06, + "loss": 0.306, + "step": 5126 + }, + { + "epoch": 0.7190743338008415, + "grad_norm": 2.1570231944938545, + "learning_rate": 1.9307290885206102e-06, + "loss": 0.342, + "step": 5127 + }, + { + "epoch": 0.7192145862552595, + "grad_norm": 2.6756731665841893, + "learning_rate": 1.928936436551661e-06, + "loss": 0.3238, + "step": 5128 + }, + { + "epoch": 0.7193548387096774, + "grad_norm": 1.8203018908539417, + "learning_rate": 1.927144418275222e-06, + "loss": 0.3078, + "step": 5129 + }, + { + "epoch": 0.7194950911640954, + "grad_norm": 2.59700387205732, + "learning_rate": 1.925353034061065e-06, + "loss": 0.3662, + "step": 5130 + }, + { + "epoch": 0.7196353436185133, + "grad_norm": 2.035901187739777, + "learning_rate": 1.9235622842788264e-06, + "loss": 0.3155, + "step": 5131 + }, + { + "epoch": 0.7197755960729313, + "grad_norm": 1.7428021223509278, + "learning_rate": 1.9217721692980172e-06, + "loss": 0.3596, + "step": 5132 + }, + { + "epoch": 0.7199158485273492, + "grad_norm": 1.8377145810596403, + "learning_rate": 1.9199826894880147e-06, + "loss": 0.3281, + "step": 5133 + }, + { + "epoch": 0.7200561009817672, + "grad_norm": 2.692423192365857, + "learning_rate": 1.9181938452180654e-06, + "loss": 0.3103, + "step": 5134 + }, + { + "epoch": 0.7201963534361852, + "grad_norm": 2.707137641082121, + "learning_rate": 1.9164056368572847e-06, + "loss": 0.3334, + "step": 5135 + }, + { + "epoch": 0.7203366058906031, + "grad_norm": 1.8280596599150534, + "learning_rate": 1.9146180647746575e-06, + "loss": 0.3524, + "step": 5136 + }, + { + "epoch": 0.7204768583450211, + "grad_norm": 2.908479389214046, + "learning_rate": 1.9128311293390362e-06, + "loss": 0.3193, + "step": 5137 + }, + { + "epoch": 0.720617110799439, + "grad_norm": 2.7801014103677426, + "learning_rate": 1.9110448309191428e-06, + "loss": 0.3171, + "step": 5138 + }, + { + "epoch": 0.720757363253857, + "grad_norm": 2.1351596088292277, + "learning_rate": 1.9092591698835673e-06, + "loss": 0.3616, + "step": 5139 + }, + { + "epoch": 0.7208976157082749, + "grad_norm": 2.304026880789615, + "learning_rate": 1.90747414660077e-06, + "loss": 0.3552, + "step": 5140 + }, + { + "epoch": 0.7210378681626929, + "grad_norm": 2.2322983060931785, + "learning_rate": 1.905689761439075e-06, + "loss": 0.2825, + "step": 5141 + }, + { + "epoch": 0.7211781206171108, + "grad_norm": 1.7348705649122362, + "learning_rate": 1.903906014766681e-06, + "loss": 0.3432, + "step": 5142 + }, + { + "epoch": 0.7213183730715288, + "grad_norm": 2.171638750358942, + "learning_rate": 1.9021229069516477e-06, + "loss": 0.3781, + "step": 5143 + }, + { + "epoch": 0.7214586255259468, + "grad_norm": 2.0720818785981865, + "learning_rate": 1.9003404383619094e-06, + "loss": 0.3608, + "step": 5144 + }, + { + "epoch": 0.7215988779803647, + "grad_norm": 2.9442813596206077, + "learning_rate": 1.8985586093652658e-06, + "loss": 0.3103, + "step": 5145 + }, + { + "epoch": 0.7217391304347827, + "grad_norm": 2.0272327311847347, + "learning_rate": 1.8967774203293843e-06, + "loss": 0.3485, + "step": 5146 + }, + { + "epoch": 0.7218793828892006, + "grad_norm": 1.7557215482700594, + "learning_rate": 1.894996871621802e-06, + "loss": 0.3473, + "step": 5147 + }, + { + "epoch": 0.7220196353436185, + "grad_norm": 1.9601626746147267, + "learning_rate": 1.8932169636099213e-06, + "loss": 0.3323, + "step": 5148 + }, + { + "epoch": 0.7221598877980364, + "grad_norm": 1.6426349498522037, + "learning_rate": 1.891437696661015e-06, + "loss": 0.3117, + "step": 5149 + }, + { + "epoch": 0.7223001402524544, + "grad_norm": 1.9908866075637346, + "learning_rate": 1.8896590711422215e-06, + "loss": 0.3601, + "step": 5150 + }, + { + "epoch": 0.7224403927068723, + "grad_norm": 2.065296819935879, + "learning_rate": 1.8878810874205494e-06, + "loss": 0.2681, + "step": 5151 + }, + { + "epoch": 0.7225806451612903, + "grad_norm": 1.5103534378322283, + "learning_rate": 1.8861037458628712e-06, + "loss": 0.3068, + "step": 5152 + }, + { + "epoch": 0.7227208976157082, + "grad_norm": 2.3221881105978897, + "learning_rate": 1.8843270468359287e-06, + "loss": 0.3574, + "step": 5153 + }, + { + "epoch": 0.7228611500701262, + "grad_norm": 2.4639939477028765, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.34, + "step": 5154 + }, + { + "epoch": 0.7230014025245441, + "grad_norm": 1.808272924454319, + "learning_rate": 1.8807755778405596e-06, + "loss": 0.3777, + "step": 5155 + }, + { + "epoch": 0.7231416549789621, + "grad_norm": 1.9720768748111204, + "learning_rate": 1.8790008086049534e-06, + "loss": 0.3331, + "step": 5156 + }, + { + "epoch": 0.7232819074333801, + "grad_norm": 3.604243342001116, + "learning_rate": 1.8772266833657254e-06, + "loss": 0.3318, + "step": 5157 + }, + { + "epoch": 0.723422159887798, + "grad_norm": 1.601873302540678, + "learning_rate": 1.8754532024889537e-06, + "loss": 0.3413, + "step": 5158 + }, + { + "epoch": 0.723562412342216, + "grad_norm": 3.661717947831014, + "learning_rate": 1.873680366340584e-06, + "loss": 0.3657, + "step": 5159 + }, + { + "epoch": 0.7237026647966339, + "grad_norm": 1.4857368245484992, + "learning_rate": 1.8719081752864298e-06, + "loss": 0.361, + "step": 5160 + }, + { + "epoch": 0.7238429172510519, + "grad_norm": 3.4033823803970757, + "learning_rate": 1.8701366296921675e-06, + "loss": 0.3264, + "step": 5161 + }, + { + "epoch": 0.7239831697054698, + "grad_norm": 2.1102636655254976, + "learning_rate": 1.8683657299233464e-06, + "loss": 0.3389, + "step": 5162 + }, + { + "epoch": 0.7241234221598878, + "grad_norm": 2.1942275003107854, + "learning_rate": 1.8665954763453764e-06, + "loss": 0.3747, + "step": 5163 + }, + { + "epoch": 0.7242636746143057, + "grad_norm": 1.6916834932188847, + "learning_rate": 1.8648258693235376e-06, + "loss": 0.3201, + "step": 5164 + }, + { + "epoch": 0.7244039270687237, + "grad_norm": 1.9092961970923215, + "learning_rate": 1.8630569092229766e-06, + "loss": 0.3531, + "step": 5165 + }, + { + "epoch": 0.7245441795231417, + "grad_norm": 2.9038233316450657, + "learning_rate": 1.8612885964087063e-06, + "loss": 0.3551, + "step": 5166 + }, + { + "epoch": 0.7246844319775596, + "grad_norm": 1.865516329720176, + "learning_rate": 1.8595209312456052e-06, + "loss": 0.3326, + "step": 5167 + }, + { + "epoch": 0.7248246844319776, + "grad_norm": 1.5641212444725243, + "learning_rate": 1.857753914098419e-06, + "loss": 0.3483, + "step": 5168 + }, + { + "epoch": 0.7249649368863955, + "grad_norm": 1.927130607506261, + "learning_rate": 1.8559875453317588e-06, + "loss": 0.3485, + "step": 5169 + }, + { + "epoch": 0.7251051893408135, + "grad_norm": 1.6542935602184174, + "learning_rate": 1.854221825310103e-06, + "loss": 0.359, + "step": 5170 + }, + { + "epoch": 0.7252454417952314, + "grad_norm": 2.1849017379978006, + "learning_rate": 1.8524567543977973e-06, + "loss": 0.3351, + "step": 5171 + }, + { + "epoch": 0.7253856942496494, + "grad_norm": 1.878083618012832, + "learning_rate": 1.8506923329590482e-06, + "loss": 0.3103, + "step": 5172 + }, + { + "epoch": 0.7255259467040673, + "grad_norm": 2.2461141303423204, + "learning_rate": 1.8489285613579328e-06, + "loss": 0.3305, + "step": 5173 + }, + { + "epoch": 0.7256661991584853, + "grad_norm": 2.0265537847068282, + "learning_rate": 1.8471654399583938e-06, + "loss": 0.3354, + "step": 5174 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 2.1371733968073907, + "learning_rate": 1.8454029691242392e-06, + "loss": 0.3487, + "step": 5175 + }, + { + "epoch": 0.7259467040673212, + "grad_norm": 2.0998881310230026, + "learning_rate": 1.843641149219142e-06, + "loss": 0.3026, + "step": 5176 + }, + { + "epoch": 0.7260869565217392, + "grad_norm": 1.7197930378930555, + "learning_rate": 1.8418799806066413e-06, + "loss": 0.3323, + "step": 5177 + }, + { + "epoch": 0.7262272089761571, + "grad_norm": 1.6163411897570767, + "learning_rate": 1.8401194636501424e-06, + "loss": 0.2911, + "step": 5178 + }, + { + "epoch": 0.7263674614305751, + "grad_norm": 1.9477384217116067, + "learning_rate": 1.8383595987129155e-06, + "loss": 0.3676, + "step": 5179 + }, + { + "epoch": 0.726507713884993, + "grad_norm": 1.6835785265453334, + "learning_rate": 1.8366003861580966e-06, + "loss": 0.2978, + "step": 5180 + }, + { + "epoch": 0.726647966339411, + "grad_norm": 2.1312936817514228, + "learning_rate": 1.8348418263486884e-06, + "loss": 0.3647, + "step": 5181 + }, + { + "epoch": 0.7267882187938289, + "grad_norm": 2.2603974114460756, + "learning_rate": 1.8330839196475542e-06, + "loss": 0.3258, + "step": 5182 + }, + { + "epoch": 0.7269284712482469, + "grad_norm": 1.688695131168653, + "learning_rate": 1.831326666417429e-06, + "loss": 0.3592, + "step": 5183 + }, + { + "epoch": 0.7270687237026648, + "grad_norm": 2.3815822739950887, + "learning_rate": 1.829570067020906e-06, + "loss": 0.3434, + "step": 5184 + }, + { + "epoch": 0.7272089761570828, + "grad_norm": 2.0811552742144506, + "learning_rate": 1.8278141218204499e-06, + "loss": 0.3305, + "step": 5185 + }, + { + "epoch": 0.7273492286115008, + "grad_norm": 2.1815690427561, + "learning_rate": 1.8260588311783866e-06, + "loss": 0.3611, + "step": 5186 + }, + { + "epoch": 0.7274894810659187, + "grad_norm": 2.0354904515439065, + "learning_rate": 1.8243041954569085e-06, + "loss": 0.3577, + "step": 5187 + }, + { + "epoch": 0.7276297335203366, + "grad_norm": 1.9442601555579526, + "learning_rate": 1.822550215018073e-06, + "loss": 0.3623, + "step": 5188 + }, + { + "epoch": 0.7277699859747545, + "grad_norm": 2.144165230177413, + "learning_rate": 1.820796890223801e-06, + "loss": 0.3534, + "step": 5189 + }, + { + "epoch": 0.7279102384291725, + "grad_norm": 2.1672147522239413, + "learning_rate": 1.8190442214358788e-06, + "loss": 0.3477, + "step": 5190 + }, + { + "epoch": 0.7280504908835904, + "grad_norm": 2.4162481098986643, + "learning_rate": 1.8172922090159578e-06, + "loss": 0.3275, + "step": 5191 + }, + { + "epoch": 0.7281907433380084, + "grad_norm": 2.2502103283380053, + "learning_rate": 1.8155408533255553e-06, + "loss": 0.3081, + "step": 5192 + }, + { + "epoch": 0.7283309957924263, + "grad_norm": 1.9128511293496415, + "learning_rate": 1.8137901547260472e-06, + "loss": 0.3869, + "step": 5193 + }, + { + "epoch": 0.7284712482468443, + "grad_norm": 2.136809017638813, + "learning_rate": 1.8120401135786803e-06, + "loss": 0.3349, + "step": 5194 + }, + { + "epoch": 0.7286115007012622, + "grad_norm": 2.0280648714859497, + "learning_rate": 1.8102907302445627e-06, + "loss": 0.3153, + "step": 5195 + }, + { + "epoch": 0.7287517531556802, + "grad_norm": 2.16892233831681, + "learning_rate": 1.808542005084668e-06, + "loss": 0.3486, + "step": 5196 + }, + { + "epoch": 0.7288920056100981, + "grad_norm": 1.7990733092438704, + "learning_rate": 1.8067939384598337e-06, + "loss": 0.3267, + "step": 5197 + }, + { + "epoch": 0.7290322580645161, + "grad_norm": 1.7380560225400656, + "learning_rate": 1.8050465307307602e-06, + "loss": 0.3708, + "step": 5198 + }, + { + "epoch": 0.7291725105189341, + "grad_norm": 1.5623925744321192, + "learning_rate": 1.8032997822580139e-06, + "loss": 0.3478, + "step": 5199 + }, + { + "epoch": 0.729312762973352, + "grad_norm": 2.056504407177262, + "learning_rate": 1.8015536934020229e-06, + "loss": 0.3331, + "step": 5200 + }, + { + "epoch": 0.72945301542777, + "grad_norm": 1.7019922531704705, + "learning_rate": 1.7998082645230835e-06, + "loss": 0.3402, + "step": 5201 + }, + { + "epoch": 0.7295932678821879, + "grad_norm": 1.878817613695927, + "learning_rate": 1.798063495981348e-06, + "loss": 0.3377, + "step": 5202 + }, + { + "epoch": 0.7297335203366059, + "grad_norm": 1.989660058531053, + "learning_rate": 1.7963193881368402e-06, + "loss": 0.3692, + "step": 5203 + }, + { + "epoch": 0.7298737727910238, + "grad_norm": 1.8423269566885208, + "learning_rate": 1.7945759413494458e-06, + "loss": 0.2823, + "step": 5204 + }, + { + "epoch": 0.7300140252454418, + "grad_norm": 1.863589087434298, + "learning_rate": 1.7928331559789087e-06, + "loss": 0.3426, + "step": 5205 + }, + { + "epoch": 0.7301542776998597, + "grad_norm": 2.458546472997347, + "learning_rate": 1.7910910323848435e-06, + "loss": 0.3589, + "step": 5206 + }, + { + "epoch": 0.7302945301542777, + "grad_norm": 2.2911234356293506, + "learning_rate": 1.789349570926724e-06, + "loss": 0.3563, + "step": 5207 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 1.830823553120265, + "learning_rate": 1.7876087719638896e-06, + "loss": 0.3302, + "step": 5208 + }, + { + "epoch": 0.7305750350631136, + "grad_norm": 1.709315676592499, + "learning_rate": 1.7858686358555411e-06, + "loss": 0.3029, + "step": 5209 + }, + { + "epoch": 0.7307152875175316, + "grad_norm": 2.3108799388801975, + "learning_rate": 1.7841291629607443e-06, + "loss": 0.3096, + "step": 5210 + }, + { + "epoch": 0.7308555399719495, + "grad_norm": 1.8611165073500004, + "learning_rate": 1.7823903536384262e-06, + "loss": 0.3729, + "step": 5211 + }, + { + "epoch": 0.7309957924263675, + "grad_norm": 2.558811044565814, + "learning_rate": 1.7806522082473809e-06, + "loss": 0.3111, + "step": 5212 + }, + { + "epoch": 0.7311360448807854, + "grad_norm": 3.380450756782409, + "learning_rate": 1.7789147271462586e-06, + "loss": 0.3243, + "step": 5213 + }, + { + "epoch": 0.7312762973352034, + "grad_norm": 1.8097226140036573, + "learning_rate": 1.7771779106935783e-06, + "loss": 0.297, + "step": 5214 + }, + { + "epoch": 0.7314165497896213, + "grad_norm": 1.9066193019657003, + "learning_rate": 1.7754417592477192e-06, + "loss": 0.3755, + "step": 5215 + }, + { + "epoch": 0.7315568022440393, + "grad_norm": 1.7265130635774562, + "learning_rate": 1.7737062731669246e-06, + "loss": 0.3586, + "step": 5216 + }, + { + "epoch": 0.7316970546984572, + "grad_norm": 4.992608137721776, + "learning_rate": 1.7719714528093e-06, + "loss": 0.3424, + "step": 5217 + }, + { + "epoch": 0.7318373071528752, + "grad_norm": 1.901539469997147, + "learning_rate": 1.7702372985328132e-06, + "loss": 0.3304, + "step": 5218 + }, + { + "epoch": 0.7319775596072932, + "grad_norm": 2.302248113151951, + "learning_rate": 1.7685038106952952e-06, + "loss": 0.4008, + "step": 5219 + }, + { + "epoch": 0.7321178120617111, + "grad_norm": 2.8534623179590453, + "learning_rate": 1.766770989654439e-06, + "loss": 0.3638, + "step": 5220 + }, + { + "epoch": 0.7322580645161291, + "grad_norm": 1.9951256720558088, + "learning_rate": 1.7650388357677994e-06, + "loss": 0.346, + "step": 5221 + }, + { + "epoch": 0.732398316970547, + "grad_norm": 2.360981577392984, + "learning_rate": 1.7633073493927965e-06, + "loss": 0.3475, + "step": 5222 + }, + { + "epoch": 0.732538569424965, + "grad_norm": 1.7721064609812565, + "learning_rate": 1.7615765308867071e-06, + "loss": 0.3171, + "step": 5223 + }, + { + "epoch": 0.7326788218793829, + "grad_norm": 2.448313958143518, + "learning_rate": 1.7598463806066774e-06, + "loss": 0.3211, + "step": 5224 + }, + { + "epoch": 0.7328190743338009, + "grad_norm": 2.8386204326562154, + "learning_rate": 1.7581168989097075e-06, + "loss": 0.3333, + "step": 5225 + }, + { + "epoch": 0.7329593267882188, + "grad_norm": 2.2743286274005827, + "learning_rate": 1.7563880861526656e-06, + "loss": 0.3332, + "step": 5226 + }, + { + "epoch": 0.7330995792426368, + "grad_norm": 3.3518747748880418, + "learning_rate": 1.7546599426922812e-06, + "loss": 0.3283, + "step": 5227 + }, + { + "epoch": 0.7332398316970546, + "grad_norm": 2.431451776364008, + "learning_rate": 1.7529324688851429e-06, + "loss": 0.3652, + "step": 5228 + }, + { + "epoch": 0.7333800841514726, + "grad_norm": 2.079936220585059, + "learning_rate": 1.7512056650877047e-06, + "loss": 0.3381, + "step": 5229 + }, + { + "epoch": 0.7335203366058906, + "grad_norm": 1.5573857341751294, + "learning_rate": 1.7494795316562791e-06, + "loss": 0.2989, + "step": 5230 + }, + { + "epoch": 0.7336605890603085, + "grad_norm": 2.1574642798162063, + "learning_rate": 1.7477540689470424e-06, + "loss": 0.3684, + "step": 5231 + }, + { + "epoch": 0.7338008415147265, + "grad_norm": 2.788714218833427, + "learning_rate": 1.7460292773160315e-06, + "loss": 0.3587, + "step": 5232 + }, + { + "epoch": 0.7339410939691444, + "grad_norm": 2.117539042323461, + "learning_rate": 1.7443051571191472e-06, + "loss": 0.3351, + "step": 5233 + }, + { + "epoch": 0.7340813464235624, + "grad_norm": 2.084264305641265, + "learning_rate": 1.7425817087121455e-06, + "loss": 0.3457, + "step": 5234 + }, + { + "epoch": 0.7342215988779803, + "grad_norm": 2.4439564805471874, + "learning_rate": 1.7408589324506504e-06, + "loss": 0.3697, + "step": 5235 + }, + { + "epoch": 0.7343618513323983, + "grad_norm": 3.301799549379433, + "learning_rate": 1.7391368286901444e-06, + "loss": 0.3731, + "step": 5236 + }, + { + "epoch": 0.7345021037868162, + "grad_norm": 5.560056170996278, + "learning_rate": 1.7374153977859715e-06, + "loss": 0.3013, + "step": 5237 + }, + { + "epoch": 0.7346423562412342, + "grad_norm": 2.2043678619727785, + "learning_rate": 1.7356946400933373e-06, + "loss": 0.3138, + "step": 5238 + }, + { + "epoch": 0.7347826086956522, + "grad_norm": 2.0351061091771974, + "learning_rate": 1.7339745559673071e-06, + "loss": 0.3386, + "step": 5239 + }, + { + "epoch": 0.7349228611500701, + "grad_norm": 1.7182250868590931, + "learning_rate": 1.73225514576281e-06, + "loss": 0.332, + "step": 5240 + }, + { + "epoch": 0.7350631136044881, + "grad_norm": 2.1520080858525388, + "learning_rate": 1.7305364098346328e-06, + "loss": 0.3192, + "step": 5241 + }, + { + "epoch": 0.735203366058906, + "grad_norm": 1.7805447437092352, + "learning_rate": 1.7288183485374267e-06, + "loss": 0.3869, + "step": 5242 + }, + { + "epoch": 0.735343618513324, + "grad_norm": 1.7518191419250488, + "learning_rate": 1.7271009622256985e-06, + "loss": 0.3287, + "step": 5243 + }, + { + "epoch": 0.7354838709677419, + "grad_norm": 2.413231864169483, + "learning_rate": 1.7253842512538204e-06, + "loss": 0.3524, + "step": 5244 + }, + { + "epoch": 0.7356241234221599, + "grad_norm": 3.6427293188712175, + "learning_rate": 1.723668215976026e-06, + "loss": 0.3474, + "step": 5245 + }, + { + "epoch": 0.7357643758765778, + "grad_norm": 1.7375320019965204, + "learning_rate": 1.7219528567464028e-06, + "loss": 0.3046, + "step": 5246 + }, + { + "epoch": 0.7359046283309958, + "grad_norm": 2.106907284109343, + "learning_rate": 1.7202381739189055e-06, + "loss": 0.308, + "step": 5247 + }, + { + "epoch": 0.7360448807854137, + "grad_norm": 3.010538664109083, + "learning_rate": 1.7185241678473468e-06, + "loss": 0.3215, + "step": 5248 + }, + { + "epoch": 0.7361851332398317, + "grad_norm": 1.690245055765956, + "learning_rate": 1.7168108388853999e-06, + "loss": 0.2991, + "step": 5249 + }, + { + "epoch": 0.7363253856942497, + "grad_norm": 2.196055765003282, + "learning_rate": 1.7150981873865979e-06, + "loss": 0.404, + "step": 5250 + }, + { + "epoch": 0.7364656381486676, + "grad_norm": 2.374275900230434, + "learning_rate": 1.713386213704335e-06, + "loss": 0.3297, + "step": 5251 + }, + { + "epoch": 0.7366058906030856, + "grad_norm": 1.8325085374079029, + "learning_rate": 1.7116749181918652e-06, + "loss": 0.339, + "step": 5252 + }, + { + "epoch": 0.7367461430575035, + "grad_norm": 1.901003888595272, + "learning_rate": 1.7099643012023032e-06, + "loss": 0.348, + "step": 5253 + }, + { + "epoch": 0.7368863955119215, + "grad_norm": 1.8538604512045354, + "learning_rate": 1.70825436308862e-06, + "loss": 0.3439, + "step": 5254 + }, + { + "epoch": 0.7370266479663394, + "grad_norm": 1.7297925467498292, + "learning_rate": 1.7065451042036507e-06, + "loss": 0.2956, + "step": 5255 + }, + { + "epoch": 0.7371669004207574, + "grad_norm": 2.131573774964797, + "learning_rate": 1.7048365249000897e-06, + "loss": 0.3678, + "step": 5256 + }, + { + "epoch": 0.7373071528751753, + "grad_norm": 1.8323315860230363, + "learning_rate": 1.7031286255304896e-06, + "loss": 0.3485, + "step": 5257 + }, + { + "epoch": 0.7374474053295933, + "grad_norm": 1.7956019660383182, + "learning_rate": 1.7014214064472646e-06, + "loss": 0.3945, + "step": 5258 + }, + { + "epoch": 0.7375876577840113, + "grad_norm": 2.5445091633106043, + "learning_rate": 1.6997148680026859e-06, + "loss": 0.3765, + "step": 5259 + }, + { + "epoch": 0.7377279102384292, + "grad_norm": 1.5487217523863663, + "learning_rate": 1.6980090105488866e-06, + "loss": 0.3528, + "step": 5260 + }, + { + "epoch": 0.7378681626928472, + "grad_norm": 1.7530471974888568, + "learning_rate": 1.696303834437859e-06, + "loss": 0.3232, + "step": 5261 + }, + { + "epoch": 0.7380084151472651, + "grad_norm": 2.042309277149941, + "learning_rate": 1.6945993400214534e-06, + "loss": 0.3309, + "step": 5262 + }, + { + "epoch": 0.7381486676016831, + "grad_norm": 1.724968290728605, + "learning_rate": 1.6928955276513826e-06, + "loss": 0.3227, + "step": 5263 + }, + { + "epoch": 0.738288920056101, + "grad_norm": 1.9871208237081424, + "learning_rate": 1.6911923976792123e-06, + "loss": 0.3791, + "step": 5264 + }, + { + "epoch": 0.738429172510519, + "grad_norm": 1.9357374849975357, + "learning_rate": 1.6894899504563738e-06, + "loss": 0.3228, + "step": 5265 + }, + { + "epoch": 0.7385694249649369, + "grad_norm": 3.682063864100731, + "learning_rate": 1.6877881863341567e-06, + "loss": 0.3669, + "step": 5266 + }, + { + "epoch": 0.7387096774193549, + "grad_norm": 1.8207032926142381, + "learning_rate": 1.686087105663704e-06, + "loss": 0.3111, + "step": 5267 + }, + { + "epoch": 0.7388499298737727, + "grad_norm": 1.4221648874415136, + "learning_rate": 1.6843867087960252e-06, + "loss": 0.3187, + "step": 5268 + }, + { + "epoch": 0.7389901823281907, + "grad_norm": 2.3056584785907606, + "learning_rate": 1.6826869960819835e-06, + "loss": 0.3723, + "step": 5269 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 2.465266123521746, + "learning_rate": 1.6809879678723045e-06, + "loss": 0.3204, + "step": 5270 + }, + { + "epoch": 0.7392706872370266, + "grad_norm": 1.7927755280808841, + "learning_rate": 1.6792896245175693e-06, + "loss": 0.3757, + "step": 5271 + }, + { + "epoch": 0.7394109396914446, + "grad_norm": 1.7148334615828895, + "learning_rate": 1.67759196636822e-06, + "loss": 0.3619, + "step": 5272 + }, + { + "epoch": 0.7395511921458625, + "grad_norm": 2.2879921555684737, + "learning_rate": 1.6758949937745562e-06, + "loss": 0.3612, + "step": 5273 + }, + { + "epoch": 0.7396914446002805, + "grad_norm": 1.735688545511626, + "learning_rate": 1.6741987070867377e-06, + "loss": 0.3205, + "step": 5274 + }, + { + "epoch": 0.7398316970546984, + "grad_norm": 1.7605154322018521, + "learning_rate": 1.6725031066547786e-06, + "loss": 0.34, + "step": 5275 + }, + { + "epoch": 0.7399719495091164, + "grad_norm": 2.233906410005426, + "learning_rate": 1.6708081928285558e-06, + "loss": 0.3648, + "step": 5276 + }, + { + "epoch": 0.7401122019635343, + "grad_norm": 1.636982607456062, + "learning_rate": 1.6691139659578032e-06, + "loss": 0.3373, + "step": 5277 + }, + { + "epoch": 0.7402524544179523, + "grad_norm": 1.933009327123472, + "learning_rate": 1.6674204263921118e-06, + "loss": 0.3531, + "step": 5278 + }, + { + "epoch": 0.7403927068723702, + "grad_norm": 2.1217639068151355, + "learning_rate": 1.6657275744809327e-06, + "loss": 0.3424, + "step": 5279 + }, + { + "epoch": 0.7405329593267882, + "grad_norm": 1.6153657391601572, + "learning_rate": 1.6640354105735728e-06, + "loss": 0.3141, + "step": 5280 + }, + { + "epoch": 0.7406732117812062, + "grad_norm": 2.1935849162965315, + "learning_rate": 1.6623439350191995e-06, + "loss": 0.373, + "step": 5281 + }, + { + "epoch": 0.7408134642356241, + "grad_norm": 2.073334068817466, + "learning_rate": 1.6606531481668364e-06, + "loss": 0.3, + "step": 5282 + }, + { + "epoch": 0.7409537166900421, + "grad_norm": 1.597045781806837, + "learning_rate": 1.658963050365367e-06, + "loss": 0.282, + "step": 5283 + }, + { + "epoch": 0.74109396914446, + "grad_norm": 3.201213880615952, + "learning_rate": 1.6572736419635288e-06, + "loss": 0.3288, + "step": 5284 + }, + { + "epoch": 0.741234221598878, + "grad_norm": 1.5715544993860078, + "learning_rate": 1.6555849233099202e-06, + "loss": 0.3531, + "step": 5285 + }, + { + "epoch": 0.7413744740532959, + "grad_norm": 1.4536333733023163, + "learning_rate": 1.6538968947529965e-06, + "loss": 0.3034, + "step": 5286 + }, + { + "epoch": 0.7415147265077139, + "grad_norm": 2.576253408164149, + "learning_rate": 1.6522095566410728e-06, + "loss": 0.3941, + "step": 5287 + }, + { + "epoch": 0.7416549789621318, + "grad_norm": 1.8442899915390842, + "learning_rate": 1.6505229093223158e-06, + "loss": 0.3493, + "step": 5288 + }, + { + "epoch": 0.7417952314165498, + "grad_norm": 2.3369810905360775, + "learning_rate": 1.648836953144755e-06, + "loss": 0.3351, + "step": 5289 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 2.0408650465910814, + "learning_rate": 1.647151688456276e-06, + "loss": 0.3637, + "step": 5290 + }, + { + "epoch": 0.7420757363253857, + "grad_norm": 1.755958902256073, + "learning_rate": 1.6454671156046214e-06, + "loss": 0.342, + "step": 5291 + }, + { + "epoch": 0.7422159887798037, + "grad_norm": 2.0912285040590293, + "learning_rate": 1.6437832349373906e-06, + "loss": 0.352, + "step": 5292 + }, + { + "epoch": 0.7423562412342216, + "grad_norm": 1.6744456982469045, + "learning_rate": 1.642100046802041e-06, + "loss": 0.2912, + "step": 5293 + }, + { + "epoch": 0.7424964936886396, + "grad_norm": 1.8811889232973702, + "learning_rate": 1.6404175515458882e-06, + "loss": 0.3338, + "step": 5294 + }, + { + "epoch": 0.7426367461430575, + "grad_norm": 2.6832431525325453, + "learning_rate": 1.6387357495161e-06, + "loss": 0.3602, + "step": 5295 + }, + { + "epoch": 0.7427769985974755, + "grad_norm": 1.6231402245219924, + "learning_rate": 1.6370546410597066e-06, + "loss": 0.3741, + "step": 5296 + }, + { + "epoch": 0.7429172510518934, + "grad_norm": 2.4813120452272415, + "learning_rate": 1.6353742265235923e-06, + "loss": 0.3498, + "step": 5297 + }, + { + "epoch": 0.7430575035063114, + "grad_norm": 1.7329304495897215, + "learning_rate": 1.633694506254499e-06, + "loss": 0.3174, + "step": 5298 + }, + { + "epoch": 0.7431977559607293, + "grad_norm": 2.5076551536563843, + "learning_rate": 1.6320154805990258e-06, + "loss": 0.3471, + "step": 5299 + }, + { + "epoch": 0.7433380084151473, + "grad_norm": 1.6935705561439403, + "learning_rate": 1.6303371499036275e-06, + "loss": 0.3315, + "step": 5300 + }, + { + "epoch": 0.7434782608695653, + "grad_norm": 1.733728305430638, + "learning_rate": 1.6286595145146162e-06, + "loss": 0.3413, + "step": 5301 + }, + { + "epoch": 0.7436185133239832, + "grad_norm": 3.5352889736825484, + "learning_rate": 1.6269825747781598e-06, + "loss": 0.3608, + "step": 5302 + }, + { + "epoch": 0.7437587657784012, + "grad_norm": 2.209918538831851, + "learning_rate": 1.6253063310402833e-06, + "loss": 0.3355, + "step": 5303 + }, + { + "epoch": 0.7438990182328191, + "grad_norm": 1.9843469452564386, + "learning_rate": 1.6236307836468695e-06, + "loss": 0.3554, + "step": 5304 + }, + { + "epoch": 0.7440392706872371, + "grad_norm": 2.044412310780819, + "learning_rate": 1.6219559329436528e-06, + "loss": 0.3666, + "step": 5305 + }, + { + "epoch": 0.744179523141655, + "grad_norm": 2.2105173311552866, + "learning_rate": 1.6202817792762283e-06, + "loss": 0.3257, + "step": 5306 + }, + { + "epoch": 0.744319775596073, + "grad_norm": 1.567990080227906, + "learning_rate": 1.6186083229900462e-06, + "loss": 0.3351, + "step": 5307 + }, + { + "epoch": 0.7444600280504908, + "grad_norm": 2.146632875784692, + "learning_rate": 1.616935564430414e-06, + "loss": 0.3148, + "step": 5308 + }, + { + "epoch": 0.7446002805049088, + "grad_norm": 1.6912904483895126, + "learning_rate": 1.6152635039424907e-06, + "loss": 0.3049, + "step": 5309 + }, + { + "epoch": 0.7447405329593267, + "grad_norm": 2.4312690910629287, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.3218, + "step": 5310 + }, + { + "epoch": 0.7448807854137447, + "grad_norm": 1.957398340208964, + "learning_rate": 1.6119214785617027e-06, + "loss": 0.3293, + "step": 5311 + }, + { + "epoch": 0.7450210378681626, + "grad_norm": 1.6518775516739996, + "learning_rate": 1.6102515143584412e-06, + "loss": 0.2943, + "step": 5312 + }, + { + "epoch": 0.7451612903225806, + "grad_norm": 1.9106666803477559, + "learning_rate": 1.6085822496060976e-06, + "loss": 0.3489, + "step": 5313 + }, + { + "epoch": 0.7453015427769986, + "grad_norm": 1.9158735776736553, + "learning_rate": 1.6069136846491124e-06, + "loss": 0.3317, + "step": 5314 + }, + { + "epoch": 0.7454417952314165, + "grad_norm": 1.6038742129121992, + "learning_rate": 1.6052458198317844e-06, + "loss": 0.3135, + "step": 5315 + }, + { + "epoch": 0.7455820476858345, + "grad_norm": 1.9133885033835578, + "learning_rate": 1.6035786554982614e-06, + "loss": 0.3391, + "step": 5316 + }, + { + "epoch": 0.7457223001402524, + "grad_norm": 1.732591347646306, + "learning_rate": 1.601912191992554e-06, + "loss": 0.3646, + "step": 5317 + }, + { + "epoch": 0.7458625525946704, + "grad_norm": 1.6952019580716342, + "learning_rate": 1.6002464296585253e-06, + "loss": 0.3394, + "step": 5318 + }, + { + "epoch": 0.7460028050490883, + "grad_norm": 1.8672598980390895, + "learning_rate": 1.5985813688398927e-06, + "loss": 0.345, + "step": 5319 + }, + { + "epoch": 0.7461430575035063, + "grad_norm": 2.4239522577878194, + "learning_rate": 1.59691700988023e-06, + "loss": 0.3321, + "step": 5320 + }, + { + "epoch": 0.7462833099579242, + "grad_norm": 1.9655834951703797, + "learning_rate": 1.5952533531229675e-06, + "loss": 0.2737, + "step": 5321 + }, + { + "epoch": 0.7464235624123422, + "grad_norm": 1.937611730491857, + "learning_rate": 1.5935903989113877e-06, + "loss": 0.2932, + "step": 5322 + }, + { + "epoch": 0.7465638148667602, + "grad_norm": 1.5592507719961863, + "learning_rate": 1.59192814758863e-06, + "loss": 0.3791, + "step": 5323 + }, + { + "epoch": 0.7467040673211781, + "grad_norm": 2.201839207085718, + "learning_rate": 1.5902665994976896e-06, + "loss": 0.3694, + "step": 5324 + }, + { + "epoch": 0.7468443197755961, + "grad_norm": 1.7937604365560251, + "learning_rate": 1.5886057549814133e-06, + "loss": 0.3236, + "step": 5325 + }, + { + "epoch": 0.746984572230014, + "grad_norm": 2.326017308965358, + "learning_rate": 1.5869456143825051e-06, + "loss": 0.3245, + "step": 5326 + }, + { + "epoch": 0.747124824684432, + "grad_norm": 1.9614850985631718, + "learning_rate": 1.5852861780435237e-06, + "loss": 0.3429, + "step": 5327 + }, + { + "epoch": 0.7472650771388499, + "grad_norm": 2.180545323611778, + "learning_rate": 1.583627446306883e-06, + "loss": 0.3167, + "step": 5328 + }, + { + "epoch": 0.7474053295932679, + "grad_norm": 1.908928224827889, + "learning_rate": 1.581969419514851e-06, + "loss": 0.3454, + "step": 5329 + }, + { + "epoch": 0.7475455820476858, + "grad_norm": 2.8077578938926613, + "learning_rate": 1.5803120980095477e-06, + "loss": 0.3429, + "step": 5330 + }, + { + "epoch": 0.7476858345021038, + "grad_norm": 2.0328350045368087, + "learning_rate": 1.5786554821329515e-06, + "loss": 0.374, + "step": 5331 + }, + { + "epoch": 0.7478260869565218, + "grad_norm": 1.7823989910167393, + "learning_rate": 1.5769995722268926e-06, + "loss": 0.3371, + "step": 5332 + }, + { + "epoch": 0.7479663394109397, + "grad_norm": 2.0925052415788965, + "learning_rate": 1.5753443686330572e-06, + "loss": 0.3539, + "step": 5333 + }, + { + "epoch": 0.7481065918653577, + "grad_norm": 2.104408350998319, + "learning_rate": 1.5736898716929848e-06, + "loss": 0.3309, + "step": 5334 + }, + { + "epoch": 0.7482468443197756, + "grad_norm": 2.086807926604718, + "learning_rate": 1.5720360817480712e-06, + "loss": 0.3923, + "step": 5335 + }, + { + "epoch": 0.7483870967741936, + "grad_norm": 1.6954502454076055, + "learning_rate": 1.5703829991395602e-06, + "loss": 0.3676, + "step": 5336 + }, + { + "epoch": 0.7485273492286115, + "grad_norm": 2.176607642763364, + "learning_rate": 1.5687306242085565e-06, + "loss": 0.3342, + "step": 5337 + }, + { + "epoch": 0.7486676016830295, + "grad_norm": 2.6320449208249617, + "learning_rate": 1.567078957296016e-06, + "loss": 0.3215, + "step": 5338 + }, + { + "epoch": 0.7488078541374474, + "grad_norm": 2.035737582482139, + "learning_rate": 1.565427998742748e-06, + "loss": 0.3255, + "step": 5339 + }, + { + "epoch": 0.7489481065918654, + "grad_norm": 2.014725121995481, + "learning_rate": 1.5637777488894167e-06, + "loss": 0.3662, + "step": 5340 + }, + { + "epoch": 0.7490883590462833, + "grad_norm": 7.301237121685618, + "learning_rate": 1.5621282080765399e-06, + "loss": 0.2935, + "step": 5341 + }, + { + "epoch": 0.7492286115007013, + "grad_norm": 2.776033717580058, + "learning_rate": 1.5604793766444882e-06, + "loss": 0.3036, + "step": 5342 + }, + { + "epoch": 0.7493688639551193, + "grad_norm": 1.9263406989388097, + "learning_rate": 1.5588312549334867e-06, + "loss": 0.3158, + "step": 5343 + }, + { + "epoch": 0.7495091164095372, + "grad_norm": 2.193111415852494, + "learning_rate": 1.557183843283614e-06, + "loss": 0.3103, + "step": 5344 + }, + { + "epoch": 0.7496493688639552, + "grad_norm": 2.1123410317463924, + "learning_rate": 1.5555371420348031e-06, + "loss": 0.3598, + "step": 5345 + }, + { + "epoch": 0.7497896213183731, + "grad_norm": 1.7752991706281942, + "learning_rate": 1.5538911515268368e-06, + "loss": 0.336, + "step": 5346 + }, + { + "epoch": 0.7499298737727911, + "grad_norm": 1.6625836028682857, + "learning_rate": 1.552245872099355e-06, + "loss": 0.3422, + "step": 5347 + }, + { + "epoch": 0.7500701262272089, + "grad_norm": 3.631068313439282, + "learning_rate": 1.5506013040918494e-06, + "loss": 0.3201, + "step": 5348 + }, + { + "epoch": 0.7502103786816269, + "grad_norm": 2.0488197464676157, + "learning_rate": 1.5489574478436664e-06, + "loss": 0.3167, + "step": 5349 + }, + { + "epoch": 0.7503506311360448, + "grad_norm": 1.9815610623380784, + "learning_rate": 1.5473143036940026e-06, + "loss": 0.3339, + "step": 5350 + }, + { + "epoch": 0.7504908835904628, + "grad_norm": 2.07300102279498, + "learning_rate": 1.5456718719819092e-06, + "loss": 0.3842, + "step": 5351 + }, + { + "epoch": 0.7506311360448807, + "grad_norm": 2.3243686146865126, + "learning_rate": 1.544030153046291e-06, + "loss": 0.3448, + "step": 5352 + }, + { + "epoch": 0.7507713884992987, + "grad_norm": 1.9360414983542087, + "learning_rate": 1.5423891472259056e-06, + "loss": 0.3097, + "step": 5353 + }, + { + "epoch": 0.7509116409537167, + "grad_norm": 2.1254365136090443, + "learning_rate": 1.5407488548593629e-06, + "loss": 0.3691, + "step": 5354 + }, + { + "epoch": 0.7510518934081346, + "grad_norm": 1.819221005620609, + "learning_rate": 1.5391092762851257e-06, + "loss": 0.3068, + "step": 5355 + }, + { + "epoch": 0.7511921458625526, + "grad_norm": 2.178673511766741, + "learning_rate": 1.5374704118415112e-06, + "loss": 0.3107, + "step": 5356 + }, + { + "epoch": 0.7513323983169705, + "grad_norm": 2.5865565362898186, + "learning_rate": 1.535832261866685e-06, + "loss": 0.3368, + "step": 5357 + }, + { + "epoch": 0.7514726507713885, + "grad_norm": 2.1100044724114193, + "learning_rate": 1.5341948266986683e-06, + "loss": 0.3822, + "step": 5358 + }, + { + "epoch": 0.7516129032258064, + "grad_norm": 2.6812531371198323, + "learning_rate": 1.5325581066753354e-06, + "loss": 0.3218, + "step": 5359 + }, + { + "epoch": 0.7517531556802244, + "grad_norm": 1.8979995285130067, + "learning_rate": 1.5309221021344118e-06, + "loss": 0.3602, + "step": 5360 + }, + { + "epoch": 0.7518934081346423, + "grad_norm": 2.7624707526820673, + "learning_rate": 1.5292868134134754e-06, + "loss": 0.3446, + "step": 5361 + }, + { + "epoch": 0.7520336605890603, + "grad_norm": 2.4392638060482157, + "learning_rate": 1.5276522408499567e-06, + "loss": 0.3438, + "step": 5362 + }, + { + "epoch": 0.7521739130434782, + "grad_norm": 1.9799534354851513, + "learning_rate": 1.5260183847811383e-06, + "loss": 0.3452, + "step": 5363 + }, + { + "epoch": 0.7523141654978962, + "grad_norm": 2.0625130624068486, + "learning_rate": 1.5243852455441555e-06, + "loss": 0.3323, + "step": 5364 + }, + { + "epoch": 0.7524544179523142, + "grad_norm": 2.6449542249673907, + "learning_rate": 1.5227528234759958e-06, + "loss": 0.4118, + "step": 5365 + }, + { + "epoch": 0.7525946704067321, + "grad_norm": 1.704245802231899, + "learning_rate": 1.5211211189134955e-06, + "loss": 0.3415, + "step": 5366 + }, + { + "epoch": 0.7527349228611501, + "grad_norm": 2.211351874782759, + "learning_rate": 1.519490132193347e-06, + "loss": 0.3248, + "step": 5367 + }, + { + "epoch": 0.752875175315568, + "grad_norm": 1.7899428954152856, + "learning_rate": 1.517859863652093e-06, + "loss": 0.3516, + "step": 5368 + }, + { + "epoch": 0.753015427769986, + "grad_norm": 2.1069201843612975, + "learning_rate": 1.516230313626128e-06, + "loss": 0.3598, + "step": 5369 + }, + { + "epoch": 0.7531556802244039, + "grad_norm": 1.7663997199550696, + "learning_rate": 1.5146014824516997e-06, + "loss": 0.3405, + "step": 5370 + }, + { + "epoch": 0.7532959326788219, + "grad_norm": 1.931904372674142, + "learning_rate": 1.512973370464903e-06, + "loss": 0.3627, + "step": 5371 + }, + { + "epoch": 0.7534361851332398, + "grad_norm": 1.5642756481680389, + "learning_rate": 1.5113459780016887e-06, + "loss": 0.3233, + "step": 5372 + }, + { + "epoch": 0.7535764375876578, + "grad_norm": 1.614935518636846, + "learning_rate": 1.5097193053978587e-06, + "loss": 0.3385, + "step": 5373 + }, + { + "epoch": 0.7537166900420758, + "grad_norm": 1.8659110714164076, + "learning_rate": 1.5080933529890645e-06, + "loss": 0.3304, + "step": 5374 + }, + { + "epoch": 0.7538569424964937, + "grad_norm": 2.0682329205841015, + "learning_rate": 1.5064681211108112e-06, + "loss": 0.3691, + "step": 5375 + }, + { + "epoch": 0.7539971949509117, + "grad_norm": 2.0436490095192195, + "learning_rate": 1.5048436100984549e-06, + "loss": 0.3828, + "step": 5376 + }, + { + "epoch": 0.7541374474053296, + "grad_norm": 1.7451644575124283, + "learning_rate": 1.5032198202871983e-06, + "loss": 0.3441, + "step": 5377 + }, + { + "epoch": 0.7542776998597476, + "grad_norm": 1.7371855636506286, + "learning_rate": 1.5015967520121016e-06, + "loss": 0.2981, + "step": 5378 + }, + { + "epoch": 0.7544179523141655, + "grad_norm": 1.808180034602921, + "learning_rate": 1.4999744056080734e-06, + "loss": 0.3712, + "step": 5379 + }, + { + "epoch": 0.7545582047685835, + "grad_norm": 3.2144279720978624, + "learning_rate": 1.4983527814098736e-06, + "loss": 0.3377, + "step": 5380 + }, + { + "epoch": 0.7546984572230014, + "grad_norm": 1.9447988421800337, + "learning_rate": 1.496731879752113e-06, + "loss": 0.3222, + "step": 5381 + }, + { + "epoch": 0.7548387096774194, + "grad_norm": 2.0364857027079726, + "learning_rate": 1.4951117009692528e-06, + "loss": 0.3621, + "step": 5382 + }, + { + "epoch": 0.7549789621318374, + "grad_norm": 1.9322076935185653, + "learning_rate": 1.4934922453956064e-06, + "loss": 0.3724, + "step": 5383 + }, + { + "epoch": 0.7551192145862553, + "grad_norm": 1.7266682035790357, + "learning_rate": 1.4918735133653368e-06, + "loss": 0.3484, + "step": 5384 + }, + { + "epoch": 0.7552594670406733, + "grad_norm": 3.1612525801763116, + "learning_rate": 1.4902555052124579e-06, + "loss": 0.3484, + "step": 5385 + }, + { + "epoch": 0.7553997194950912, + "grad_norm": 1.8036256118362006, + "learning_rate": 1.4886382212708361e-06, + "loss": 0.329, + "step": 5386 + }, + { + "epoch": 0.7555399719495092, + "grad_norm": 1.5160553311439011, + "learning_rate": 1.4870216618741833e-06, + "loss": 0.3637, + "step": 5387 + }, + { + "epoch": 0.755680224403927, + "grad_norm": 1.8147913162861125, + "learning_rate": 1.4854058273560667e-06, + "loss": 0.3366, + "step": 5388 + }, + { + "epoch": 0.755820476858345, + "grad_norm": 3.5734487569253854, + "learning_rate": 1.4837907180499035e-06, + "loss": 0.3892, + "step": 5389 + }, + { + "epoch": 0.7559607293127629, + "grad_norm": 3.218964060359669, + "learning_rate": 1.4821763342889588e-06, + "loss": 0.3049, + "step": 5390 + }, + { + "epoch": 0.7561009817671809, + "grad_norm": 1.767840994999226, + "learning_rate": 1.480562676406352e-06, + "loss": 0.3388, + "step": 5391 + }, + { + "epoch": 0.7562412342215988, + "grad_norm": 2.327752512377683, + "learning_rate": 1.4789497447350465e-06, + "loss": 0.3283, + "step": 5392 + }, + { + "epoch": 0.7563814866760168, + "grad_norm": 2.197860596530687, + "learning_rate": 1.477337539607861e-06, + "loss": 0.3477, + "step": 5393 + }, + { + "epoch": 0.7565217391304347, + "grad_norm": 2.1448802003995846, + "learning_rate": 1.475726061357463e-06, + "loss": 0.319, + "step": 5394 + }, + { + "epoch": 0.7566619915848527, + "grad_norm": 2.002064592933341, + "learning_rate": 1.4741153103163696e-06, + "loss": 0.356, + "step": 5395 + }, + { + "epoch": 0.7568022440392707, + "grad_norm": 1.5318112026145136, + "learning_rate": 1.4725052868169482e-06, + "loss": 0.3525, + "step": 5396 + }, + { + "epoch": 0.7569424964936886, + "grad_norm": 1.7033552661586027, + "learning_rate": 1.4708959911914177e-06, + "loss": 0.3019, + "step": 5397 + }, + { + "epoch": 0.7570827489481066, + "grad_norm": 1.7214921750730936, + "learning_rate": 1.4692874237718413e-06, + "loss": 0.3316, + "step": 5398 + }, + { + "epoch": 0.7572230014025245, + "grad_norm": 1.8012813113771748, + "learning_rate": 1.4676795848901376e-06, + "loss": 0.3726, + "step": 5399 + }, + { + "epoch": 0.7573632538569425, + "grad_norm": 1.689158013648949, + "learning_rate": 1.466072474878073e-06, + "loss": 0.3254, + "step": 5400 + }, + { + "epoch": 0.7575035063113604, + "grad_norm": 1.8979264815656323, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.3595, + "step": 5401 + }, + { + "epoch": 0.7576437587657784, + "grad_norm": 1.713991200886311, + "learning_rate": 1.4628604427891728e-06, + "loss": 0.3221, + "step": 5402 + }, + { + "epoch": 0.7577840112201963, + "grad_norm": 1.8760979934305082, + "learning_rate": 1.4612555213751185e-06, + "loss": 0.3552, + "step": 5403 + }, + { + "epoch": 0.7579242636746143, + "grad_norm": 1.818435290398036, + "learning_rate": 1.4596513301562636e-06, + "loss": 0.347, + "step": 5404 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 2.1214919585849428, + "learning_rate": 1.458047869463622e-06, + "loss": 0.3079, + "step": 5405 + }, + { + "epoch": 0.7582047685834502, + "grad_norm": 3.7373087703302756, + "learning_rate": 1.4564451396280577e-06, + "loss": 0.3423, + "step": 5406 + }, + { + "epoch": 0.7583450210378682, + "grad_norm": 1.8864824293153972, + "learning_rate": 1.4548431409802804e-06, + "loss": 0.3473, + "step": 5407 + }, + { + "epoch": 0.7584852734922861, + "grad_norm": 3.3069646398305133, + "learning_rate": 1.4532418738508525e-06, + "loss": 0.3285, + "step": 5408 + }, + { + "epoch": 0.7586255259467041, + "grad_norm": 1.8105686025951542, + "learning_rate": 1.4516413385701845e-06, + "loss": 0.3057, + "step": 5409 + }, + { + "epoch": 0.758765778401122, + "grad_norm": 1.8756810540858204, + "learning_rate": 1.4500415354685349e-06, + "loss": 0.352, + "step": 5410 + }, + { + "epoch": 0.75890603085554, + "grad_norm": 1.537404595493174, + "learning_rate": 1.4484424648760125e-06, + "loss": 0.2971, + "step": 5411 + }, + { + "epoch": 0.7590462833099579, + "grad_norm": 1.747487239654624, + "learning_rate": 1.4468441271225764e-06, + "loss": 0.3498, + "step": 5412 + }, + { + "epoch": 0.7591865357643759, + "grad_norm": 1.8081452244017502, + "learning_rate": 1.4452465225380285e-06, + "loss": 0.3418, + "step": 5413 + }, + { + "epoch": 0.7593267882187938, + "grad_norm": 1.5120138821595321, + "learning_rate": 1.4436496514520253e-06, + "loss": 0.2918, + "step": 5414 + }, + { + "epoch": 0.7594670406732118, + "grad_norm": 1.6813368395532586, + "learning_rate": 1.44205351419407e-06, + "loss": 0.3356, + "step": 5415 + }, + { + "epoch": 0.7596072931276298, + "grad_norm": 2.6252493058754, + "learning_rate": 1.440458111093514e-06, + "loss": 0.3125, + "step": 5416 + }, + { + "epoch": 0.7597475455820477, + "grad_norm": 1.7250163201928383, + "learning_rate": 1.4388634424795594e-06, + "loss": 0.3574, + "step": 5417 + }, + { + "epoch": 0.7598877980364657, + "grad_norm": 2.2932404287320045, + "learning_rate": 1.4372695086812522e-06, + "loss": 0.3402, + "step": 5418 + }, + { + "epoch": 0.7600280504908836, + "grad_norm": 1.7072148344367855, + "learning_rate": 1.4356763100274901e-06, + "loss": 0.3159, + "step": 5419 + }, + { + "epoch": 0.7601683029453016, + "grad_norm": 2.1409783732975862, + "learning_rate": 1.4340838468470198e-06, + "loss": 0.3343, + "step": 5420 + }, + { + "epoch": 0.7603085553997195, + "grad_norm": 2.289619143769733, + "learning_rate": 1.4324921194684337e-06, + "loss": 0.3197, + "step": 5421 + }, + { + "epoch": 0.7604488078541375, + "grad_norm": 1.8028124692126355, + "learning_rate": 1.430901128220174e-06, + "loss": 0.3423, + "step": 5422 + }, + { + "epoch": 0.7605890603085554, + "grad_norm": 1.7531063575229409, + "learning_rate": 1.4293108734305311e-06, + "loss": 0.3936, + "step": 5423 + }, + { + "epoch": 0.7607293127629734, + "grad_norm": 1.5460500347632529, + "learning_rate": 1.4277213554276426e-06, + "loss": 0.33, + "step": 5424 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 1.7624590686346935, + "learning_rate": 1.426132574539495e-06, + "loss": 0.3133, + "step": 5425 + }, + { + "epoch": 0.7610098176718093, + "grad_norm": 2.063618790021201, + "learning_rate": 1.424544531093921e-06, + "loss": 0.3811, + "step": 5426 + }, + { + "epoch": 0.7611500701262273, + "grad_norm": 2.428099085449811, + "learning_rate": 1.4229572254186047e-06, + "loss": 0.322, + "step": 5427 + }, + { + "epoch": 0.7612903225806451, + "grad_norm": 1.8664109917792424, + "learning_rate": 1.4213706578410718e-06, + "loss": 0.3192, + "step": 5428 + }, + { + "epoch": 0.7614305750350631, + "grad_norm": 2.239618085251511, + "learning_rate": 1.4197848286887017e-06, + "loss": 0.3379, + "step": 5429 + }, + { + "epoch": 0.761570827489481, + "grad_norm": 2.008103761438966, + "learning_rate": 1.4181997382887192e-06, + "loss": 0.3073, + "step": 5430 + }, + { + "epoch": 0.761711079943899, + "grad_norm": 1.9215041632427265, + "learning_rate": 1.416615386968196e-06, + "loss": 0.3484, + "step": 5431 + }, + { + "epoch": 0.7618513323983169, + "grad_norm": 3.3563778304152425, + "learning_rate": 1.4150317750540515e-06, + "loss": 0.3217, + "step": 5432 + }, + { + "epoch": 0.7619915848527349, + "grad_norm": 1.5837235806375916, + "learning_rate": 1.4134489028730557e-06, + "loss": 0.3321, + "step": 5433 + }, + { + "epoch": 0.7621318373071528, + "grad_norm": 2.1148224743948525, + "learning_rate": 1.4118667707518202e-06, + "loss": 0.3421, + "step": 5434 + }, + { + "epoch": 0.7622720897615708, + "grad_norm": 2.0722848386170423, + "learning_rate": 1.410285379016807e-06, + "loss": 0.3907, + "step": 5435 + }, + { + "epoch": 0.7624123422159887, + "grad_norm": 1.8033304768443914, + "learning_rate": 1.4087047279943267e-06, + "loss": 0.3124, + "step": 5436 + }, + { + "epoch": 0.7625525946704067, + "grad_norm": 1.7135278610399958, + "learning_rate": 1.4071248180105346e-06, + "loss": 0.3579, + "step": 5437 + }, + { + "epoch": 0.7626928471248247, + "grad_norm": 1.6952978036832402, + "learning_rate": 1.405545649391436e-06, + "loss": 0.3391, + "step": 5438 + }, + { + "epoch": 0.7628330995792426, + "grad_norm": 1.8522394286873611, + "learning_rate": 1.4039672224628786e-06, + "loss": 0.3389, + "step": 5439 + }, + { + "epoch": 0.7629733520336606, + "grad_norm": 2.011625239597013, + "learning_rate": 1.4023895375505608e-06, + "loss": 0.3134, + "step": 5440 + }, + { + "epoch": 0.7631136044880785, + "grad_norm": 2.2318195588450713, + "learning_rate": 1.4008125949800272e-06, + "loss": 0.3401, + "step": 5441 + }, + { + "epoch": 0.7632538569424965, + "grad_norm": 1.7545048768961482, + "learning_rate": 1.3992363950766686e-06, + "loss": 0.3342, + "step": 5442 + }, + { + "epoch": 0.7633941093969144, + "grad_norm": 1.935869823193459, + "learning_rate": 1.397660938165723e-06, + "loss": 0.2948, + "step": 5443 + }, + { + "epoch": 0.7635343618513324, + "grad_norm": 1.7942149500086986, + "learning_rate": 1.3960862245722746e-06, + "loss": 0.306, + "step": 5444 + }, + { + "epoch": 0.7636746143057503, + "grad_norm": 2.872970528174927, + "learning_rate": 1.3945122546212552e-06, + "loss": 0.3829, + "step": 5445 + }, + { + "epoch": 0.7638148667601683, + "grad_norm": 2.1523152815083586, + "learning_rate": 1.3929390286374416e-06, + "loss": 0.3348, + "step": 5446 + }, + { + "epoch": 0.7639551192145863, + "grad_norm": 1.6803437126599372, + "learning_rate": 1.3913665469454606e-06, + "loss": 0.3584, + "step": 5447 + }, + { + "epoch": 0.7640953716690042, + "grad_norm": 1.6866220380259067, + "learning_rate": 1.3897948098697789e-06, + "loss": 0.2842, + "step": 5448 + }, + { + "epoch": 0.7642356241234222, + "grad_norm": 1.4713265565515339, + "learning_rate": 1.3882238177347157e-06, + "loss": 0.3022, + "step": 5449 + }, + { + "epoch": 0.7643758765778401, + "grad_norm": 1.6232155024841535, + "learning_rate": 1.3866535708644335e-06, + "loss": 0.306, + "step": 5450 + }, + { + "epoch": 0.7645161290322581, + "grad_norm": 2.202802269206031, + "learning_rate": 1.385084069582942e-06, + "loss": 0.3549, + "step": 5451 + }, + { + "epoch": 0.764656381486676, + "grad_norm": 2.02787794304139, + "learning_rate": 1.3835153142140971e-06, + "loss": 0.3316, + "step": 5452 + }, + { + "epoch": 0.764796633941094, + "grad_norm": 1.6817068337267562, + "learning_rate": 1.3819473050816002e-06, + "loss": 0.3266, + "step": 5453 + }, + { + "epoch": 0.7649368863955119, + "grad_norm": 3.269731777518092, + "learning_rate": 1.380380042509001e-06, + "loss": 0.3315, + "step": 5454 + }, + { + "epoch": 0.7650771388499299, + "grad_norm": 1.8272502795349816, + "learning_rate": 1.3788135268196894e-06, + "loss": 0.3388, + "step": 5455 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 4.58989133120284, + "learning_rate": 1.377247758336907e-06, + "loss": 0.3488, + "step": 5456 + }, + { + "epoch": 0.7653576437587658, + "grad_norm": 1.67390518278175, + "learning_rate": 1.3756827373837396e-06, + "loss": 0.3118, + "step": 5457 + }, + { + "epoch": 0.7654978962131838, + "grad_norm": 1.865032468722799, + "learning_rate": 1.374118464283119e-06, + "loss": 0.3658, + "step": 5458 + }, + { + "epoch": 0.7656381486676017, + "grad_norm": 2.102835117444648, + "learning_rate": 1.3725549393578197e-06, + "loss": 0.3515, + "step": 5459 + }, + { + "epoch": 0.7657784011220197, + "grad_norm": 1.8440908989709814, + "learning_rate": 1.370992162930465e-06, + "loss": 0.3665, + "step": 5460 + }, + { + "epoch": 0.7659186535764376, + "grad_norm": 2.343159613955257, + "learning_rate": 1.3694301353235235e-06, + "loss": 0.3029, + "step": 5461 + }, + { + "epoch": 0.7660589060308556, + "grad_norm": 1.7596793201211025, + "learning_rate": 1.367868856859308e-06, + "loss": 0.3538, + "step": 5462 + }, + { + "epoch": 0.7661991584852735, + "grad_norm": 5.851081695972075, + "learning_rate": 1.3663083278599781e-06, + "loss": 0.3357, + "step": 5463 + }, + { + "epoch": 0.7663394109396915, + "grad_norm": 2.2776798896638333, + "learning_rate": 1.3647485486475376e-06, + "loss": 0.3601, + "step": 5464 + }, + { + "epoch": 0.7664796633941094, + "grad_norm": 2.001795558776411, + "learning_rate": 1.3631895195438361e-06, + "loss": 0.3195, + "step": 5465 + }, + { + "epoch": 0.7666199158485274, + "grad_norm": 1.63141429848353, + "learning_rate": 1.361631240870569e-06, + "loss": 0.3089, + "step": 5466 + }, + { + "epoch": 0.7667601683029454, + "grad_norm": 2.1079087424168583, + "learning_rate": 1.3600737129492752e-06, + "loss": 0.2987, + "step": 5467 + }, + { + "epoch": 0.7669004207573632, + "grad_norm": 1.713057297847868, + "learning_rate": 1.3585169361013418e-06, + "loss": 0.3096, + "step": 5468 + }, + { + "epoch": 0.7670406732117812, + "grad_norm": 2.298338754530892, + "learning_rate": 1.3569609106479958e-06, + "loss": 0.3496, + "step": 5469 + }, + { + "epoch": 0.7671809256661991, + "grad_norm": 3.03476959082916, + "learning_rate": 1.3554056369103136e-06, + "loss": 0.3447, + "step": 5470 + }, + { + "epoch": 0.7673211781206171, + "grad_norm": 1.9646292051729148, + "learning_rate": 1.353851115209215e-06, + "loss": 0.3268, + "step": 5471 + }, + { + "epoch": 0.767461430575035, + "grad_norm": 2.2836468789924873, + "learning_rate": 1.3522973458654648e-06, + "loss": 0.3571, + "step": 5472 + }, + { + "epoch": 0.767601683029453, + "grad_norm": 2.1972810329320884, + "learning_rate": 1.3507443291996724e-06, + "loss": 0.3171, + "step": 5473 + }, + { + "epoch": 0.7677419354838709, + "grad_norm": 3.004734594776448, + "learning_rate": 1.3491920655322931e-06, + "loss": 0.3265, + "step": 5474 + }, + { + "epoch": 0.7678821879382889, + "grad_norm": 2.0213523932231885, + "learning_rate": 1.3476405551836235e-06, + "loss": 0.32, + "step": 5475 + }, + { + "epoch": 0.7680224403927068, + "grad_norm": 1.7738492378502255, + "learning_rate": 1.346089798473808e-06, + "loss": 0.3216, + "step": 5476 + }, + { + "epoch": 0.7681626928471248, + "grad_norm": 1.584703087178074, + "learning_rate": 1.344539795722834e-06, + "loss": 0.3147, + "step": 5477 + }, + { + "epoch": 0.7683029453015428, + "grad_norm": 2.0027310083612844, + "learning_rate": 1.3429905472505344e-06, + "loss": 0.3669, + "step": 5478 + }, + { + "epoch": 0.7684431977559607, + "grad_norm": 2.46891339884, + "learning_rate": 1.341442053376587e-06, + "loss": 0.3335, + "step": 5479 + }, + { + "epoch": 0.7685834502103787, + "grad_norm": 1.8013489463467418, + "learning_rate": 1.3398943144205095e-06, + "loss": 0.2999, + "step": 5480 + }, + { + "epoch": 0.7687237026647966, + "grad_norm": 2.667523415345634, + "learning_rate": 1.3383473307016687e-06, + "loss": 0.3379, + "step": 5481 + }, + { + "epoch": 0.7688639551192146, + "grad_norm": 3.2722052027704627, + "learning_rate": 1.3368011025392735e-06, + "loss": 0.3931, + "step": 5482 + }, + { + "epoch": 0.7690042075736325, + "grad_norm": 1.802315676784272, + "learning_rate": 1.3352556302523783e-06, + "loss": 0.4007, + "step": 5483 + }, + { + "epoch": 0.7691444600280505, + "grad_norm": 2.174979865320776, + "learning_rate": 1.3337109141598798e-06, + "loss": 0.3909, + "step": 5484 + }, + { + "epoch": 0.7692847124824684, + "grad_norm": 2.122574150848066, + "learning_rate": 1.3321669545805188e-06, + "loss": 0.2839, + "step": 5485 + }, + { + "epoch": 0.7694249649368864, + "grad_norm": 1.7859068112811267, + "learning_rate": 1.3306237518328819e-06, + "loss": 0.359, + "step": 5486 + }, + { + "epoch": 0.7695652173913043, + "grad_norm": 2.1375147367270575, + "learning_rate": 1.3290813062353969e-06, + "loss": 0.3563, + "step": 5487 + }, + { + "epoch": 0.7697054698457223, + "grad_norm": 1.7818421835070155, + "learning_rate": 1.3275396181063394e-06, + "loss": 0.3427, + "step": 5488 + }, + { + "epoch": 0.7698457223001403, + "grad_norm": 2.4102401347841296, + "learning_rate": 1.325998687763822e-06, + "loss": 0.3353, + "step": 5489 + }, + { + "epoch": 0.7699859747545582, + "grad_norm": 1.8653207808847747, + "learning_rate": 1.324458515525807e-06, + "loss": 0.3522, + "step": 5490 + }, + { + "epoch": 0.7701262272089762, + "grad_norm": 1.6765169960523418, + "learning_rate": 1.3229191017100978e-06, + "loss": 0.3528, + "step": 5491 + }, + { + "epoch": 0.7702664796633941, + "grad_norm": 1.4296081187405678, + "learning_rate": 1.321380446634342e-06, + "loss": 0.3195, + "step": 5492 + }, + { + "epoch": 0.7704067321178121, + "grad_norm": 1.9597924525474038, + "learning_rate": 1.3198425506160302e-06, + "loss": 0.3932, + "step": 5493 + }, + { + "epoch": 0.77054698457223, + "grad_norm": 1.8015749293193855, + "learning_rate": 1.318305413972496e-06, + "loss": 0.3099, + "step": 5494 + }, + { + "epoch": 0.770687237026648, + "grad_norm": 2.2557407892938155, + "learning_rate": 1.316769037020919e-06, + "loss": 0.3475, + "step": 5495 + }, + { + "epoch": 0.7708274894810659, + "grad_norm": 2.054797731640889, + "learning_rate": 1.3152334200783167e-06, + "loss": 0.3427, + "step": 5496 + }, + { + "epoch": 0.7709677419354839, + "grad_norm": 1.7489541879786186, + "learning_rate": 1.3136985634615546e-06, + "loss": 0.3229, + "step": 5497 + }, + { + "epoch": 0.7711079943899019, + "grad_norm": 2.155146662064966, + "learning_rate": 1.312164467487339e-06, + "loss": 0.3391, + "step": 5498 + }, + { + "epoch": 0.7712482468443198, + "grad_norm": 1.6106437940762957, + "learning_rate": 1.310631132472222e-06, + "loss": 0.3271, + "step": 5499 + }, + { + "epoch": 0.7713884992987378, + "grad_norm": 2.7825938203662828, + "learning_rate": 1.3090985587325932e-06, + "loss": 0.3381, + "step": 5500 + }, + { + "epoch": 0.7715287517531557, + "grad_norm": 1.8502871470944398, + "learning_rate": 1.3075667465846904e-06, + "loss": 0.3343, + "step": 5501 + }, + { + "epoch": 0.7716690042075737, + "grad_norm": 1.9989665460807124, + "learning_rate": 1.306035696344592e-06, + "loss": 0.3074, + "step": 5502 + }, + { + "epoch": 0.7718092566619916, + "grad_norm": 2.0267723616009548, + "learning_rate": 1.3045054083282194e-06, + "loss": 0.3178, + "step": 5503 + }, + { + "epoch": 0.7719495091164096, + "grad_norm": 1.9101139368156632, + "learning_rate": 1.3029758828513368e-06, + "loss": 0.3373, + "step": 5504 + }, + { + "epoch": 0.7720897615708275, + "grad_norm": 3.0012562029042424, + "learning_rate": 1.3014471202295514e-06, + "loss": 0.3641, + "step": 5505 + }, + { + "epoch": 0.7722300140252455, + "grad_norm": 1.832290663582457, + "learning_rate": 1.2999191207783129e-06, + "loss": 0.3285, + "step": 5506 + }, + { + "epoch": 0.7723702664796634, + "grad_norm": 2.2269572660939287, + "learning_rate": 1.298391884812913e-06, + "loss": 0.3491, + "step": 5507 + }, + { + "epoch": 0.7725105189340813, + "grad_norm": 1.8252461254380226, + "learning_rate": 1.2968654126484858e-06, + "loss": 0.3228, + "step": 5508 + }, + { + "epoch": 0.7726507713884992, + "grad_norm": 1.9474375810524291, + "learning_rate": 1.2953397046000105e-06, + "loss": 0.293, + "step": 5509 + }, + { + "epoch": 0.7727910238429172, + "grad_norm": 1.71627622657958, + "learning_rate": 1.2938147609823026e-06, + "loss": 0.3438, + "step": 5510 + }, + { + "epoch": 0.7729312762973352, + "grad_norm": 1.607563814406419, + "learning_rate": 1.2922905821100256e-06, + "loss": 0.3258, + "step": 5511 + }, + { + "epoch": 0.7730715287517531, + "grad_norm": 2.0176060130877516, + "learning_rate": 1.2907671682976824e-06, + "loss": 0.3288, + "step": 5512 + }, + { + "epoch": 0.7732117812061711, + "grad_norm": 1.8301278253114919, + "learning_rate": 1.2892445198596198e-06, + "loss": 0.3284, + "step": 5513 + }, + { + "epoch": 0.773352033660589, + "grad_norm": 1.7395995209753552, + "learning_rate": 1.287722637110025e-06, + "loss": 0.3367, + "step": 5514 + }, + { + "epoch": 0.773492286115007, + "grad_norm": 1.694733903303401, + "learning_rate": 1.2862015203629274e-06, + "loss": 0.3728, + "step": 5515 + }, + { + "epoch": 0.7736325385694249, + "grad_norm": 1.901310508913826, + "learning_rate": 1.2846811699322014e-06, + "loss": 0.3275, + "step": 5516 + }, + { + "epoch": 0.7737727910238429, + "grad_norm": 1.9238296121504403, + "learning_rate": 1.2831615861315572e-06, + "loss": 0.3105, + "step": 5517 + }, + { + "epoch": 0.7739130434782608, + "grad_norm": 2.05338515864443, + "learning_rate": 1.281642769274552e-06, + "loss": 0.3272, + "step": 5518 + }, + { + "epoch": 0.7740532959326788, + "grad_norm": 1.8056059642830145, + "learning_rate": 1.2801247196745826e-06, + "loss": 0.3658, + "step": 5519 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 2.1669977986562663, + "learning_rate": 1.27860743764489e-06, + "loss": 0.3544, + "step": 5520 + }, + { + "epoch": 0.7743338008415147, + "grad_norm": 2.868992541330401, + "learning_rate": 1.2770909234985513e-06, + "loss": 0.305, + "step": 5521 + }, + { + "epoch": 0.7744740532959327, + "grad_norm": 1.7161912772182477, + "learning_rate": 1.2755751775484898e-06, + "loss": 0.3344, + "step": 5522 + }, + { + "epoch": 0.7746143057503506, + "grad_norm": 2.6487019724719976, + "learning_rate": 1.2740602001074697e-06, + "loss": 0.306, + "step": 5523 + }, + { + "epoch": 0.7747545582047686, + "grad_norm": 2.055987624847336, + "learning_rate": 1.2725459914880961e-06, + "loss": 0.3336, + "step": 5524 + }, + { + "epoch": 0.7748948106591865, + "grad_norm": 2.723068665803196, + "learning_rate": 1.271032552002815e-06, + "loss": 0.3227, + "step": 5525 + }, + { + "epoch": 0.7750350631136045, + "grad_norm": 1.7054352096915222, + "learning_rate": 1.2695198819639143e-06, + "loss": 0.2843, + "step": 5526 + }, + { + "epoch": 0.7751753155680224, + "grad_norm": 1.8316081194882392, + "learning_rate": 1.2680079816835228e-06, + "loss": 0.361, + "step": 5527 + }, + { + "epoch": 0.7753155680224404, + "grad_norm": 1.9786748040684015, + "learning_rate": 1.2664968514736104e-06, + "loss": 0.361, + "step": 5528 + }, + { + "epoch": 0.7754558204768583, + "grad_norm": 1.6685832765423594, + "learning_rate": 1.2649864916459897e-06, + "loss": 0.3705, + "step": 5529 + }, + { + "epoch": 0.7755960729312763, + "grad_norm": 4.315740722165435, + "learning_rate": 1.26347690251231e-06, + "loss": 0.3254, + "step": 5530 + }, + { + "epoch": 0.7757363253856943, + "grad_norm": 2.3362523507830457, + "learning_rate": 1.261968084384066e-06, + "loss": 0.3357, + "step": 5531 + }, + { + "epoch": 0.7758765778401122, + "grad_norm": 2.4577774346063768, + "learning_rate": 1.2604600375725922e-06, + "loss": 0.3464, + "step": 5532 + }, + { + "epoch": 0.7760168302945302, + "grad_norm": 2.615925407705269, + "learning_rate": 1.2589527623890629e-06, + "loss": 0.354, + "step": 5533 + }, + { + "epoch": 0.7761570827489481, + "grad_norm": 1.6176130683918148, + "learning_rate": 1.257446259144494e-06, + "loss": 0.3274, + "step": 5534 + }, + { + "epoch": 0.7762973352033661, + "grad_norm": 1.9333421331512302, + "learning_rate": 1.2559405281497427e-06, + "loss": 0.3042, + "step": 5535 + }, + { + "epoch": 0.776437587657784, + "grad_norm": 2.0073110941441827, + "learning_rate": 1.2544355697155048e-06, + "loss": 0.3683, + "step": 5536 + }, + { + "epoch": 0.776577840112202, + "grad_norm": 1.6912544305691197, + "learning_rate": 1.25293138415232e-06, + "loss": 0.3234, + "step": 5537 + }, + { + "epoch": 0.7767180925666199, + "grad_norm": 2.266898803417683, + "learning_rate": 1.2514279717705636e-06, + "loss": 0.3892, + "step": 5538 + }, + { + "epoch": 0.7768583450210379, + "grad_norm": 1.762106030308148, + "learning_rate": 1.249925332880455e-06, + "loss": 0.3596, + "step": 5539 + }, + { + "epoch": 0.7769985974754559, + "grad_norm": 1.6701505805558243, + "learning_rate": 1.248423467792056e-06, + "loss": 0.3632, + "step": 5540 + }, + { + "epoch": 0.7771388499298738, + "grad_norm": 1.9333835063650269, + "learning_rate": 1.2469223768152622e-06, + "loss": 0.3548, + "step": 5541 + }, + { + "epoch": 0.7772791023842918, + "grad_norm": 2.729945753487802, + "learning_rate": 1.245422060259815e-06, + "loss": 0.3187, + "step": 5542 + }, + { + "epoch": 0.7774193548387097, + "grad_norm": 1.92851623324979, + "learning_rate": 1.2439225184352938e-06, + "loss": 0.3581, + "step": 5543 + }, + { + "epoch": 0.7775596072931277, + "grad_norm": 1.5612339230909196, + "learning_rate": 1.242423751651119e-06, + "loss": 0.3155, + "step": 5544 + }, + { + "epoch": 0.7776998597475456, + "grad_norm": 2.013880958395357, + "learning_rate": 1.2409257602165509e-06, + "loss": 0.35, + "step": 5545 + }, + { + "epoch": 0.7778401122019636, + "grad_norm": 1.7712406213656107, + "learning_rate": 1.239428544440689e-06, + "loss": 0.3452, + "step": 5546 + }, + { + "epoch": 0.7779803646563815, + "grad_norm": 1.93062139607714, + "learning_rate": 1.2379321046324732e-06, + "loss": 0.3284, + "step": 5547 + }, + { + "epoch": 0.7781206171107994, + "grad_norm": 2.9576952250547257, + "learning_rate": 1.2364364411006841e-06, + "loss": 0.3022, + "step": 5548 + }, + { + "epoch": 0.7782608695652173, + "grad_norm": 1.6978101482406665, + "learning_rate": 1.2349415541539406e-06, + "loss": 0.3166, + "step": 5549 + }, + { + "epoch": 0.7784011220196353, + "grad_norm": 1.852199589699668, + "learning_rate": 1.2334474441007045e-06, + "loss": 0.3515, + "step": 5550 + }, + { + "epoch": 0.7785413744740532, + "grad_norm": 1.800886304194853, + "learning_rate": 1.2319541112492717e-06, + "loss": 0.3671, + "step": 5551 + }, + { + "epoch": 0.7786816269284712, + "grad_norm": 1.7221987715871248, + "learning_rate": 1.230461555907782e-06, + "loss": 0.3233, + "step": 5552 + }, + { + "epoch": 0.7788218793828892, + "grad_norm": 1.8208669710289334, + "learning_rate": 1.2289697783842142e-06, + "loss": 0.3135, + "step": 5553 + }, + { + "epoch": 0.7789621318373071, + "grad_norm": 1.878421966889591, + "learning_rate": 1.2274787789863862e-06, + "loss": 0.3945, + "step": 5554 + }, + { + "epoch": 0.7791023842917251, + "grad_norm": 2.224133689333846, + "learning_rate": 1.2259885580219555e-06, + "loss": 0.3247, + "step": 5555 + }, + { + "epoch": 0.779242636746143, + "grad_norm": 2.182730461441235, + "learning_rate": 1.224499115798418e-06, + "loss": 0.3549, + "step": 5556 + }, + { + "epoch": 0.779382889200561, + "grad_norm": 2.467984237694756, + "learning_rate": 1.2230104526231107e-06, + "loss": 0.3576, + "step": 5557 + }, + { + "epoch": 0.7795231416549789, + "grad_norm": 2.0174426968554062, + "learning_rate": 1.22152256880321e-06, + "loss": 0.3579, + "step": 5558 + }, + { + "epoch": 0.7796633941093969, + "grad_norm": 1.8609788532931197, + "learning_rate": 1.220035464645727e-06, + "loss": 0.3449, + "step": 5559 + }, + { + "epoch": 0.7798036465638148, + "grad_norm": 1.7594371832094933, + "learning_rate": 1.2185491404575166e-06, + "loss": 0.3438, + "step": 5560 + }, + { + "epoch": 0.7799438990182328, + "grad_norm": 1.6770595130255492, + "learning_rate": 1.2170635965452737e-06, + "loss": 0.2762, + "step": 5561 + }, + { + "epoch": 0.7800841514726508, + "grad_norm": 1.8939585026441734, + "learning_rate": 1.215578833215526e-06, + "loss": 0.332, + "step": 5562 + }, + { + "epoch": 0.7802244039270687, + "grad_norm": 2.0241806256360118, + "learning_rate": 1.2140948507746465e-06, + "loss": 0.3049, + "step": 5563 + }, + { + "epoch": 0.7803646563814867, + "grad_norm": 3.0051009998073224, + "learning_rate": 1.2126116495288436e-06, + "loss": 0.318, + "step": 5564 + }, + { + "epoch": 0.7805049088359046, + "grad_norm": 2.2001015769743737, + "learning_rate": 1.2111292297841666e-06, + "loss": 0.3462, + "step": 5565 + }, + { + "epoch": 0.7806451612903226, + "grad_norm": 2.286968432089739, + "learning_rate": 1.2096475918465016e-06, + "loss": 0.4091, + "step": 5566 + }, + { + "epoch": 0.7807854137447405, + "grad_norm": 1.8284229133635672, + "learning_rate": 1.2081667360215743e-06, + "loss": 0.3726, + "step": 5567 + }, + { + "epoch": 0.7809256661991585, + "grad_norm": 1.5242776754731406, + "learning_rate": 1.2066866626149499e-06, + "loss": 0.3384, + "step": 5568 + }, + { + "epoch": 0.7810659186535764, + "grad_norm": 2.099572469697456, + "learning_rate": 1.2052073719320296e-06, + "loss": 0.3417, + "step": 5569 + }, + { + "epoch": 0.7812061711079944, + "grad_norm": 1.8500487501577372, + "learning_rate": 1.2037288642780575e-06, + "loss": 0.2819, + "step": 5570 + }, + { + "epoch": 0.7813464235624124, + "grad_norm": 2.0148341083628725, + "learning_rate": 1.20225113995811e-06, + "loss": 0.3398, + "step": 5571 + }, + { + "epoch": 0.7814866760168303, + "grad_norm": 1.8447730895072216, + "learning_rate": 1.2007741992771065e-06, + "loss": 0.2809, + "step": 5572 + }, + { + "epoch": 0.7816269284712483, + "grad_norm": 3.9497060352502915, + "learning_rate": 1.1992980425398033e-06, + "loss": 0.3209, + "step": 5573 + }, + { + "epoch": 0.7817671809256662, + "grad_norm": 2.1555340449034417, + "learning_rate": 1.1978226700507956e-06, + "loss": 0.3177, + "step": 5574 + }, + { + "epoch": 0.7819074333800842, + "grad_norm": 2.1044830109646218, + "learning_rate": 1.1963480821145157e-06, + "loss": 0.3329, + "step": 5575 + }, + { + "epoch": 0.7820476858345021, + "grad_norm": 1.6940213397820114, + "learning_rate": 1.1948742790352342e-06, + "loss": 0.3424, + "step": 5576 + }, + { + "epoch": 0.7821879382889201, + "grad_norm": 1.9048425450171396, + "learning_rate": 1.193401261117061e-06, + "loss": 0.3148, + "step": 5577 + }, + { + "epoch": 0.782328190743338, + "grad_norm": 1.6003253159012631, + "learning_rate": 1.1919290286639424e-06, + "loss": 0.3289, + "step": 5578 + }, + { + "epoch": 0.782468443197756, + "grad_norm": 2.3899296765376103, + "learning_rate": 1.1904575819796648e-06, + "loss": 0.3624, + "step": 5579 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 1.9761375603728804, + "learning_rate": 1.1889869213678485e-06, + "loss": 0.3546, + "step": 5580 + }, + { + "epoch": 0.7827489481065919, + "grad_norm": 2.185402436720707, + "learning_rate": 1.1875170471319565e-06, + "loss": 0.322, + "step": 5581 + }, + { + "epoch": 0.7828892005610099, + "grad_norm": 2.0770793377816488, + "learning_rate": 1.1860479595752838e-06, + "loss": 0.3157, + "step": 5582 + }, + { + "epoch": 0.7830294530154278, + "grad_norm": 1.8057938524456543, + "learning_rate": 1.1845796590009684e-06, + "loss": 0.3234, + "step": 5583 + }, + { + "epoch": 0.7831697054698458, + "grad_norm": 2.325188442510063, + "learning_rate": 1.1831121457119842e-06, + "loss": 0.3738, + "step": 5584 + }, + { + "epoch": 0.7833099579242637, + "grad_norm": 1.6443400147216793, + "learning_rate": 1.1816454200111415e-06, + "loss": 0.3666, + "step": 5585 + }, + { + "epoch": 0.7834502103786817, + "grad_norm": 1.802549540755552, + "learning_rate": 1.1801794822010893e-06, + "loss": 0.3348, + "step": 5586 + }, + { + "epoch": 0.7835904628330996, + "grad_norm": 2.019375409545668, + "learning_rate": 1.1787143325843131e-06, + "loss": 0.3245, + "step": 5587 + }, + { + "epoch": 0.7837307152875175, + "grad_norm": 2.753198324346358, + "learning_rate": 1.1772499714631375e-06, + "loss": 0.3683, + "step": 5588 + }, + { + "epoch": 0.7838709677419354, + "grad_norm": 2.2299409632715648, + "learning_rate": 1.1757863991397222e-06, + "loss": 0.3288, + "step": 5589 + }, + { + "epoch": 0.7840112201963534, + "grad_norm": 1.6155406287942473, + "learning_rate": 1.1743236159160654e-06, + "loss": 0.3569, + "step": 5590 + }, + { + "epoch": 0.7841514726507713, + "grad_norm": 2.511121331768935, + "learning_rate": 1.172861622094003e-06, + "loss": 0.2925, + "step": 5591 + }, + { + "epoch": 0.7842917251051893, + "grad_norm": 2.2419204818753053, + "learning_rate": 1.1714004179752058e-06, + "loss": 0.3356, + "step": 5592 + }, + { + "epoch": 0.7844319775596073, + "grad_norm": 11.546187157075314, + "learning_rate": 1.169940003861183e-06, + "loss": 0.3551, + "step": 5593 + }, + { + "epoch": 0.7845722300140252, + "grad_norm": 2.117761400430089, + "learning_rate": 1.1684803800532819e-06, + "loss": 0.3109, + "step": 5594 + }, + { + "epoch": 0.7847124824684432, + "grad_norm": 3.1172918240100715, + "learning_rate": 1.1670215468526852e-06, + "loss": 0.3427, + "step": 5595 + }, + { + "epoch": 0.7848527349228611, + "grad_norm": 2.364279896988707, + "learning_rate": 1.165563504560413e-06, + "loss": 0.3161, + "step": 5596 + }, + { + "epoch": 0.7849929873772791, + "grad_norm": 1.7774277216598986, + "learning_rate": 1.1641062534773218e-06, + "loss": 0.3428, + "step": 5597 + }, + { + "epoch": 0.785133239831697, + "grad_norm": 3.781100561798781, + "learning_rate": 1.162649793904106e-06, + "loss": 0.3211, + "step": 5598 + }, + { + "epoch": 0.785273492286115, + "grad_norm": 1.805575527993617, + "learning_rate": 1.1611941261412962e-06, + "loss": 0.3636, + "step": 5599 + }, + { + "epoch": 0.7854137447405329, + "grad_norm": 2.047779378496523, + "learning_rate": 1.1597392504892574e-06, + "loss": 0.3108, + "step": 5600 + }, + { + "epoch": 0.7855539971949509, + "grad_norm": 1.9605607966574607, + "learning_rate": 1.1582851672481943e-06, + "loss": 0.3181, + "step": 5601 + }, + { + "epoch": 0.7856942496493688, + "grad_norm": 4.769938933423453, + "learning_rate": 1.156831876718148e-06, + "loss": 0.3327, + "step": 5602 + }, + { + "epoch": 0.7858345021037868, + "grad_norm": 3.845098119508138, + "learning_rate": 1.1553793791989914e-06, + "loss": 0.3603, + "step": 5603 + }, + { + "epoch": 0.7859747545582048, + "grad_norm": 2.0993012809927225, + "learning_rate": 1.15392767499044e-06, + "loss": 0.3579, + "step": 5604 + }, + { + "epoch": 0.7861150070126227, + "grad_norm": 1.629328025898715, + "learning_rate": 1.1524767643920415e-06, + "loss": 0.322, + "step": 5605 + }, + { + "epoch": 0.7862552594670407, + "grad_norm": 2.5661377555875395, + "learning_rate": 1.1510266477031823e-06, + "loss": 0.3405, + "step": 5606 + }, + { + "epoch": 0.7863955119214586, + "grad_norm": 1.7576094446521446, + "learning_rate": 1.149577325223083e-06, + "loss": 0.3097, + "step": 5607 + }, + { + "epoch": 0.7865357643758766, + "grad_norm": 1.740294724091247, + "learning_rate": 1.148128797250801e-06, + "loss": 0.2985, + "step": 5608 + }, + { + "epoch": 0.7866760168302945, + "grad_norm": 2.172606969203831, + "learning_rate": 1.146681064085231e-06, + "loss": 0.3644, + "step": 5609 + }, + { + "epoch": 0.7868162692847125, + "grad_norm": 1.7255453850860665, + "learning_rate": 1.145234126025102e-06, + "loss": 0.3155, + "step": 5610 + }, + { + "epoch": 0.7869565217391304, + "grad_norm": 2.1474584949821263, + "learning_rate": 1.1437879833689808e-06, + "loss": 0.3472, + "step": 5611 + }, + { + "epoch": 0.7870967741935484, + "grad_norm": 2.518843398717106, + "learning_rate": 1.1423426364152663e-06, + "loss": 0.3506, + "step": 5612 + }, + { + "epoch": 0.7872370266479664, + "grad_norm": 2.0227274587862984, + "learning_rate": 1.1408980854621965e-06, + "loss": 0.3031, + "step": 5613 + }, + { + "epoch": 0.7873772791023843, + "grad_norm": 2.5066460959963477, + "learning_rate": 1.1394543308078454e-06, + "loss": 0.347, + "step": 5614 + }, + { + "epoch": 0.7875175315568023, + "grad_norm": 2.059096099537695, + "learning_rate": 1.1380113727501213e-06, + "loss": 0.3451, + "step": 5615 + }, + { + "epoch": 0.7876577840112202, + "grad_norm": 1.7545830115495633, + "learning_rate": 1.1365692115867682e-06, + "loss": 0.3061, + "step": 5616 + }, + { + "epoch": 0.7877980364656382, + "grad_norm": 2.3832536132746696, + "learning_rate": 1.1351278476153665e-06, + "loss": 0.3662, + "step": 5617 + }, + { + "epoch": 0.7879382889200561, + "grad_norm": 2.0026887463259597, + "learning_rate": 1.133687281133331e-06, + "loss": 0.3293, + "step": 5618 + }, + { + "epoch": 0.7880785413744741, + "grad_norm": 2.6352229191624756, + "learning_rate": 1.1322475124379134e-06, + "loss": 0.3689, + "step": 5619 + }, + { + "epoch": 0.788218793828892, + "grad_norm": 2.0457165184259978, + "learning_rate": 1.1308085418262004e-06, + "loss": 0.3337, + "step": 5620 + }, + { + "epoch": 0.78835904628331, + "grad_norm": 1.9130572735059104, + "learning_rate": 1.1293703695951109e-06, + "loss": 0.4108, + "step": 5621 + }, + { + "epoch": 0.788499298737728, + "grad_norm": 1.7077376870175092, + "learning_rate": 1.1279329960414047e-06, + "loss": 0.3654, + "step": 5622 + }, + { + "epoch": 0.7886395511921459, + "grad_norm": 1.7246010111587606, + "learning_rate": 1.1264964214616715e-06, + "loss": 0.3586, + "step": 5623 + }, + { + "epoch": 0.7887798036465639, + "grad_norm": 2.0780491635818983, + "learning_rate": 1.1250606461523389e-06, + "loss": 0.3335, + "step": 5624 + }, + { + "epoch": 0.7889200561009818, + "grad_norm": 1.964810259251106, + "learning_rate": 1.1236256704096693e-06, + "loss": 0.3336, + "step": 5625 + }, + { + "epoch": 0.7890603085553998, + "grad_norm": 1.9218862889896469, + "learning_rate": 1.1221914945297601e-06, + "loss": 0.3232, + "step": 5626 + }, + { + "epoch": 0.7892005610098177, + "grad_norm": 1.8349141616909292, + "learning_rate": 1.1207581188085436e-06, + "loss": 0.357, + "step": 5627 + }, + { + "epoch": 0.7893408134642356, + "grad_norm": 2.2783764024117907, + "learning_rate": 1.119325543541787e-06, + "loss": 0.3744, + "step": 5628 + }, + { + "epoch": 0.7894810659186535, + "grad_norm": 1.8179069636410412, + "learning_rate": 1.1178937690250917e-06, + "loss": 0.3316, + "step": 5629 + }, + { + "epoch": 0.7896213183730715, + "grad_norm": 2.50935330696278, + "learning_rate": 1.1164627955538948e-06, + "loss": 0.3283, + "step": 5630 + }, + { + "epoch": 0.7897615708274894, + "grad_norm": 2.3333185626879436, + "learning_rate": 1.1150326234234675e-06, + "loss": 0.3277, + "step": 5631 + }, + { + "epoch": 0.7899018232819074, + "grad_norm": 1.7401253851295573, + "learning_rate": 1.113603252928917e-06, + "loss": 0.3513, + "step": 5632 + }, + { + "epoch": 0.7900420757363253, + "grad_norm": 2.107094216882188, + "learning_rate": 1.1121746843651815e-06, + "loss": 0.3603, + "step": 5633 + }, + { + "epoch": 0.7901823281907433, + "grad_norm": 1.851664286845704, + "learning_rate": 1.1107469180270375e-06, + "loss": 0.3486, + "step": 5634 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 2.134160548505115, + "learning_rate": 1.1093199542090944e-06, + "loss": 0.3532, + "step": 5635 + }, + { + "epoch": 0.7904628330995792, + "grad_norm": 1.7084402035415662, + "learning_rate": 1.107893793205796e-06, + "loss": 0.3418, + "step": 5636 + }, + { + "epoch": 0.7906030855539972, + "grad_norm": 2.1783879825982035, + "learning_rate": 1.1064684353114213e-06, + "loss": 0.356, + "step": 5637 + }, + { + "epoch": 0.7907433380084151, + "grad_norm": 1.9902299426088401, + "learning_rate": 1.1050438808200824e-06, + "loss": 0.2903, + "step": 5638 + }, + { + "epoch": 0.7908835904628331, + "grad_norm": 2.097792864280986, + "learning_rate": 1.1036201300257266e-06, + "loss": 0.3328, + "step": 5639 + }, + { + "epoch": 0.791023842917251, + "grad_norm": 1.5702810711348787, + "learning_rate": 1.1021971832221345e-06, + "loss": 0.3274, + "step": 5640 + }, + { + "epoch": 0.791164095371669, + "grad_norm": 1.6669152392353133, + "learning_rate": 1.1007750407029232e-06, + "loss": 0.3128, + "step": 5641 + }, + { + "epoch": 0.7913043478260869, + "grad_norm": 2.0804641293837163, + "learning_rate": 1.0993537027615387e-06, + "loss": 0.3225, + "step": 5642 + }, + { + "epoch": 0.7914446002805049, + "grad_norm": 1.9666110332615956, + "learning_rate": 1.0979331696912666e-06, + "loss": 0.2962, + "step": 5643 + }, + { + "epoch": 0.7915848527349229, + "grad_norm": 3.2575056997323983, + "learning_rate": 1.0965134417852213e-06, + "loss": 0.3701, + "step": 5644 + }, + { + "epoch": 0.7917251051893408, + "grad_norm": 2.0197738413215154, + "learning_rate": 1.095094519336356e-06, + "loss": 0.3645, + "step": 5645 + }, + { + "epoch": 0.7918653576437588, + "grad_norm": 2.9561071123427776, + "learning_rate": 1.0936764026374547e-06, + "loss": 0.3345, + "step": 5646 + }, + { + "epoch": 0.7920056100981767, + "grad_norm": 1.9860582796671338, + "learning_rate": 1.0922590919811355e-06, + "loss": 0.3453, + "step": 5647 + }, + { + "epoch": 0.7921458625525947, + "grad_norm": 1.863408050381167, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.3616, + "step": 5648 + }, + { + "epoch": 0.7922861150070126, + "grad_norm": 1.9969779667104437, + "learning_rate": 1.0894268899658877e-06, + "loss": 0.3084, + "step": 5649 + }, + { + "epoch": 0.7924263674614306, + "grad_norm": 2.6149735381208012, + "learning_rate": 1.088011999191364e-06, + "loss": 0.3348, + "step": 5650 + }, + { + "epoch": 0.7925666199158485, + "grad_norm": 3.044535986211674, + "learning_rate": 1.0865979156282325e-06, + "loss": 0.3725, + "step": 5651 + }, + { + "epoch": 0.7927068723702665, + "grad_norm": 2.4961841071404813, + "learning_rate": 1.085184639568282e-06, + "loss": 0.3684, + "step": 5652 + }, + { + "epoch": 0.7928471248246844, + "grad_norm": 1.6559067856358471, + "learning_rate": 1.083772171303128e-06, + "loss": 0.3508, + "step": 5653 + }, + { + "epoch": 0.7929873772791024, + "grad_norm": 1.9758185995907542, + "learning_rate": 1.0823605111242259e-06, + "loss": 0.3484, + "step": 5654 + }, + { + "epoch": 0.7931276297335204, + "grad_norm": 2.2113540310968727, + "learning_rate": 1.0809496593228614e-06, + "loss": 0.3343, + "step": 5655 + }, + { + "epoch": 0.7932678821879383, + "grad_norm": 1.7747452405324897, + "learning_rate": 1.079539616190154e-06, + "loss": 0.3602, + "step": 5656 + }, + { + "epoch": 0.7934081346423563, + "grad_norm": 1.579490146704042, + "learning_rate": 1.0781303820170563e-06, + "loss": 0.3275, + "step": 5657 + }, + { + "epoch": 0.7935483870967742, + "grad_norm": 1.6870645031765732, + "learning_rate": 1.0767219570943543e-06, + "loss": 0.3451, + "step": 5658 + }, + { + "epoch": 0.7936886395511922, + "grad_norm": 2.344472443180921, + "learning_rate": 1.075314341712666e-06, + "loss": 0.3459, + "step": 5659 + }, + { + "epoch": 0.7938288920056101, + "grad_norm": 2.7627789352236944, + "learning_rate": 1.073907536162443e-06, + "loss": 0.3474, + "step": 5660 + }, + { + "epoch": 0.7939691444600281, + "grad_norm": 1.8440318134691451, + "learning_rate": 1.0725015407339718e-06, + "loss": 0.3138, + "step": 5661 + }, + { + "epoch": 0.794109396914446, + "grad_norm": 2.03676841567004, + "learning_rate": 1.0710963557173664e-06, + "loss": 0.3172, + "step": 5662 + }, + { + "epoch": 0.794249649368864, + "grad_norm": 1.8063233462954715, + "learning_rate": 1.0696919814025803e-06, + "loss": 0.3441, + "step": 5663 + }, + { + "epoch": 0.794389901823282, + "grad_norm": 1.8511881389495108, + "learning_rate": 1.0682884180793923e-06, + "loss": 0.3081, + "step": 5664 + }, + { + "epoch": 0.7945301542776999, + "grad_norm": 1.7153049969626335, + "learning_rate": 1.066885666037421e-06, + "loss": 0.3463, + "step": 5665 + }, + { + "epoch": 0.7946704067321179, + "grad_norm": 1.7254733279346333, + "learning_rate": 1.0654837255661131e-06, + "loss": 0.3742, + "step": 5666 + }, + { + "epoch": 0.7948106591865358, + "grad_norm": 1.6485211681086387, + "learning_rate": 1.0640825969547498e-06, + "loss": 0.3085, + "step": 5667 + }, + { + "epoch": 0.7949509116409537, + "grad_norm": 1.564319390570846, + "learning_rate": 1.062682280492444e-06, + "loss": 0.2908, + "step": 5668 + }, + { + "epoch": 0.7950911640953716, + "grad_norm": 2.405887824211006, + "learning_rate": 1.0612827764681417e-06, + "loss": 0.3344, + "step": 5669 + }, + { + "epoch": 0.7952314165497896, + "grad_norm": 1.905344388374912, + "learning_rate": 1.0598840851706204e-06, + "loss": 0.3551, + "step": 5670 + }, + { + "epoch": 0.7953716690042075, + "grad_norm": 2.0632993131841375, + "learning_rate": 1.05848620688849e-06, + "loss": 0.3272, + "step": 5671 + }, + { + "epoch": 0.7955119214586255, + "grad_norm": 2.3569699591436595, + "learning_rate": 1.0570891419101931e-06, + "loss": 0.3477, + "step": 5672 + }, + { + "epoch": 0.7956521739130434, + "grad_norm": 2.2150500973446037, + "learning_rate": 1.055692890524006e-06, + "loss": 0.3683, + "step": 5673 + }, + { + "epoch": 0.7957924263674614, + "grad_norm": 2.319900013643524, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.3716, + "step": 5674 + }, + { + "epoch": 0.7959326788218793, + "grad_norm": 3.262861771092343, + "learning_rate": 1.0529028296802129e-06, + "loss": 0.3264, + "step": 5675 + }, + { + "epoch": 0.7960729312762973, + "grad_norm": 6.472918802667598, + "learning_rate": 1.0515090207983175e-06, + "loss": 0.3143, + "step": 5676 + }, + { + "epoch": 0.7962131837307153, + "grad_norm": 1.770896401871693, + "learning_rate": 1.0501160266599492e-06, + "loss": 0.3187, + "step": 5677 + }, + { + "epoch": 0.7963534361851332, + "grad_norm": 1.7966566533651267, + "learning_rate": 1.048723847552543e-06, + "loss": 0.3789, + "step": 5678 + }, + { + "epoch": 0.7964936886395512, + "grad_norm": 1.942745000443771, + "learning_rate": 1.0473324837633653e-06, + "loss": 0.3173, + "step": 5679 + }, + { + "epoch": 0.7966339410939691, + "grad_norm": 2.0303323873867263, + "learning_rate": 1.0459419355795137e-06, + "loss": 0.3209, + "step": 5680 + }, + { + "epoch": 0.7967741935483871, + "grad_norm": 1.6179107670361652, + "learning_rate": 1.0445522032879184e-06, + "loss": 0.3575, + "step": 5681 + }, + { + "epoch": 0.796914446002805, + "grad_norm": 2.084925373855064, + "learning_rate": 1.0431632871753421e-06, + "loss": 0.3222, + "step": 5682 + }, + { + "epoch": 0.797054698457223, + "grad_norm": 2.0772841050105995, + "learning_rate": 1.041775187528376e-06, + "loss": 0.3361, + "step": 5683 + }, + { + "epoch": 0.7971949509116409, + "grad_norm": 1.5633805654720567, + "learning_rate": 1.040387904633447e-06, + "loss": 0.3255, + "step": 5684 + }, + { + "epoch": 0.7973352033660589, + "grad_norm": 1.7530008200661482, + "learning_rate": 1.0390014387768083e-06, + "loss": 0.353, + "step": 5685 + }, + { + "epoch": 0.7974754558204769, + "grad_norm": 1.9015732601115234, + "learning_rate": 1.037615790244549e-06, + "loss": 0.3194, + "step": 5686 + }, + { + "epoch": 0.7976157082748948, + "grad_norm": 1.622848847538799, + "learning_rate": 1.0362309593225877e-06, + "loss": 0.2932, + "step": 5687 + }, + { + "epoch": 0.7977559607293128, + "grad_norm": 1.889292980739497, + "learning_rate": 1.0348469462966753e-06, + "loss": 0.3646, + "step": 5688 + }, + { + "epoch": 0.7978962131837307, + "grad_norm": 2.7106220160730223, + "learning_rate": 1.0334637514523927e-06, + "loss": 0.3489, + "step": 5689 + }, + { + "epoch": 0.7980364656381487, + "grad_norm": 2.2346860228992265, + "learning_rate": 1.0320813750751523e-06, + "loss": 0.329, + "step": 5690 + }, + { + "epoch": 0.7981767180925666, + "grad_norm": 2.0883379779022193, + "learning_rate": 1.030699817450198e-06, + "loss": 0.3688, + "step": 5691 + }, + { + "epoch": 0.7983169705469846, + "grad_norm": 2.085250226929884, + "learning_rate": 1.029319078862605e-06, + "loss": 0.3429, + "step": 5692 + }, + { + "epoch": 0.7984572230014025, + "grad_norm": 1.619173387810302, + "learning_rate": 1.0279391595972798e-06, + "loss": 0.355, + "step": 5693 + }, + { + "epoch": 0.7985974754558205, + "grad_norm": 1.773116229697513, + "learning_rate": 1.0265600599389569e-06, + "loss": 0.3614, + "step": 5694 + }, + { + "epoch": 0.7987377279102384, + "grad_norm": 1.829789862632043, + "learning_rate": 1.0251817801722047e-06, + "loss": 0.2866, + "step": 5695 + }, + { + "epoch": 0.7988779803646564, + "grad_norm": 1.8418024412942797, + "learning_rate": 1.0238043205814219e-06, + "loss": 0.3396, + "step": 5696 + }, + { + "epoch": 0.7990182328190744, + "grad_norm": 1.6423051496151726, + "learning_rate": 1.0224276814508376e-06, + "loss": 0.3135, + "step": 5697 + }, + { + "epoch": 0.7991584852734923, + "grad_norm": 3.7559773173190667, + "learning_rate": 1.0210518630645122e-06, + "loss": 0.3367, + "step": 5698 + }, + { + "epoch": 0.7992987377279103, + "grad_norm": 2.444374270106192, + "learning_rate": 1.0196768657063355e-06, + "loss": 0.2674, + "step": 5699 + }, + { + "epoch": 0.7994389901823282, + "grad_norm": 1.9377186559534711, + "learning_rate": 1.0183026896600284e-06, + "loss": 0.3008, + "step": 5700 + }, + { + "epoch": 0.7995792426367462, + "grad_norm": 1.8864328234875283, + "learning_rate": 1.0169293352091436e-06, + "loss": 0.3668, + "step": 5701 + }, + { + "epoch": 0.7997194950911641, + "grad_norm": 2.0545708800896247, + "learning_rate": 1.0155568026370637e-06, + "loss": 0.3194, + "step": 5702 + }, + { + "epoch": 0.7998597475455821, + "grad_norm": 1.9448815293503119, + "learning_rate": 1.0141850922269986e-06, + "loss": 0.3548, + "step": 5703 + }, + { + "epoch": 0.8, + "grad_norm": 5.412346373882036, + "learning_rate": 1.0128142042619938e-06, + "loss": 0.3274, + "step": 5704 + }, + { + "epoch": 0.800140252454418, + "grad_norm": 1.7511844713692766, + "learning_rate": 1.0114441390249202e-06, + "loss": 0.3228, + "step": 5705 + }, + { + "epoch": 0.800280504908836, + "grad_norm": 2.2507204385875608, + "learning_rate": 1.010074896798482e-06, + "loss": 0.3258, + "step": 5706 + }, + { + "epoch": 0.8004207573632539, + "grad_norm": 2.151957343944403, + "learning_rate": 1.0087064778652129e-06, + "loss": 0.3303, + "step": 5707 + }, + { + "epoch": 0.8005610098176719, + "grad_norm": 1.9601307459449562, + "learning_rate": 1.007338882507477e-06, + "loss": 0.3272, + "step": 5708 + }, + { + "epoch": 0.8007012622720897, + "grad_norm": 1.9591374741551855, + "learning_rate": 1.0059721110074678e-06, + "loss": 0.321, + "step": 5709 + }, + { + "epoch": 0.8008415147265077, + "grad_norm": 1.7443652718524574, + "learning_rate": 1.0046061636472087e-06, + "loss": 0.3227, + "step": 5710 + }, + { + "epoch": 0.8009817671809256, + "grad_norm": 2.572176457440362, + "learning_rate": 1.003241040708554e-06, + "loss": 0.3052, + "step": 5711 + }, + { + "epoch": 0.8011220196353436, + "grad_norm": 2.466239715260843, + "learning_rate": 1.0018767424731867e-06, + "loss": 0.3554, + "step": 5712 + }, + { + "epoch": 0.8012622720897615, + "grad_norm": 1.7741794462566438, + "learning_rate": 1.000513269222621e-06, + "loss": 0.3411, + "step": 5713 + }, + { + "epoch": 0.8014025245441795, + "grad_norm": 2.0104676735951377, + "learning_rate": 9.991506212382007e-07, + "loss": 0.3063, + "step": 5714 + }, + { + "epoch": 0.8015427769985974, + "grad_norm": 1.8451099875729509, + "learning_rate": 9.977887988010958e-07, + "loss": 0.3736, + "step": 5715 + }, + { + "epoch": 0.8016830294530154, + "grad_norm": 2.0101962059251623, + "learning_rate": 9.964278021923107e-07, + "loss": 0.4055, + "step": 5716 + }, + { + "epoch": 0.8018232819074333, + "grad_norm": 2.169130526092723, + "learning_rate": 9.950676316926777e-07, + "loss": 0.3094, + "step": 5717 + }, + { + "epoch": 0.8019635343618513, + "grad_norm": 3.2461397280231328, + "learning_rate": 9.937082875828586e-07, + "loss": 0.3499, + "step": 5718 + }, + { + "epoch": 0.8021037868162693, + "grad_norm": 2.040985019582136, + "learning_rate": 9.923497701433437e-07, + "loss": 0.3047, + "step": 5719 + }, + { + "epoch": 0.8022440392706872, + "grad_norm": 1.605090911286988, + "learning_rate": 9.909920796544542e-07, + "loss": 0.3059, + "step": 5720 + }, + { + "epoch": 0.8023842917251052, + "grad_norm": 2.5044523646355334, + "learning_rate": 9.896352163963397e-07, + "loss": 0.3628, + "step": 5721 + }, + { + "epoch": 0.8025245441795231, + "grad_norm": 2.461740707837052, + "learning_rate": 9.8827918064898e-07, + "loss": 0.3359, + "step": 5722 + }, + { + "epoch": 0.8026647966339411, + "grad_norm": 1.93196163460843, + "learning_rate": 9.869239726921843e-07, + "loss": 0.3268, + "step": 5723 + }, + { + "epoch": 0.802805049088359, + "grad_norm": 3.0180064863333698, + "learning_rate": 9.85569592805588e-07, + "loss": 0.3414, + "step": 5724 + }, + { + "epoch": 0.802945301542777, + "grad_norm": 2.031314085903108, + "learning_rate": 9.842160412686603e-07, + "loss": 0.363, + "step": 5725 + }, + { + "epoch": 0.803085553997195, + "grad_norm": 1.8243241534934667, + "learning_rate": 9.82863318360695e-07, + "loss": 0.3249, + "step": 5726 + }, + { + "epoch": 0.8032258064516129, + "grad_norm": 2.122215915535193, + "learning_rate": 9.815114243608182e-07, + "loss": 0.3526, + "step": 5727 + }, + { + "epoch": 0.8033660589060309, + "grad_norm": 1.8246452274119915, + "learning_rate": 9.801603595479831e-07, + "loss": 0.3472, + "step": 5728 + }, + { + "epoch": 0.8035063113604488, + "grad_norm": 1.949177407111033, + "learning_rate": 9.788101242009735e-07, + "loss": 0.3347, + "step": 5729 + }, + { + "epoch": 0.8036465638148668, + "grad_norm": 1.5108238548294288, + "learning_rate": 9.774607185984004e-07, + "loss": 0.3432, + "step": 5730 + }, + { + "epoch": 0.8037868162692847, + "grad_norm": 4.90478843738224, + "learning_rate": 9.761121430187037e-07, + "loss": 0.3552, + "step": 5731 + }, + { + "epoch": 0.8039270687237027, + "grad_norm": 1.9336230959736478, + "learning_rate": 9.747643977401538e-07, + "loss": 0.3237, + "step": 5732 + }, + { + "epoch": 0.8040673211781206, + "grad_norm": 3.037949385190606, + "learning_rate": 9.734174830408478e-07, + "loss": 0.3743, + "step": 5733 + }, + { + "epoch": 0.8042075736325386, + "grad_norm": 1.7365452016037863, + "learning_rate": 9.720713991987136e-07, + "loss": 0.3611, + "step": 5734 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 1.7765378361502226, + "learning_rate": 9.707261464915036e-07, + "loss": 0.3501, + "step": 5735 + }, + { + "epoch": 0.8044880785413745, + "grad_norm": 2.1466727482174486, + "learning_rate": 9.693817251968025e-07, + "loss": 0.2964, + "step": 5736 + }, + { + "epoch": 0.8046283309957925, + "grad_norm": 5.470628862513632, + "learning_rate": 9.68038135592022e-07, + "loss": 0.3643, + "step": 5737 + }, + { + "epoch": 0.8047685834502104, + "grad_norm": 2.0073348490239384, + "learning_rate": 9.666953779544025e-07, + "loss": 0.3078, + "step": 5738 + }, + { + "epoch": 0.8049088359046284, + "grad_norm": 2.07769552970156, + "learning_rate": 9.653534525610137e-07, + "loss": 0.3354, + "step": 5739 + }, + { + "epoch": 0.8050490883590463, + "grad_norm": 1.6301909944467379, + "learning_rate": 9.640123596887507e-07, + "loss": 0.313, + "step": 5740 + }, + { + "epoch": 0.8051893408134643, + "grad_norm": 2.2509133286603618, + "learning_rate": 9.626720996143407e-07, + "loss": 0.3685, + "step": 5741 + }, + { + "epoch": 0.8053295932678822, + "grad_norm": 1.8537578203444045, + "learning_rate": 9.613326726143352e-07, + "loss": 0.2815, + "step": 5742 + }, + { + "epoch": 0.8054698457223002, + "grad_norm": 1.7312063879586224, + "learning_rate": 9.59994078965118e-07, + "loss": 0.3316, + "step": 5743 + }, + { + "epoch": 0.8056100981767181, + "grad_norm": 2.2204873173915622, + "learning_rate": 9.586563189428954e-07, + "loss": 0.3443, + "step": 5744 + }, + { + "epoch": 0.8057503506311361, + "grad_norm": 2.475509564126404, + "learning_rate": 9.573193928237073e-07, + "loss": 0.3344, + "step": 5745 + }, + { + "epoch": 0.805890603085554, + "grad_norm": 2.4244828488779158, + "learning_rate": 9.559833008834175e-07, + "loss": 0.3511, + "step": 5746 + }, + { + "epoch": 0.806030855539972, + "grad_norm": 2.298858512735912, + "learning_rate": 9.546480433977195e-07, + "loss": 0.3297, + "step": 5747 + }, + { + "epoch": 0.80617110799439, + "grad_norm": 1.9968073515559546, + "learning_rate": 9.533136206421345e-07, + "loss": 0.3669, + "step": 5748 + }, + { + "epoch": 0.8063113604488078, + "grad_norm": 1.7465393726662555, + "learning_rate": 9.519800328920115e-07, + "loss": 0.3508, + "step": 5749 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 2.263841861270207, + "learning_rate": 9.50647280422527e-07, + "loss": 0.3354, + "step": 5750 + }, + { + "epoch": 0.8065918653576437, + "grad_norm": 1.8702609890861124, + "learning_rate": 9.493153635086855e-07, + "loss": 0.3091, + "step": 5751 + }, + { + "epoch": 0.8067321178120617, + "grad_norm": 1.8627230849850065, + "learning_rate": 9.479842824253182e-07, + "loss": 0.3218, + "step": 5752 + }, + { + "epoch": 0.8068723702664796, + "grad_norm": 2.165996252167648, + "learning_rate": 9.466540374470845e-07, + "loss": 0.316, + "step": 5753 + }, + { + "epoch": 0.8070126227208976, + "grad_norm": 1.817937193474944, + "learning_rate": 9.453246288484713e-07, + "loss": 0.3506, + "step": 5754 + }, + { + "epoch": 0.8071528751753155, + "grad_norm": 2.0764247277383734, + "learning_rate": 9.439960569037943e-07, + "loss": 0.3247, + "step": 5755 + }, + { + "epoch": 0.8072931276297335, + "grad_norm": 2.2852675712401522, + "learning_rate": 9.426683218871918e-07, + "loss": 0.3002, + "step": 5756 + }, + { + "epoch": 0.8074333800841514, + "grad_norm": 2.443217394612114, + "learning_rate": 9.413414240726349e-07, + "loss": 0.3252, + "step": 5757 + }, + { + "epoch": 0.8075736325385694, + "grad_norm": 1.990428072831685, + "learning_rate": 9.400153637339182e-07, + "loss": 0.3132, + "step": 5758 + }, + { + "epoch": 0.8077138849929874, + "grad_norm": 1.68767101692836, + "learning_rate": 9.386901411446664e-07, + "loss": 0.3247, + "step": 5759 + }, + { + "epoch": 0.8078541374474053, + "grad_norm": 1.8257938095659758, + "learning_rate": 9.373657565783295e-07, + "loss": 0.3431, + "step": 5760 + }, + { + "epoch": 0.8079943899018233, + "grad_norm": 1.7401334003305737, + "learning_rate": 9.360422103081851e-07, + "loss": 0.3446, + "step": 5761 + }, + { + "epoch": 0.8081346423562412, + "grad_norm": 2.1734092288231954, + "learning_rate": 9.347195026073369e-07, + "loss": 0.3573, + "step": 5762 + }, + { + "epoch": 0.8082748948106592, + "grad_norm": 2.6941365648360462, + "learning_rate": 9.333976337487178e-07, + "loss": 0.3428, + "step": 5763 + }, + { + "epoch": 0.8084151472650771, + "grad_norm": 1.725155909030087, + "learning_rate": 9.32076604005086e-07, + "loss": 0.3378, + "step": 5764 + }, + { + "epoch": 0.8085553997194951, + "grad_norm": 1.7444695550214129, + "learning_rate": 9.307564136490255e-07, + "loss": 0.3244, + "step": 5765 + }, + { + "epoch": 0.808695652173913, + "grad_norm": 2.642887838840806, + "learning_rate": 9.294370629529503e-07, + "loss": 0.3453, + "step": 5766 + }, + { + "epoch": 0.808835904628331, + "grad_norm": 2.0914858062284405, + "learning_rate": 9.281185521890962e-07, + "loss": 0.3736, + "step": 5767 + }, + { + "epoch": 0.808976157082749, + "grad_norm": 2.053682610955385, + "learning_rate": 9.26800881629531e-07, + "loss": 0.3152, + "step": 5768 + }, + { + "epoch": 0.8091164095371669, + "grad_norm": 2.240800381956439, + "learning_rate": 9.254840515461455e-07, + "loss": 0.3729, + "step": 5769 + }, + { + "epoch": 0.8092566619915849, + "grad_norm": 3.133655006724875, + "learning_rate": 9.241680622106597e-07, + "loss": 0.3404, + "step": 5770 + }, + { + "epoch": 0.8093969144460028, + "grad_norm": 2.8585078691257793, + "learning_rate": 9.22852913894618e-07, + "loss": 0.3217, + "step": 5771 + }, + { + "epoch": 0.8095371669004208, + "grad_norm": 1.7039475860549054, + "learning_rate": 9.215386068693927e-07, + "loss": 0.3158, + "step": 5772 + }, + { + "epoch": 0.8096774193548387, + "grad_norm": 2.0100561289755268, + "learning_rate": 9.202251414061813e-07, + "loss": 0.3353, + "step": 5773 + }, + { + "epoch": 0.8098176718092567, + "grad_norm": 2.3034458759855787, + "learning_rate": 9.189125177760083e-07, + "loss": 0.3192, + "step": 5774 + }, + { + "epoch": 0.8099579242636746, + "grad_norm": 1.9723596330988504, + "learning_rate": 9.176007362497258e-07, + "loss": 0.3281, + "step": 5775 + }, + { + "epoch": 0.8100981767180926, + "grad_norm": 2.0757019142665087, + "learning_rate": 9.162897970980083e-07, + "loss": 0.3266, + "step": 5776 + }, + { + "epoch": 0.8102384291725105, + "grad_norm": 3.415380716619031, + "learning_rate": 9.149797005913602e-07, + "loss": 0.3546, + "step": 5777 + }, + { + "epoch": 0.8103786816269285, + "grad_norm": 1.8925355447835168, + "learning_rate": 9.136704470001101e-07, + "loss": 0.3493, + "step": 5778 + }, + { + "epoch": 0.8105189340813465, + "grad_norm": 2.4612542811366676, + "learning_rate": 9.123620365944147e-07, + "loss": 0.3535, + "step": 5779 + }, + { + "epoch": 0.8106591865357644, + "grad_norm": 2.3744879722780583, + "learning_rate": 9.110544696442542e-07, + "loss": 0.3671, + "step": 5780 + }, + { + "epoch": 0.8107994389901824, + "grad_norm": 2.46101248490671, + "learning_rate": 9.097477464194359e-07, + "loss": 0.3673, + "step": 5781 + }, + { + "epoch": 0.8109396914446003, + "grad_norm": 2.339743483087492, + "learning_rate": 9.084418671895939e-07, + "loss": 0.2883, + "step": 5782 + }, + { + "epoch": 0.8110799438990183, + "grad_norm": 1.8807450969834456, + "learning_rate": 9.071368322241864e-07, + "loss": 0.3713, + "step": 5783 + }, + { + "epoch": 0.8112201963534362, + "grad_norm": 1.9206210131468826, + "learning_rate": 9.058326417925001e-07, + "loss": 0.3804, + "step": 5784 + }, + { + "epoch": 0.8113604488078542, + "grad_norm": 1.8776017893133752, + "learning_rate": 9.045292961636426e-07, + "loss": 0.3398, + "step": 5785 + }, + { + "epoch": 0.8115007012622721, + "grad_norm": 1.8245361182750697, + "learning_rate": 9.032267956065516e-07, + "loss": 0.3392, + "step": 5786 + }, + { + "epoch": 0.8116409537166901, + "grad_norm": 2.105556847139408, + "learning_rate": 9.019251403899903e-07, + "loss": 0.3409, + "step": 5787 + }, + { + "epoch": 0.811781206171108, + "grad_norm": 7.110580845109208, + "learning_rate": 9.006243307825435e-07, + "loss": 0.3287, + "step": 5788 + }, + { + "epoch": 0.8119214586255259, + "grad_norm": 2.004054608718371, + "learning_rate": 8.993243670526258e-07, + "loss": 0.3089, + "step": 5789 + }, + { + "epoch": 0.8120617110799438, + "grad_norm": 1.739839759551441, + "learning_rate": 8.980252494684749e-07, + "loss": 0.3655, + "step": 5790 + }, + { + "epoch": 0.8122019635343618, + "grad_norm": 1.5793280217951071, + "learning_rate": 8.967269782981558e-07, + "loss": 0.3239, + "step": 5791 + }, + { + "epoch": 0.8123422159887798, + "grad_norm": 2.0103331896065493, + "learning_rate": 8.954295538095564e-07, + "loss": 0.3498, + "step": 5792 + }, + { + "epoch": 0.8124824684431977, + "grad_norm": 1.651150175393771, + "learning_rate": 8.941329762703921e-07, + "loss": 0.3118, + "step": 5793 + }, + { + "epoch": 0.8126227208976157, + "grad_norm": 2.254585785134493, + "learning_rate": 8.928372459482021e-07, + "loss": 0.356, + "step": 5794 + }, + { + "epoch": 0.8127629733520336, + "grad_norm": 2.126751794735609, + "learning_rate": 8.915423631103514e-07, + "loss": 0.3477, + "step": 5795 + }, + { + "epoch": 0.8129032258064516, + "grad_norm": 1.6356049528485896, + "learning_rate": 8.902483280240315e-07, + "loss": 0.3511, + "step": 5796 + }, + { + "epoch": 0.8130434782608695, + "grad_norm": 1.9790083035472674, + "learning_rate": 8.889551409562552e-07, + "loss": 0.3221, + "step": 5797 + }, + { + "epoch": 0.8131837307152875, + "grad_norm": 2.487666894115915, + "learning_rate": 8.876628021738631e-07, + "loss": 0.3378, + "step": 5798 + }, + { + "epoch": 0.8133239831697054, + "grad_norm": 2.0407927322970085, + "learning_rate": 8.863713119435208e-07, + "loss": 0.3623, + "step": 5799 + }, + { + "epoch": 0.8134642356241234, + "grad_norm": 2.1613033622124274, + "learning_rate": 8.850806705317183e-07, + "loss": 0.3229, + "step": 5800 + }, + { + "epoch": 0.8136044880785414, + "grad_norm": 2.0114786245493743, + "learning_rate": 8.8379087820477e-07, + "loss": 0.3531, + "step": 5801 + }, + { + "epoch": 0.8137447405329593, + "grad_norm": 1.941715453794252, + "learning_rate": 8.825019352288162e-07, + "loss": 0.3708, + "step": 5802 + }, + { + "epoch": 0.8138849929873773, + "grad_norm": 1.9075044350428787, + "learning_rate": 8.812138418698207e-07, + "loss": 0.324, + "step": 5803 + }, + { + "epoch": 0.8140252454417952, + "grad_norm": 1.5082312284161705, + "learning_rate": 8.799265983935734e-07, + "loss": 0.3293, + "step": 5804 + }, + { + "epoch": 0.8141654978962132, + "grad_norm": 3.839564083311218, + "learning_rate": 8.786402050656878e-07, + "loss": 0.3563, + "step": 5805 + }, + { + "epoch": 0.8143057503506311, + "grad_norm": 2.7640195162866656, + "learning_rate": 8.77354662151601e-07, + "loss": 0.3078, + "step": 5806 + }, + { + "epoch": 0.8144460028050491, + "grad_norm": 1.7297261110594955, + "learning_rate": 8.76069969916577e-07, + "loss": 0.3557, + "step": 5807 + }, + { + "epoch": 0.814586255259467, + "grad_norm": 1.9786278710717704, + "learning_rate": 8.747861286257031e-07, + "loss": 0.3463, + "step": 5808 + }, + { + "epoch": 0.814726507713885, + "grad_norm": 2.1937101224660274, + "learning_rate": 8.735031385438897e-07, + "loss": 0.3164, + "step": 5809 + }, + { + "epoch": 0.814866760168303, + "grad_norm": 3.2108051801830535, + "learning_rate": 8.722209999358738e-07, + "loss": 0.3168, + "step": 5810 + }, + { + "epoch": 0.8150070126227209, + "grad_norm": 2.5617230796021184, + "learning_rate": 8.709397130662151e-07, + "loss": 0.3707, + "step": 5811 + }, + { + "epoch": 0.8151472650771389, + "grad_norm": 1.6767654907548426, + "learning_rate": 8.696592781992991e-07, + "loss": 0.3561, + "step": 5812 + }, + { + "epoch": 0.8152875175315568, + "grad_norm": 1.7958386293526407, + "learning_rate": 8.68379695599334e-07, + "loss": 0.3264, + "step": 5813 + }, + { + "epoch": 0.8154277699859748, + "grad_norm": 1.8976315643978994, + "learning_rate": 8.671009655303531e-07, + "loss": 0.3521, + "step": 5814 + }, + { + "epoch": 0.8155680224403927, + "grad_norm": 1.704093028049904, + "learning_rate": 8.658230882562135e-07, + "loss": 0.3058, + "step": 5815 + }, + { + "epoch": 0.8157082748948107, + "grad_norm": 2.0508220559556336, + "learning_rate": 8.645460640405967e-07, + "loss": 0.3677, + "step": 5816 + }, + { + "epoch": 0.8158485273492286, + "grad_norm": 3.168897161132635, + "learning_rate": 8.632698931470063e-07, + "loss": 0.356, + "step": 5817 + }, + { + "epoch": 0.8159887798036466, + "grad_norm": 1.8862199775627215, + "learning_rate": 8.619945758387716e-07, + "loss": 0.3351, + "step": 5818 + }, + { + "epoch": 0.8161290322580645, + "grad_norm": 1.7788708508654691, + "learning_rate": 8.60720112379046e-07, + "loss": 0.3257, + "step": 5819 + }, + { + "epoch": 0.8162692847124825, + "grad_norm": 2.1905164357056157, + "learning_rate": 8.594465030308052e-07, + "loss": 0.3206, + "step": 5820 + }, + { + "epoch": 0.8164095371669005, + "grad_norm": 1.8315833872131473, + "learning_rate": 8.581737480568514e-07, + "loss": 0.3528, + "step": 5821 + }, + { + "epoch": 0.8165497896213184, + "grad_norm": 1.62700815096304, + "learning_rate": 8.569018477198065e-07, + "loss": 0.3356, + "step": 5822 + }, + { + "epoch": 0.8166900420757364, + "grad_norm": 1.7163175536528217, + "learning_rate": 8.556308022821202e-07, + "loss": 0.3166, + "step": 5823 + }, + { + "epoch": 0.8168302945301543, + "grad_norm": 2.0827485471781526, + "learning_rate": 8.543606120060627e-07, + "loss": 0.3316, + "step": 5824 + }, + { + "epoch": 0.8169705469845723, + "grad_norm": 2.397592998526746, + "learning_rate": 8.530912771537303e-07, + "loss": 0.3386, + "step": 5825 + }, + { + "epoch": 0.8171107994389902, + "grad_norm": 2.0602279633245097, + "learning_rate": 8.518227979870392e-07, + "loss": 0.3686, + "step": 5826 + }, + { + "epoch": 0.8172510518934082, + "grad_norm": 1.5238339421822042, + "learning_rate": 8.505551747677321e-07, + "loss": 0.3418, + "step": 5827 + }, + { + "epoch": 0.8173913043478261, + "grad_norm": 2.147094417725767, + "learning_rate": 8.492884077573749e-07, + "loss": 0.3196, + "step": 5828 + }, + { + "epoch": 0.817531556802244, + "grad_norm": 1.8971183180883944, + "learning_rate": 8.480224972173562e-07, + "loss": 0.3312, + "step": 5829 + }, + { + "epoch": 0.8176718092566619, + "grad_norm": 1.7751109424359743, + "learning_rate": 8.46757443408886e-07, + "loss": 0.3139, + "step": 5830 + }, + { + "epoch": 0.8178120617110799, + "grad_norm": 2.3422132413033654, + "learning_rate": 8.45493246593001e-07, + "loss": 0.3481, + "step": 5831 + }, + { + "epoch": 0.8179523141654979, + "grad_norm": 1.8702055349970348, + "learning_rate": 8.442299070305582e-07, + "loss": 0.3601, + "step": 5832 + }, + { + "epoch": 0.8180925666199158, + "grad_norm": 1.88676588422135, + "learning_rate": 8.429674249822401e-07, + "loss": 0.3621, + "step": 5833 + }, + { + "epoch": 0.8182328190743338, + "grad_norm": 1.7514782520352044, + "learning_rate": 8.417058007085505e-07, + "loss": 0.3692, + "step": 5834 + }, + { + "epoch": 0.8183730715287517, + "grad_norm": 2.0023376393308654, + "learning_rate": 8.404450344698167e-07, + "loss": 0.3087, + "step": 5835 + }, + { + "epoch": 0.8185133239831697, + "grad_norm": 2.36910278592, + "learning_rate": 8.391851265261886e-07, + "loss": 0.3207, + "step": 5836 + }, + { + "epoch": 0.8186535764375876, + "grad_norm": 1.7855773696181456, + "learning_rate": 8.379260771376419e-07, + "loss": 0.2824, + "step": 5837 + }, + { + "epoch": 0.8187938288920056, + "grad_norm": 2.798208078834312, + "learning_rate": 8.366678865639688e-07, + "loss": 0.3459, + "step": 5838 + }, + { + "epoch": 0.8189340813464235, + "grad_norm": 1.8766746671696948, + "learning_rate": 8.354105550647901e-07, + "loss": 0.363, + "step": 5839 + }, + { + "epoch": 0.8190743338008415, + "grad_norm": 1.9517239848167403, + "learning_rate": 8.341540828995476e-07, + "loss": 0.3131, + "step": 5840 + }, + { + "epoch": 0.8192145862552594, + "grad_norm": 1.5362395912109557, + "learning_rate": 8.32898470327505e-07, + "loss": 0.2952, + "step": 5841 + }, + { + "epoch": 0.8193548387096774, + "grad_norm": 2.3947829445614692, + "learning_rate": 8.316437176077491e-07, + "loss": 0.3508, + "step": 5842 + }, + { + "epoch": 0.8194950911640954, + "grad_norm": 2.0311315648690442, + "learning_rate": 8.303898249991899e-07, + "loss": 0.3039, + "step": 5843 + }, + { + "epoch": 0.8196353436185133, + "grad_norm": 1.9421654182093329, + "learning_rate": 8.291367927605592e-07, + "loss": 0.3526, + "step": 5844 + }, + { + "epoch": 0.8197755960729313, + "grad_norm": 2.7752537757374323, + "learning_rate": 8.27884621150411e-07, + "loss": 0.3279, + "step": 5845 + }, + { + "epoch": 0.8199158485273492, + "grad_norm": 1.7460376295249311, + "learning_rate": 8.266333104271241e-07, + "loss": 0.3221, + "step": 5846 + }, + { + "epoch": 0.8200561009817672, + "grad_norm": 2.0667482031868265, + "learning_rate": 8.253828608488946e-07, + "loss": 0.3135, + "step": 5847 + }, + { + "epoch": 0.8201963534361851, + "grad_norm": 1.9865050410550704, + "learning_rate": 8.241332726737455e-07, + "loss": 0.3021, + "step": 5848 + }, + { + "epoch": 0.8203366058906031, + "grad_norm": 1.5112588661003188, + "learning_rate": 8.228845461595225e-07, + "loss": 0.3727, + "step": 5849 + }, + { + "epoch": 0.820476858345021, + "grad_norm": 2.0461429131228472, + "learning_rate": 8.216366815638882e-07, + "loss": 0.3135, + "step": 5850 + }, + { + "epoch": 0.820617110799439, + "grad_norm": 2.0007325261449407, + "learning_rate": 8.203896791443322e-07, + "loss": 0.3279, + "step": 5851 + }, + { + "epoch": 0.820757363253857, + "grad_norm": 1.599781516178261, + "learning_rate": 8.191435391581648e-07, + "loss": 0.3065, + "step": 5852 + }, + { + "epoch": 0.8208976157082749, + "grad_norm": 1.703463669448008, + "learning_rate": 8.178982618625186e-07, + "loss": 0.3008, + "step": 5853 + }, + { + "epoch": 0.8210378681626929, + "grad_norm": 2.1060813904238365, + "learning_rate": 8.16653847514347e-07, + "loss": 0.3158, + "step": 5854 + }, + { + "epoch": 0.8211781206171108, + "grad_norm": 1.9188823815818137, + "learning_rate": 8.154102963704274e-07, + "loss": 0.2834, + "step": 5855 + }, + { + "epoch": 0.8213183730715288, + "grad_norm": 1.7954208460850856, + "learning_rate": 8.141676086873574e-07, + "loss": 0.3243, + "step": 5856 + }, + { + "epoch": 0.8214586255259467, + "grad_norm": 1.8033431689733954, + "learning_rate": 8.129257847215571e-07, + "loss": 0.3648, + "step": 5857 + }, + { + "epoch": 0.8215988779803647, + "grad_norm": 1.750238402340337, + "learning_rate": 8.116848247292674e-07, + "loss": 0.3353, + "step": 5858 + }, + { + "epoch": 0.8217391304347826, + "grad_norm": 2.0612695665049636, + "learning_rate": 8.104447289665523e-07, + "loss": 0.3537, + "step": 5859 + }, + { + "epoch": 0.8218793828892006, + "grad_norm": 1.6407404541082031, + "learning_rate": 8.092054976892966e-07, + "loss": 0.317, + "step": 5860 + }, + { + "epoch": 0.8220196353436185, + "grad_norm": 3.5031199854220216, + "learning_rate": 8.079671311532072e-07, + "loss": 0.3405, + "step": 5861 + }, + { + "epoch": 0.8221598877980365, + "grad_norm": 2.935043769003174, + "learning_rate": 8.067296296138128e-07, + "loss": 0.3584, + "step": 5862 + }, + { + "epoch": 0.8223001402524545, + "grad_norm": 2.8804809735594654, + "learning_rate": 8.054929933264626e-07, + "loss": 0.3662, + "step": 5863 + }, + { + "epoch": 0.8224403927068724, + "grad_norm": 1.9098530472150959, + "learning_rate": 8.04257222546328e-07, + "loss": 0.4007, + "step": 5864 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 2.869083120913703, + "learning_rate": 8.030223175284019e-07, + "loss": 0.3405, + "step": 5865 + }, + { + "epoch": 0.8227208976157083, + "grad_norm": 2.2868398870841253, + "learning_rate": 8.017882785274988e-07, + "loss": 0.3353, + "step": 5866 + }, + { + "epoch": 0.8228611500701263, + "grad_norm": 1.9512142142436164, + "learning_rate": 8.005551057982531e-07, + "loss": 0.3313, + "step": 5867 + }, + { + "epoch": 0.8230014025245442, + "grad_norm": 1.6856202853274438, + "learning_rate": 7.993227995951208e-07, + "loss": 0.3183, + "step": 5868 + }, + { + "epoch": 0.8231416549789621, + "grad_norm": 2.0861045970496166, + "learning_rate": 7.980913601723811e-07, + "loss": 0.2856, + "step": 5869 + }, + { + "epoch": 0.82328190743338, + "grad_norm": 2.25674561105616, + "learning_rate": 7.968607877841333e-07, + "loss": 0.3462, + "step": 5870 + }, + { + "epoch": 0.823422159887798, + "grad_norm": 2.233675760616221, + "learning_rate": 7.956310826842955e-07, + "loss": 0.3526, + "step": 5871 + }, + { + "epoch": 0.8235624123422159, + "grad_norm": 1.7621768601081615, + "learning_rate": 7.944022451266098e-07, + "loss": 0.3397, + "step": 5872 + }, + { + "epoch": 0.8237026647966339, + "grad_norm": 2.678401360335284, + "learning_rate": 7.931742753646382e-07, + "loss": 0.3252, + "step": 5873 + }, + { + "epoch": 0.8238429172510519, + "grad_norm": 1.8346189918261142, + "learning_rate": 7.919471736517631e-07, + "loss": 0.3271, + "step": 5874 + }, + { + "epoch": 0.8239831697054698, + "grad_norm": 2.310120260237604, + "learning_rate": 7.907209402411897e-07, + "loss": 0.3598, + "step": 5875 + }, + { + "epoch": 0.8241234221598878, + "grad_norm": 1.8612980234728231, + "learning_rate": 7.894955753859412e-07, + "loss": 0.3267, + "step": 5876 + }, + { + "epoch": 0.8242636746143057, + "grad_norm": 1.676075018184822, + "learning_rate": 7.882710793388643e-07, + "loss": 0.3227, + "step": 5877 + }, + { + "epoch": 0.8244039270687237, + "grad_norm": 1.859567712365909, + "learning_rate": 7.870474523526262e-07, + "loss": 0.3273, + "step": 5878 + }, + { + "epoch": 0.8245441795231416, + "grad_norm": 2.1535659892093277, + "learning_rate": 7.858246946797104e-07, + "loss": 0.2998, + "step": 5879 + }, + { + "epoch": 0.8246844319775596, + "grad_norm": 1.7578729915806797, + "learning_rate": 7.846028065724264e-07, + "loss": 0.3407, + "step": 5880 + }, + { + "epoch": 0.8248246844319775, + "grad_norm": 1.6492877231994678, + "learning_rate": 7.833817882829025e-07, + "loss": 0.3128, + "step": 5881 + }, + { + "epoch": 0.8249649368863955, + "grad_norm": 2.6111116398028478, + "learning_rate": 7.821616400630866e-07, + "loss": 0.3265, + "step": 5882 + }, + { + "epoch": 0.8251051893408134, + "grad_norm": 2.957570761239578, + "learning_rate": 7.809423621647483e-07, + "loss": 0.3168, + "step": 5883 + }, + { + "epoch": 0.8252454417952314, + "grad_norm": 1.9169339187169225, + "learning_rate": 7.79723954839477e-07, + "loss": 0.3645, + "step": 5884 + }, + { + "epoch": 0.8253856942496494, + "grad_norm": 1.5261219337714407, + "learning_rate": 7.785064183386826e-07, + "loss": 0.3282, + "step": 5885 + }, + { + "epoch": 0.8255259467040673, + "grad_norm": 2.8281430407206876, + "learning_rate": 7.772897529135947e-07, + "loss": 0.3213, + "step": 5886 + }, + { + "epoch": 0.8256661991584853, + "grad_norm": 1.9784204247364008, + "learning_rate": 7.760739588152655e-07, + "loss": 0.314, + "step": 5887 + }, + { + "epoch": 0.8258064516129032, + "grad_norm": 2.4941880379205292, + "learning_rate": 7.74859036294563e-07, + "loss": 0.3144, + "step": 5888 + }, + { + "epoch": 0.8259467040673212, + "grad_norm": 1.9567120990428433, + "learning_rate": 7.736449856021788e-07, + "loss": 0.3128, + "step": 5889 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 1.8834135750187262, + "learning_rate": 7.72431806988625e-07, + "loss": 0.3328, + "step": 5890 + }, + { + "epoch": 0.8262272089761571, + "grad_norm": 1.8822348567268872, + "learning_rate": 7.712195007042322e-07, + "loss": 0.3549, + "step": 5891 + }, + { + "epoch": 0.826367461430575, + "grad_norm": 1.8272646974553661, + "learning_rate": 7.7000806699915e-07, + "loss": 0.3552, + "step": 5892 + }, + { + "epoch": 0.826507713884993, + "grad_norm": 1.8917971673839595, + "learning_rate": 7.687975061233499e-07, + "loss": 0.3492, + "step": 5893 + }, + { + "epoch": 0.826647966339411, + "grad_norm": 1.9909909114160795, + "learning_rate": 7.675878183266228e-07, + "loss": 0.3274, + "step": 5894 + }, + { + "epoch": 0.8267882187938289, + "grad_norm": 1.8233574543228601, + "learning_rate": 7.663790038585794e-07, + "loss": 0.3108, + "step": 5895 + }, + { + "epoch": 0.8269284712482469, + "grad_norm": 1.8419453469037088, + "learning_rate": 7.651710629686504e-07, + "loss": 0.3397, + "step": 5896 + }, + { + "epoch": 0.8270687237026648, + "grad_norm": 1.6867987977292953, + "learning_rate": 7.639639959060857e-07, + "loss": 0.3742, + "step": 5897 + }, + { + "epoch": 0.8272089761570828, + "grad_norm": 2.34353250130135, + "learning_rate": 7.627578029199562e-07, + "loss": 0.3365, + "step": 5898 + }, + { + "epoch": 0.8273492286115007, + "grad_norm": 1.7427913149701546, + "learning_rate": 7.615524842591493e-07, + "loss": 0.3783, + "step": 5899 + }, + { + "epoch": 0.8274894810659187, + "grad_norm": 1.8866160741948856, + "learning_rate": 7.603480401723745e-07, + "loss": 0.3301, + "step": 5900 + }, + { + "epoch": 0.8276297335203366, + "grad_norm": 1.851636796579322, + "learning_rate": 7.591444709081619e-07, + "loss": 0.335, + "step": 5901 + }, + { + "epoch": 0.8277699859747546, + "grad_norm": 2.039721386152992, + "learning_rate": 7.579417767148583e-07, + "loss": 0.3804, + "step": 5902 + }, + { + "epoch": 0.8279102384291726, + "grad_norm": 1.9702069779892553, + "learning_rate": 7.56739957840632e-07, + "loss": 0.3229, + "step": 5903 + }, + { + "epoch": 0.8280504908835905, + "grad_norm": 1.8008265955050329, + "learning_rate": 7.555390145334696e-07, + "loss": 0.3151, + "step": 5904 + }, + { + "epoch": 0.8281907433380085, + "grad_norm": 2.0113235826231723, + "learning_rate": 7.543389470411772e-07, + "loss": 0.3157, + "step": 5905 + }, + { + "epoch": 0.8283309957924264, + "grad_norm": 1.8861464744152876, + "learning_rate": 7.531397556113806e-07, + "loss": 0.3626, + "step": 5906 + }, + { + "epoch": 0.8284712482468444, + "grad_norm": 1.8546790440148893, + "learning_rate": 7.519414404915254e-07, + "loss": 0.3247, + "step": 5907 + }, + { + "epoch": 0.8286115007012623, + "grad_norm": 2.0027327940614494, + "learning_rate": 7.507440019288742e-07, + "loss": 0.3327, + "step": 5908 + }, + { + "epoch": 0.8287517531556802, + "grad_norm": 2.647663374660763, + "learning_rate": 7.4954744017051e-07, + "loss": 0.3403, + "step": 5909 + }, + { + "epoch": 0.8288920056100981, + "grad_norm": 1.8293053492615146, + "learning_rate": 7.483517554633357e-07, + "loss": 0.3211, + "step": 5910 + }, + { + "epoch": 0.8290322580645161, + "grad_norm": 3.723512949241147, + "learning_rate": 7.471569480540725e-07, + "loss": 0.3301, + "step": 5911 + }, + { + "epoch": 0.829172510518934, + "grad_norm": 2.2210324864319597, + "learning_rate": 7.459630181892608e-07, + "loss": 0.2945, + "step": 5912 + }, + { + "epoch": 0.829312762973352, + "grad_norm": 2.494243479305422, + "learning_rate": 7.447699661152586e-07, + "loss": 0.3431, + "step": 5913 + }, + { + "epoch": 0.82945301542777, + "grad_norm": 2.2497983418270935, + "learning_rate": 7.435777920782444e-07, + "loss": 0.3238, + "step": 5914 + }, + { + "epoch": 0.8295932678821879, + "grad_norm": 1.6138152561837147, + "learning_rate": 7.423864963242155e-07, + "loss": 0.3337, + "step": 5915 + }, + { + "epoch": 0.8297335203366059, + "grad_norm": 1.659031745193029, + "learning_rate": 7.411960790989863e-07, + "loss": 0.3253, + "step": 5916 + }, + { + "epoch": 0.8298737727910238, + "grad_norm": 1.596046364146649, + "learning_rate": 7.400065406481926e-07, + "loss": 0.3564, + "step": 5917 + }, + { + "epoch": 0.8300140252454418, + "grad_norm": 2.411427958374746, + "learning_rate": 7.388178812172859e-07, + "loss": 0.3284, + "step": 5918 + }, + { + "epoch": 0.8301542776998597, + "grad_norm": 1.6704416280422771, + "learning_rate": 7.376301010515397e-07, + "loss": 0.3316, + "step": 5919 + }, + { + "epoch": 0.8302945301542777, + "grad_norm": 2.0502537221438706, + "learning_rate": 7.36443200396042e-07, + "loss": 0.3456, + "step": 5920 + }, + { + "epoch": 0.8304347826086956, + "grad_norm": 2.784069903405944, + "learning_rate": 7.352571794957025e-07, + "loss": 0.2897, + "step": 5921 + }, + { + "epoch": 0.8305750350631136, + "grad_norm": 1.8757480400733109, + "learning_rate": 7.340720385952476e-07, + "loss": 0.3049, + "step": 5922 + }, + { + "epoch": 0.8307152875175315, + "grad_norm": 1.7903903880716123, + "learning_rate": 7.328877779392235e-07, + "loss": 0.3363, + "step": 5923 + }, + { + "epoch": 0.8308555399719495, + "grad_norm": 1.9409527930248527, + "learning_rate": 7.317043977719945e-07, + "loss": 0.3838, + "step": 5924 + }, + { + "epoch": 0.8309957924263675, + "grad_norm": 1.875694464185648, + "learning_rate": 7.305218983377422e-07, + "loss": 0.2978, + "step": 5925 + }, + { + "epoch": 0.8311360448807854, + "grad_norm": 2.2371251181142955, + "learning_rate": 7.293402798804667e-07, + "loss": 0.3491, + "step": 5926 + }, + { + "epoch": 0.8312762973352034, + "grad_norm": 2.176854853223327, + "learning_rate": 7.281595426439875e-07, + "loss": 0.3173, + "step": 5927 + }, + { + "epoch": 0.8314165497896213, + "grad_norm": 2.2230408776523607, + "learning_rate": 7.269796868719426e-07, + "loss": 0.3249, + "step": 5928 + }, + { + "epoch": 0.8315568022440393, + "grad_norm": 7.15568893432043, + "learning_rate": 7.258007128077843e-07, + "loss": 0.3476, + "step": 5929 + }, + { + "epoch": 0.8316970546984572, + "grad_norm": 1.735562579495822, + "learning_rate": 7.24622620694787e-07, + "loss": 0.334, + "step": 5930 + }, + { + "epoch": 0.8318373071528752, + "grad_norm": 2.2811862466343023, + "learning_rate": 7.23445410776042e-07, + "loss": 0.3338, + "step": 5931 + }, + { + "epoch": 0.8319775596072931, + "grad_norm": 1.8630811155948452, + "learning_rate": 7.222690832944579e-07, + "loss": 0.3498, + "step": 5932 + }, + { + "epoch": 0.8321178120617111, + "grad_norm": 1.8348565348534098, + "learning_rate": 7.210936384927631e-07, + "loss": 0.3594, + "step": 5933 + }, + { + "epoch": 0.832258064516129, + "grad_norm": 1.609483511516469, + "learning_rate": 7.199190766135001e-07, + "loss": 0.3169, + "step": 5934 + }, + { + "epoch": 0.832398316970547, + "grad_norm": 2.3220203420912773, + "learning_rate": 7.187453978990328e-07, + "loss": 0.3099, + "step": 5935 + }, + { + "epoch": 0.832538569424965, + "grad_norm": 1.7907693375338614, + "learning_rate": 7.175726025915409e-07, + "loss": 0.3197, + "step": 5936 + }, + { + "epoch": 0.8326788218793829, + "grad_norm": 1.9922263945728322, + "learning_rate": 7.164006909330234e-07, + "loss": 0.3125, + "step": 5937 + }, + { + "epoch": 0.8328190743338009, + "grad_norm": 2.0490245452200084, + "learning_rate": 7.152296631652955e-07, + "loss": 0.3466, + "step": 5938 + }, + { + "epoch": 0.8329593267882188, + "grad_norm": 2.789434840889348, + "learning_rate": 7.140595195299921e-07, + "loss": 0.3455, + "step": 5939 + }, + { + "epoch": 0.8330995792426368, + "grad_norm": 2.6088231582509787, + "learning_rate": 7.128902602685617e-07, + "loss": 0.3696, + "step": 5940 + }, + { + "epoch": 0.8332398316970547, + "grad_norm": 2.2986714091720395, + "learning_rate": 7.117218856222741e-07, + "loss": 0.3017, + "step": 5941 + }, + { + "epoch": 0.8333800841514727, + "grad_norm": 2.6683109896135573, + "learning_rate": 7.105543958322154e-07, + "loss": 0.3158, + "step": 5942 + }, + { + "epoch": 0.8335203366058906, + "grad_norm": 1.557099661159203, + "learning_rate": 7.093877911392882e-07, + "loss": 0.3166, + "step": 5943 + }, + { + "epoch": 0.8336605890603086, + "grad_norm": 1.942696708528512, + "learning_rate": 7.082220717842137e-07, + "loss": 0.3142, + "step": 5944 + }, + { + "epoch": 0.8338008415147266, + "grad_norm": 1.9173665703774032, + "learning_rate": 7.070572380075302e-07, + "loss": 0.3275, + "step": 5945 + }, + { + "epoch": 0.8339410939691445, + "grad_norm": 1.93024180246546, + "learning_rate": 7.058932900495929e-07, + "loss": 0.3745, + "step": 5946 + }, + { + "epoch": 0.8340813464235625, + "grad_norm": 2.787778585565696, + "learning_rate": 7.047302281505735e-07, + "loss": 0.363, + "step": 5947 + }, + { + "epoch": 0.8342215988779804, + "grad_norm": 1.7713322852814406, + "learning_rate": 7.03568052550464e-07, + "loss": 0.3292, + "step": 5948 + }, + { + "epoch": 0.8343618513323983, + "grad_norm": 2.116686096502722, + "learning_rate": 7.024067634890686e-07, + "loss": 0.3577, + "step": 5949 + }, + { + "epoch": 0.8345021037868162, + "grad_norm": 2.8753280452494203, + "learning_rate": 7.012463612060122e-07, + "loss": 0.3252, + "step": 5950 + }, + { + "epoch": 0.8346423562412342, + "grad_norm": 2.0730617478812743, + "learning_rate": 7.000868459407357e-07, + "loss": 0.341, + "step": 5951 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 2.5781062269972286, + "learning_rate": 6.989282179324963e-07, + "loss": 0.338, + "step": 5952 + }, + { + "epoch": 0.8349228611500701, + "grad_norm": 1.896673128299176, + "learning_rate": 6.977704774203703e-07, + "loss": 0.3187, + "step": 5953 + }, + { + "epoch": 0.835063113604488, + "grad_norm": 1.585913897559515, + "learning_rate": 6.966136246432492e-07, + "loss": 0.3353, + "step": 5954 + }, + { + "epoch": 0.835203366058906, + "grad_norm": 2.496019914517064, + "learning_rate": 6.954576598398399e-07, + "loss": 0.3523, + "step": 5955 + }, + { + "epoch": 0.835343618513324, + "grad_norm": 2.017724060511966, + "learning_rate": 6.943025832486682e-07, + "loss": 0.3164, + "step": 5956 + }, + { + "epoch": 0.8354838709677419, + "grad_norm": 4.432005583528933, + "learning_rate": 6.931483951080769e-07, + "loss": 0.3534, + "step": 5957 + }, + { + "epoch": 0.8356241234221599, + "grad_norm": 1.5581541277951183, + "learning_rate": 6.919950956562244e-07, + "loss": 0.3022, + "step": 5958 + }, + { + "epoch": 0.8357643758765778, + "grad_norm": 1.772605936303927, + "learning_rate": 6.908426851310851e-07, + "loss": 0.3231, + "step": 5959 + }, + { + "epoch": 0.8359046283309958, + "grad_norm": 5.7808936756744735, + "learning_rate": 6.896911637704534e-07, + "loss": 0.2925, + "step": 5960 + }, + { + "epoch": 0.8360448807854137, + "grad_norm": 1.6853368104454178, + "learning_rate": 6.885405318119342e-07, + "loss": 0.3098, + "step": 5961 + }, + { + "epoch": 0.8361851332398317, + "grad_norm": 3.7081130030180924, + "learning_rate": 6.873907894929543e-07, + "loss": 0.3661, + "step": 5962 + }, + { + "epoch": 0.8363253856942496, + "grad_norm": 2.076226306391961, + "learning_rate": 6.862419370507545e-07, + "loss": 0.3236, + "step": 5963 + }, + { + "epoch": 0.8364656381486676, + "grad_norm": 1.8623008467940207, + "learning_rate": 6.850939747223928e-07, + "loss": 0.3675, + "step": 5964 + }, + { + "epoch": 0.8366058906030855, + "grad_norm": 1.93870445468513, + "learning_rate": 6.839469027447431e-07, + "loss": 0.3488, + "step": 5965 + }, + { + "epoch": 0.8367461430575035, + "grad_norm": 1.8637422195364273, + "learning_rate": 6.828007213544957e-07, + "loss": 0.3689, + "step": 5966 + }, + { + "epoch": 0.8368863955119215, + "grad_norm": 2.124894756628103, + "learning_rate": 6.816554307881574e-07, + "loss": 0.3196, + "step": 5967 + }, + { + "epoch": 0.8370266479663394, + "grad_norm": 4.484146590845727, + "learning_rate": 6.805110312820501e-07, + "loss": 0.3567, + "step": 5968 + }, + { + "epoch": 0.8371669004207574, + "grad_norm": 2.032548694660719, + "learning_rate": 6.793675230723145e-07, + "loss": 0.3294, + "step": 5969 + }, + { + "epoch": 0.8373071528751753, + "grad_norm": 1.621800353697337, + "learning_rate": 6.782249063949031e-07, + "loss": 0.3409, + "step": 5970 + }, + { + "epoch": 0.8374474053295933, + "grad_norm": 1.6870937564848512, + "learning_rate": 6.770831814855882e-07, + "loss": 0.3403, + "step": 5971 + }, + { + "epoch": 0.8375876577840112, + "grad_norm": 2.0616989892241713, + "learning_rate": 6.75942348579956e-07, + "loss": 0.3305, + "step": 5972 + }, + { + "epoch": 0.8377279102384292, + "grad_norm": 2.049175920231566, + "learning_rate": 6.748024079134102e-07, + "loss": 0.3531, + "step": 5973 + }, + { + "epoch": 0.8378681626928471, + "grad_norm": 2.2037848167676723, + "learning_rate": 6.736633597211706e-07, + "loss": 0.3244, + "step": 5974 + }, + { + "epoch": 0.8380084151472651, + "grad_norm": 2.3520586371216377, + "learning_rate": 6.725252042382691e-07, + "loss": 0.3406, + "step": 5975 + }, + { + "epoch": 0.838148667601683, + "grad_norm": 1.9766252228807857, + "learning_rate": 6.713879416995572e-07, + "loss": 0.3666, + "step": 5976 + }, + { + "epoch": 0.838288920056101, + "grad_norm": 2.012382679238463, + "learning_rate": 6.702515723397024e-07, + "loss": 0.3268, + "step": 5977 + }, + { + "epoch": 0.838429172510519, + "grad_norm": 2.3056053273702646, + "learning_rate": 6.691160963931848e-07, + "loss": 0.3311, + "step": 5978 + }, + { + "epoch": 0.8385694249649369, + "grad_norm": 1.7251773300797073, + "learning_rate": 6.67981514094303e-07, + "loss": 0.3414, + "step": 5979 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 2.131301736063884, + "learning_rate": 6.668478256771716e-07, + "loss": 0.3187, + "step": 5980 + }, + { + "epoch": 0.8388499298737728, + "grad_norm": 2.4760383508574724, + "learning_rate": 6.657150313757155e-07, + "loss": 0.3751, + "step": 5981 + }, + { + "epoch": 0.8389901823281908, + "grad_norm": 2.2141479260339847, + "learning_rate": 6.645831314236817e-07, + "loss": 0.3494, + "step": 5982 + }, + { + "epoch": 0.8391304347826087, + "grad_norm": 1.9183240510907655, + "learning_rate": 6.634521260546289e-07, + "loss": 0.2969, + "step": 5983 + }, + { + "epoch": 0.8392706872370267, + "grad_norm": 2.6188554146246297, + "learning_rate": 6.623220155019322e-07, + "loss": 0.3421, + "step": 5984 + }, + { + "epoch": 0.8394109396914446, + "grad_norm": 2.2504458515506918, + "learning_rate": 6.611927999987821e-07, + "loss": 0.39, + "step": 5985 + }, + { + "epoch": 0.8395511921458626, + "grad_norm": 1.8906382410515796, + "learning_rate": 6.600644797781847e-07, + "loss": 0.3552, + "step": 5986 + }, + { + "epoch": 0.8396914446002806, + "grad_norm": 3.430929528228952, + "learning_rate": 6.589370550729607e-07, + "loss": 0.3432, + "step": 5987 + }, + { + "epoch": 0.8398316970546985, + "grad_norm": 2.069211271094334, + "learning_rate": 6.578105261157464e-07, + "loss": 0.3252, + "step": 5988 + }, + { + "epoch": 0.8399719495091164, + "grad_norm": 2.2624390398949368, + "learning_rate": 6.566848931389935e-07, + "loss": 0.3826, + "step": 5989 + }, + { + "epoch": 0.8401122019635343, + "grad_norm": 2.1597756573062346, + "learning_rate": 6.555601563749675e-07, + "loss": 0.321, + "step": 5990 + }, + { + "epoch": 0.8402524544179523, + "grad_norm": 1.681750640637653, + "learning_rate": 6.54436316055751e-07, + "loss": 0.3177, + "step": 5991 + }, + { + "epoch": 0.8403927068723702, + "grad_norm": 2.0913650822141245, + "learning_rate": 6.533133724132396e-07, + "loss": 0.3432, + "step": 5992 + }, + { + "epoch": 0.8405329593267882, + "grad_norm": 1.9211184865779194, + "learning_rate": 6.521913256791457e-07, + "loss": 0.3443, + "step": 5993 + }, + { + "epoch": 0.8406732117812061, + "grad_norm": 2.02490699460296, + "learning_rate": 6.510701760849952e-07, + "loss": 0.3535, + "step": 5994 + }, + { + "epoch": 0.8408134642356241, + "grad_norm": 2.0128200442381976, + "learning_rate": 6.499499238621315e-07, + "loss": 0.3582, + "step": 5995 + }, + { + "epoch": 0.840953716690042, + "grad_norm": 2.044152232493797, + "learning_rate": 6.488305692417074e-07, + "loss": 0.3635, + "step": 5996 + }, + { + "epoch": 0.84109396914446, + "grad_norm": 2.027611158077854, + "learning_rate": 6.477121124546965e-07, + "loss": 0.3121, + "step": 5997 + }, + { + "epoch": 0.841234221598878, + "grad_norm": 1.6533406454098938, + "learning_rate": 6.46594553731883e-07, + "loss": 0.3346, + "step": 5998 + }, + { + "epoch": 0.8413744740532959, + "grad_norm": 2.2162560429197526, + "learning_rate": 6.454778933038681e-07, + "loss": 0.3244, + "step": 5999 + }, + { + "epoch": 0.8415147265077139, + "grad_norm": 2.014075459866966, + "learning_rate": 6.443621314010673e-07, + "loss": 0.3673, + "step": 6000 + }, + { + "epoch": 0.8416549789621318, + "grad_norm": 2.2366298155536892, + "learning_rate": 6.432472682537105e-07, + "loss": 0.3507, + "step": 6001 + }, + { + "epoch": 0.8417952314165498, + "grad_norm": 1.8507162486070836, + "learning_rate": 6.421333040918398e-07, + "loss": 0.3528, + "step": 6002 + }, + { + "epoch": 0.8419354838709677, + "grad_norm": 2.208053811094836, + "learning_rate": 6.410202391453157e-07, + "loss": 0.3512, + "step": 6003 + }, + { + "epoch": 0.8420757363253857, + "grad_norm": 2.6097733427703576, + "learning_rate": 6.399080736438113e-07, + "loss": 0.3787, + "step": 6004 + }, + { + "epoch": 0.8422159887798036, + "grad_norm": 2.5709288997188677, + "learning_rate": 6.387968078168133e-07, + "loss": 0.3308, + "step": 6005 + }, + { + "epoch": 0.8423562412342216, + "grad_norm": 1.6977866967506767, + "learning_rate": 6.376864418936246e-07, + "loss": 0.3189, + "step": 6006 + }, + { + "epoch": 0.8424964936886395, + "grad_norm": 2.069032899811632, + "learning_rate": 6.365769761033608e-07, + "loss": 0.3866, + "step": 6007 + }, + { + "epoch": 0.8426367461430575, + "grad_norm": 2.2906525072889536, + "learning_rate": 6.354684106749531e-07, + "loss": 0.298, + "step": 6008 + }, + { + "epoch": 0.8427769985974755, + "grad_norm": 2.8560573320911984, + "learning_rate": 6.343607458371459e-07, + "loss": 0.3232, + "step": 6009 + }, + { + "epoch": 0.8429172510518934, + "grad_norm": 1.7659643066618205, + "learning_rate": 6.332539818184985e-07, + "loss": 0.321, + "step": 6010 + }, + { + "epoch": 0.8430575035063114, + "grad_norm": 1.9057627821344394, + "learning_rate": 6.321481188473827e-07, + "loss": 0.3122, + "step": 6011 + }, + { + "epoch": 0.8431977559607293, + "grad_norm": 2.4928993476046317, + "learning_rate": 6.310431571519865e-07, + "loss": 0.326, + "step": 6012 + }, + { + "epoch": 0.8433380084151473, + "grad_norm": 2.427642648671933, + "learning_rate": 6.299390969603108e-07, + "loss": 0.3464, + "step": 6013 + }, + { + "epoch": 0.8434782608695652, + "grad_norm": 1.69797486125548, + "learning_rate": 6.288359385001702e-07, + "loss": 0.365, + "step": 6014 + }, + { + "epoch": 0.8436185133239832, + "grad_norm": 1.7153653196714806, + "learning_rate": 6.277336819991953e-07, + "loss": 0.343, + "step": 6015 + }, + { + "epoch": 0.8437587657784011, + "grad_norm": 3.257182767092917, + "learning_rate": 6.266323276848285e-07, + "loss": 0.3006, + "step": 6016 + }, + { + "epoch": 0.8438990182328191, + "grad_norm": 1.8505231942282456, + "learning_rate": 6.255318757843249e-07, + "loss": 0.31, + "step": 6017 + }, + { + "epoch": 0.844039270687237, + "grad_norm": 2.555817800233562, + "learning_rate": 6.244323265247565e-07, + "loss": 0.3776, + "step": 6018 + }, + { + "epoch": 0.844179523141655, + "grad_norm": 1.6185535797781898, + "learning_rate": 6.233336801330076e-07, + "loss": 0.3536, + "step": 6019 + }, + { + "epoch": 0.844319775596073, + "grad_norm": 1.8701322948353567, + "learning_rate": 6.222359368357761e-07, + "loss": 0.2935, + "step": 6020 + }, + { + "epoch": 0.8444600280504909, + "grad_norm": 1.95327245537127, + "learning_rate": 6.211390968595743e-07, + "loss": 0.3718, + "step": 6021 + }, + { + "epoch": 0.8446002805049089, + "grad_norm": 2.4527583729067683, + "learning_rate": 6.200431604307255e-07, + "loss": 0.3182, + "step": 6022 + }, + { + "epoch": 0.8447405329593268, + "grad_norm": 1.8037636113595021, + "learning_rate": 6.1894812777537e-07, + "loss": 0.3728, + "step": 6023 + }, + { + "epoch": 0.8448807854137448, + "grad_norm": 4.7849250581905665, + "learning_rate": 6.178539991194599e-07, + "loss": 0.3513, + "step": 6024 + }, + { + "epoch": 0.8450210378681627, + "grad_norm": 1.8427973080106381, + "learning_rate": 6.16760774688761e-07, + "loss": 0.3401, + "step": 6025 + }, + { + "epoch": 0.8451612903225807, + "grad_norm": 3.2215639424873546, + "learning_rate": 6.15668454708852e-07, + "loss": 0.338, + "step": 6026 + }, + { + "epoch": 0.8453015427769986, + "grad_norm": 1.5921298000673347, + "learning_rate": 6.145770394051265e-07, + "loss": 0.2723, + "step": 6027 + }, + { + "epoch": 0.8454417952314166, + "grad_norm": 1.686424124333253, + "learning_rate": 6.134865290027903e-07, + "loss": 0.364, + "step": 6028 + }, + { + "epoch": 0.8455820476858344, + "grad_norm": 1.679836887547496, + "learning_rate": 6.123969237268617e-07, + "loss": 0.3235, + "step": 6029 + }, + { + "epoch": 0.8457223001402524, + "grad_norm": 2.1749759190421134, + "learning_rate": 6.113082238021745e-07, + "loss": 0.3068, + "step": 6030 + }, + { + "epoch": 0.8458625525946704, + "grad_norm": 2.09664318548658, + "learning_rate": 6.102204294533731e-07, + "loss": 0.3522, + "step": 6031 + }, + { + "epoch": 0.8460028050490883, + "grad_norm": 2.6375446297724987, + "learning_rate": 6.091335409049159e-07, + "loss": 0.3367, + "step": 6032 + }, + { + "epoch": 0.8461430575035063, + "grad_norm": 1.773297879945007, + "learning_rate": 6.080475583810758e-07, + "loss": 0.2993, + "step": 6033 + }, + { + "epoch": 0.8462833099579242, + "grad_norm": 1.8628776321070017, + "learning_rate": 6.069624821059378e-07, + "loss": 0.3197, + "step": 6034 + }, + { + "epoch": 0.8464235624123422, + "grad_norm": 2.126443484841785, + "learning_rate": 6.05878312303399e-07, + "loss": 0.3677, + "step": 6035 + }, + { + "epoch": 0.8465638148667601, + "grad_norm": 2.0184801564992894, + "learning_rate": 6.04795049197171e-07, + "loss": 0.3416, + "step": 6036 + }, + { + "epoch": 0.8467040673211781, + "grad_norm": 2.523825411905077, + "learning_rate": 6.037126930107779e-07, + "loss": 0.309, + "step": 6037 + }, + { + "epoch": 0.846844319775596, + "grad_norm": 1.9578989507611473, + "learning_rate": 6.026312439675553e-07, + "loss": 0.3564, + "step": 6038 + }, + { + "epoch": 0.846984572230014, + "grad_norm": 1.7386849952464973, + "learning_rate": 6.015507022906525e-07, + "loss": 0.3048, + "step": 6039 + }, + { + "epoch": 0.847124824684432, + "grad_norm": 1.9307526031243454, + "learning_rate": 6.004710682030324e-07, + "loss": 0.3167, + "step": 6040 + }, + { + "epoch": 0.8472650771388499, + "grad_norm": 2.235520822658841, + "learning_rate": 5.993923419274699e-07, + "loss": 0.3311, + "step": 6041 + }, + { + "epoch": 0.8474053295932679, + "grad_norm": 1.7091770957183454, + "learning_rate": 5.983145236865534e-07, + "loss": 0.2649, + "step": 6042 + }, + { + "epoch": 0.8475455820476858, + "grad_norm": 2.554799675301453, + "learning_rate": 5.972376137026814e-07, + "loss": 0.3042, + "step": 6043 + }, + { + "epoch": 0.8476858345021038, + "grad_norm": 1.8978700142858804, + "learning_rate": 5.961616121980679e-07, + "loss": 0.3705, + "step": 6044 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 1.6428209413366355, + "learning_rate": 5.95086519394738e-07, + "loss": 0.2903, + "step": 6045 + }, + { + "epoch": 0.8479663394109397, + "grad_norm": 2.0745175823460476, + "learning_rate": 5.940123355145294e-07, + "loss": 0.3268, + "step": 6046 + }, + { + "epoch": 0.8481065918653576, + "grad_norm": 2.1057000820489065, + "learning_rate": 5.929390607790931e-07, + "loss": 0.3615, + "step": 6047 + }, + { + "epoch": 0.8482468443197756, + "grad_norm": 7.285221702956641, + "learning_rate": 5.918666954098912e-07, + "loss": 0.3378, + "step": 6048 + }, + { + "epoch": 0.8483870967741935, + "grad_norm": 2.6413442320652574, + "learning_rate": 5.90795239628199e-07, + "loss": 0.3087, + "step": 6049 + }, + { + "epoch": 0.8485273492286115, + "grad_norm": 2.1448385160966863, + "learning_rate": 5.897246936551043e-07, + "loss": 0.3478, + "step": 6050 + }, + { + "epoch": 0.8486676016830295, + "grad_norm": 1.8457792720527355, + "learning_rate": 5.886550577115069e-07, + "loss": 0.3427, + "step": 6051 + }, + { + "epoch": 0.8488078541374474, + "grad_norm": 2.1754995057498454, + "learning_rate": 5.875863320181175e-07, + "loss": 0.3215, + "step": 6052 + }, + { + "epoch": 0.8489481065918654, + "grad_norm": 1.709931308526679, + "learning_rate": 5.865185167954612e-07, + "loss": 0.3136, + "step": 6053 + }, + { + "epoch": 0.8490883590462833, + "grad_norm": 2.1225717920174256, + "learning_rate": 5.854516122638737e-07, + "loss": 0.3152, + "step": 6054 + }, + { + "epoch": 0.8492286115007013, + "grad_norm": 1.5578365924774313, + "learning_rate": 5.843856186435032e-07, + "loss": 0.2882, + "step": 6055 + }, + { + "epoch": 0.8493688639551192, + "grad_norm": 2.180186990365222, + "learning_rate": 5.833205361543109e-07, + "loss": 0.3369, + "step": 6056 + }, + { + "epoch": 0.8495091164095372, + "grad_norm": 2.224083306570469, + "learning_rate": 5.822563650160684e-07, + "loss": 0.3681, + "step": 6057 + }, + { + "epoch": 0.8496493688639551, + "grad_norm": 1.8469486615616075, + "learning_rate": 5.81193105448361e-07, + "loss": 0.2978, + "step": 6058 + }, + { + "epoch": 0.8497896213183731, + "grad_norm": 1.817914308028695, + "learning_rate": 5.801307576705833e-07, + "loss": 0.3455, + "step": 6059 + }, + { + "epoch": 0.8499298737727911, + "grad_norm": 1.8011416655873491, + "learning_rate": 5.790693219019439e-07, + "loss": 0.3161, + "step": 6060 + }, + { + "epoch": 0.850070126227209, + "grad_norm": 3.3997522376049125, + "learning_rate": 5.78008798361463e-07, + "loss": 0.3459, + "step": 6061 + }, + { + "epoch": 0.850210378681627, + "grad_norm": 2.194004741778331, + "learning_rate": 5.769491872679733e-07, + "loss": 0.3115, + "step": 6062 + }, + { + "epoch": 0.8503506311360449, + "grad_norm": 1.7685496995287888, + "learning_rate": 5.758904888401156e-07, + "loss": 0.2988, + "step": 6063 + }, + { + "epoch": 0.8504908835904629, + "grad_norm": 1.5764153917561585, + "learning_rate": 5.748327032963464e-07, + "loss": 0.3057, + "step": 6064 + }, + { + "epoch": 0.8506311360448808, + "grad_norm": 2.3427791364822306, + "learning_rate": 5.737758308549319e-07, + "loss": 0.3583, + "step": 6065 + }, + { + "epoch": 0.8507713884992988, + "grad_norm": 1.9202509275021205, + "learning_rate": 5.727198717339511e-07, + "loss": 0.3273, + "step": 6066 + }, + { + "epoch": 0.8509116409537167, + "grad_norm": 3.282592063013716, + "learning_rate": 5.716648261512931e-07, + "loss": 0.2924, + "step": 6067 + }, + { + "epoch": 0.8510518934081347, + "grad_norm": 2.0339465509029315, + "learning_rate": 5.706106943246592e-07, + "loss": 0.3325, + "step": 6068 + }, + { + "epoch": 0.8511921458625525, + "grad_norm": 3.183092548258119, + "learning_rate": 5.695574764715628e-07, + "loss": 0.3515, + "step": 6069 + }, + { + "epoch": 0.8513323983169705, + "grad_norm": 1.963248118778689, + "learning_rate": 5.685051728093271e-07, + "loss": 0.3292, + "step": 6070 + }, + { + "epoch": 0.8514726507713885, + "grad_norm": 2.1576682692380924, + "learning_rate": 5.674537835550897e-07, + "loss": 0.3739, + "step": 6071 + }, + { + "epoch": 0.8516129032258064, + "grad_norm": 1.607726428635446, + "learning_rate": 5.664033089257948e-07, + "loss": 0.2824, + "step": 6072 + }, + { + "epoch": 0.8517531556802244, + "grad_norm": 1.8666605824416778, + "learning_rate": 5.653537491382011e-07, + "loss": 0.3571, + "step": 6073 + }, + { + "epoch": 0.8518934081346423, + "grad_norm": 16.54785891049003, + "learning_rate": 5.643051044088787e-07, + "loss": 0.3395, + "step": 6074 + }, + { + "epoch": 0.8520336605890603, + "grad_norm": 1.797449567402985, + "learning_rate": 5.632573749542075e-07, + "loss": 0.3377, + "step": 6075 + }, + { + "epoch": 0.8521739130434782, + "grad_norm": 1.9316008190531682, + "learning_rate": 5.622105609903794e-07, + "loss": 0.3547, + "step": 6076 + }, + { + "epoch": 0.8523141654978962, + "grad_norm": 2.062292146323976, + "learning_rate": 5.611646627333977e-07, + "loss": 0.3243, + "step": 6077 + }, + { + "epoch": 0.8524544179523141, + "grad_norm": 1.772400405732911, + "learning_rate": 5.601196803990755e-07, + "loss": 0.3582, + "step": 6078 + }, + { + "epoch": 0.8525946704067321, + "grad_norm": 1.5540623773896571, + "learning_rate": 5.590756142030385e-07, + "loss": 0.2533, + "step": 6079 + }, + { + "epoch": 0.85273492286115, + "grad_norm": 1.647772777012749, + "learning_rate": 5.58032464360721e-07, + "loss": 0.3481, + "step": 6080 + }, + { + "epoch": 0.852875175315568, + "grad_norm": 2.8274269717876424, + "learning_rate": 5.569902310873703e-07, + "loss": 0.3354, + "step": 6081 + }, + { + "epoch": 0.853015427769986, + "grad_norm": 1.6888542392728894, + "learning_rate": 5.55948914598044e-07, + "loss": 0.3287, + "step": 6082 + }, + { + "epoch": 0.8531556802244039, + "grad_norm": 1.4950542773086866, + "learning_rate": 5.549085151076122e-07, + "loss": 0.3234, + "step": 6083 + }, + { + "epoch": 0.8532959326788219, + "grad_norm": 2.2691632233405725, + "learning_rate": 5.538690328307505e-07, + "loss": 0.3874, + "step": 6084 + }, + { + "epoch": 0.8534361851332398, + "grad_norm": 1.7555042624860497, + "learning_rate": 5.528304679819513e-07, + "loss": 0.3295, + "step": 6085 + }, + { + "epoch": 0.8535764375876578, + "grad_norm": 2.047387444799126, + "learning_rate": 5.517928207755146e-07, + "loss": 0.3006, + "step": 6086 + }, + { + "epoch": 0.8537166900420757, + "grad_norm": 2.3183178275289342, + "learning_rate": 5.507560914255516e-07, + "loss": 0.3289, + "step": 6087 + }, + { + "epoch": 0.8538569424964937, + "grad_norm": 2.027587523164664, + "learning_rate": 5.497202801459844e-07, + "loss": 0.3712, + "step": 6088 + }, + { + "epoch": 0.8539971949509116, + "grad_norm": 2.0034693191422077, + "learning_rate": 5.486853871505455e-07, + "loss": 0.3455, + "step": 6089 + }, + { + "epoch": 0.8541374474053296, + "grad_norm": 1.5420972463369598, + "learning_rate": 5.476514126527771e-07, + "loss": 0.3511, + "step": 6090 + }, + { + "epoch": 0.8542776998597476, + "grad_norm": 1.7251298579113123, + "learning_rate": 5.466183568660332e-07, + "loss": 0.2941, + "step": 6091 + }, + { + "epoch": 0.8544179523141655, + "grad_norm": 2.008034779160064, + "learning_rate": 5.45586220003479e-07, + "loss": 0.3602, + "step": 6092 + }, + { + "epoch": 0.8545582047685835, + "grad_norm": 4.738031467228583, + "learning_rate": 5.44555002278086e-07, + "loss": 0.3157, + "step": 6093 + }, + { + "epoch": 0.8546984572230014, + "grad_norm": 1.9561062452251008, + "learning_rate": 5.435247039026398e-07, + "loss": 0.3759, + "step": 6094 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 1.8167542483131938, + "learning_rate": 5.424953250897358e-07, + "loss": 0.2973, + "step": 6095 + }, + { + "epoch": 0.8549789621318373, + "grad_norm": 1.62523744755668, + "learning_rate": 5.414668660517791e-07, + "loss": 0.3296, + "step": 6096 + }, + { + "epoch": 0.8551192145862553, + "grad_norm": 1.8130455511465304, + "learning_rate": 5.404393270009844e-07, + "loss": 0.3309, + "step": 6097 + }, + { + "epoch": 0.8552594670406732, + "grad_norm": 2.4729313268015383, + "learning_rate": 5.394127081493783e-07, + "loss": 0.2892, + "step": 6098 + }, + { + "epoch": 0.8553997194950912, + "grad_norm": 1.9803118822320447, + "learning_rate": 5.383870097087962e-07, + "loss": 0.306, + "step": 6099 + }, + { + "epoch": 0.8555399719495091, + "grad_norm": 1.6516971991546492, + "learning_rate": 5.373622318908822e-07, + "loss": 0.3312, + "step": 6100 + }, + { + "epoch": 0.8556802244039271, + "grad_norm": 1.9901851988642159, + "learning_rate": 5.363383749070939e-07, + "loss": 0.3639, + "step": 6101 + }, + { + "epoch": 0.8558204768583451, + "grad_norm": 1.9386208891289105, + "learning_rate": 5.353154389686954e-07, + "loss": 0.3329, + "step": 6102 + }, + { + "epoch": 0.855960729312763, + "grad_norm": 2.8572910406649954, + "learning_rate": 5.342934242867648e-07, + "loss": 0.3147, + "step": 6103 + }, + { + "epoch": 0.856100981767181, + "grad_norm": 1.8968582177715962, + "learning_rate": 5.332723310721855e-07, + "loss": 0.3423, + "step": 6104 + }, + { + "epoch": 0.8562412342215989, + "grad_norm": 1.9959974769622206, + "learning_rate": 5.322521595356533e-07, + "loss": 0.39, + "step": 6105 + }, + { + "epoch": 0.8563814866760169, + "grad_norm": 1.6576896744223273, + "learning_rate": 5.312329098876734e-07, + "loss": 0.3099, + "step": 6106 + }, + { + "epoch": 0.8565217391304348, + "grad_norm": 3.028156865348899, + "learning_rate": 5.302145823385618e-07, + "loss": 0.3125, + "step": 6107 + }, + { + "epoch": 0.8566619915848528, + "grad_norm": 1.6721440749356884, + "learning_rate": 5.291971770984428e-07, + "loss": 0.3083, + "step": 6108 + }, + { + "epoch": 0.8568022440392706, + "grad_norm": 1.5664244386753763, + "learning_rate": 5.281806943772505e-07, + "loss": 0.3218, + "step": 6109 + }, + { + "epoch": 0.8569424964936886, + "grad_norm": 2.001829026260339, + "learning_rate": 5.271651343847295e-07, + "loss": 0.3173, + "step": 6110 + }, + { + "epoch": 0.8570827489481065, + "grad_norm": 1.7406645135144216, + "learning_rate": 5.261504973304332e-07, + "loss": 0.3605, + "step": 6111 + }, + { + "epoch": 0.8572230014025245, + "grad_norm": 1.949301677766893, + "learning_rate": 5.251367834237264e-07, + "loss": 0.3331, + "step": 6112 + }, + { + "epoch": 0.8573632538569425, + "grad_norm": 1.6607421559164777, + "learning_rate": 5.241239928737791e-07, + "loss": 0.3137, + "step": 6113 + }, + { + "epoch": 0.8575035063113604, + "grad_norm": 2.658054143022672, + "learning_rate": 5.231121258895749e-07, + "loss": 0.3229, + "step": 6114 + }, + { + "epoch": 0.8576437587657784, + "grad_norm": 2.220127376648124, + "learning_rate": 5.221011826799055e-07, + "loss": 0.3296, + "step": 6115 + }, + { + "epoch": 0.8577840112201963, + "grad_norm": 1.9048314679306597, + "learning_rate": 5.210911634533722e-07, + "loss": 0.3558, + "step": 6116 + }, + { + "epoch": 0.8579242636746143, + "grad_norm": 1.9844102030357418, + "learning_rate": 5.200820684183849e-07, + "loss": 0.3434, + "step": 6117 + }, + { + "epoch": 0.8580645161290322, + "grad_norm": 2.5911214907219993, + "learning_rate": 5.190738977831638e-07, + "loss": 0.4046, + "step": 6118 + }, + { + "epoch": 0.8582047685834502, + "grad_norm": 1.9521554600416147, + "learning_rate": 5.180666517557375e-07, + "loss": 0.2708, + "step": 6119 + }, + { + "epoch": 0.8583450210378681, + "grad_norm": 2.1307050752237457, + "learning_rate": 5.170603305439454e-07, + "loss": 0.3149, + "step": 6120 + }, + { + "epoch": 0.8584852734922861, + "grad_norm": 1.6818998449355316, + "learning_rate": 5.160549343554327e-07, + "loss": 0.3407, + "step": 6121 + }, + { + "epoch": 0.858625525946704, + "grad_norm": 2.355537476810119, + "learning_rate": 5.150504633976572e-07, + "loss": 0.3513, + "step": 6122 + }, + { + "epoch": 0.858765778401122, + "grad_norm": 2.3363586862590586, + "learning_rate": 5.140469178778845e-07, + "loss": 0.3245, + "step": 6123 + }, + { + "epoch": 0.85890603085554, + "grad_norm": 1.600906163150386, + "learning_rate": 5.130442980031892e-07, + "loss": 0.3271, + "step": 6124 + }, + { + "epoch": 0.8590462833099579, + "grad_norm": 1.747766977416793, + "learning_rate": 5.120426039804544e-07, + "loss": 0.3233, + "step": 6125 + }, + { + "epoch": 0.8591865357643759, + "grad_norm": 1.8807559356291546, + "learning_rate": 5.110418360163722e-07, + "loss": 0.3149, + "step": 6126 + }, + { + "epoch": 0.8593267882187938, + "grad_norm": 1.7970212793931848, + "learning_rate": 5.10041994317445e-07, + "loss": 0.3228, + "step": 6127 + }, + { + "epoch": 0.8594670406732118, + "grad_norm": 1.8260132419820505, + "learning_rate": 5.090430790899836e-07, + "loss": 0.3549, + "step": 6128 + }, + { + "epoch": 0.8596072931276297, + "grad_norm": 1.9466884909455928, + "learning_rate": 5.080450905401057e-07, + "loss": 0.2916, + "step": 6129 + }, + { + "epoch": 0.8597475455820477, + "grad_norm": 2.239626814545811, + "learning_rate": 5.070480288737406e-07, + "loss": 0.3428, + "step": 6130 + }, + { + "epoch": 0.8598877980364656, + "grad_norm": 1.8719173522131916, + "learning_rate": 5.060518942966242e-07, + "loss": 0.3085, + "step": 6131 + }, + { + "epoch": 0.8600280504908836, + "grad_norm": 2.6332475885684232, + "learning_rate": 5.050566870143025e-07, + "loss": 0.3674, + "step": 6132 + }, + { + "epoch": 0.8601683029453016, + "grad_norm": 2.0176389819824037, + "learning_rate": 5.040624072321299e-07, + "loss": 0.3213, + "step": 6133 + }, + { + "epoch": 0.8603085553997195, + "grad_norm": 2.0443898030709886, + "learning_rate": 5.030690551552675e-07, + "loss": 0.3264, + "step": 6134 + }, + { + "epoch": 0.8604488078541375, + "grad_norm": 2.481831718303744, + "learning_rate": 5.020766309886876e-07, + "loss": 0.3135, + "step": 6135 + }, + { + "epoch": 0.8605890603085554, + "grad_norm": 1.803464532946029, + "learning_rate": 5.010851349371704e-07, + "loss": 0.3321, + "step": 6136 + }, + { + "epoch": 0.8607293127629734, + "grad_norm": 1.6609170763513093, + "learning_rate": 5.000945672053032e-07, + "loss": 0.3657, + "step": 6137 + }, + { + "epoch": 0.8608695652173913, + "grad_norm": 1.8111355348252078, + "learning_rate": 4.99104927997483e-07, + "loss": 0.2851, + "step": 6138 + }, + { + "epoch": 0.8610098176718093, + "grad_norm": 2.0842943899418027, + "learning_rate": 4.981162175179155e-07, + "loss": 0.2894, + "step": 6139 + }, + { + "epoch": 0.8611500701262272, + "grad_norm": 2.067301968889042, + "learning_rate": 4.971284359706141e-07, + "loss": 0.3055, + "step": 6140 + }, + { + "epoch": 0.8612903225806452, + "grad_norm": 2.018817354856072, + "learning_rate": 4.961415835594007e-07, + "loss": 0.3269, + "step": 6141 + }, + { + "epoch": 0.8614305750350632, + "grad_norm": 2.081729945841518, + "learning_rate": 4.951556604879049e-07, + "loss": 0.3516, + "step": 6142 + }, + { + "epoch": 0.8615708274894811, + "grad_norm": 1.8779569833153424, + "learning_rate": 4.941706669595647e-07, + "loss": 0.2899, + "step": 6143 + }, + { + "epoch": 0.8617110799438991, + "grad_norm": 1.969667372336686, + "learning_rate": 4.931866031776283e-07, + "loss": 0.3436, + "step": 6144 + }, + { + "epoch": 0.861851332398317, + "grad_norm": 1.7094670774776866, + "learning_rate": 4.922034693451483e-07, + "loss": 0.318, + "step": 6145 + }, + { + "epoch": 0.861991584852735, + "grad_norm": 2.304611773782397, + "learning_rate": 4.912212656649879e-07, + "loss": 0.3665, + "step": 6146 + }, + { + "epoch": 0.8621318373071529, + "grad_norm": 1.6669800304825453, + "learning_rate": 4.902399923398193e-07, + "loss": 0.3475, + "step": 6147 + }, + { + "epoch": 0.8622720897615709, + "grad_norm": 1.8138926844665062, + "learning_rate": 4.892596495721202e-07, + "loss": 0.3295, + "step": 6148 + }, + { + "epoch": 0.8624123422159887, + "grad_norm": 1.9394263258942779, + "learning_rate": 4.882802375641777e-07, + "loss": 0.331, + "step": 6149 + }, + { + "epoch": 0.8625525946704067, + "grad_norm": 1.7434561177565802, + "learning_rate": 4.873017565180871e-07, + "loss": 0.3426, + "step": 6150 + }, + { + "epoch": 0.8626928471248246, + "grad_norm": 1.9122465900755303, + "learning_rate": 4.86324206635751e-07, + "loss": 0.3014, + "step": 6151 + }, + { + "epoch": 0.8628330995792426, + "grad_norm": 1.9520255694668907, + "learning_rate": 4.853475881188796e-07, + "loss": 0.3149, + "step": 6152 + }, + { + "epoch": 0.8629733520336605, + "grad_norm": 2.021487441498278, + "learning_rate": 4.843719011689924e-07, + "loss": 0.3633, + "step": 6153 + }, + { + "epoch": 0.8631136044880785, + "grad_norm": 2.0556805317778513, + "learning_rate": 4.833971459874137e-07, + "loss": 0.3818, + "step": 6154 + }, + { + "epoch": 0.8632538569424965, + "grad_norm": 1.9207956027657835, + "learning_rate": 4.824233227752789e-07, + "loss": 0.3737, + "step": 6155 + }, + { + "epoch": 0.8633941093969144, + "grad_norm": 2.6620565340615046, + "learning_rate": 4.814504317335289e-07, + "loss": 0.3294, + "step": 6156 + }, + { + "epoch": 0.8635343618513324, + "grad_norm": 2.2018176589503002, + "learning_rate": 4.804784730629131e-07, + "loss": 0.3508, + "step": 6157 + }, + { + "epoch": 0.8636746143057503, + "grad_norm": 2.0023934824375305, + "learning_rate": 4.795074469639888e-07, + "loss": 0.3548, + "step": 6158 + }, + { + "epoch": 0.8638148667601683, + "grad_norm": 1.695598103597831, + "learning_rate": 4.785373536371196e-07, + "loss": 0.3422, + "step": 6159 + }, + { + "epoch": 0.8639551192145862, + "grad_norm": 2.0864526919667816, + "learning_rate": 4.775681932824783e-07, + "loss": 0.3122, + "step": 6160 + }, + { + "epoch": 0.8640953716690042, + "grad_norm": 2.016306089317565, + "learning_rate": 4.7659996610004423e-07, + "loss": 0.3933, + "step": 6161 + }, + { + "epoch": 0.8642356241234221, + "grad_norm": 1.6900118041895953, + "learning_rate": 4.756326722896054e-07, + "loss": 0.3496, + "step": 6162 + }, + { + "epoch": 0.8643758765778401, + "grad_norm": 5.151620902371692, + "learning_rate": 4.7466631205075333e-07, + "loss": 0.3196, + "step": 6163 + }, + { + "epoch": 0.864516129032258, + "grad_norm": 1.7722064883863387, + "learning_rate": 4.7370088558289175e-07, + "loss": 0.3503, + "step": 6164 + }, + { + "epoch": 0.864656381486676, + "grad_norm": 1.622432440951444, + "learning_rate": 4.7273639308523023e-07, + "loss": 0.313, + "step": 6165 + }, + { + "epoch": 0.864796633941094, + "grad_norm": 2.2409615704136474, + "learning_rate": 4.717728347567829e-07, + "loss": 0.3461, + "step": 6166 + }, + { + "epoch": 0.8649368863955119, + "grad_norm": 1.757736407858271, + "learning_rate": 4.708102107963741e-07, + "loss": 0.2997, + "step": 6167 + }, + { + "epoch": 0.8650771388499299, + "grad_norm": 1.9983383667123933, + "learning_rate": 4.698485214026349e-07, + "loss": 0.3834, + "step": 6168 + }, + { + "epoch": 0.8652173913043478, + "grad_norm": 3.1373323859318596, + "learning_rate": 4.6888776677400384e-07, + "loss": 0.3208, + "step": 6169 + }, + { + "epoch": 0.8653576437587658, + "grad_norm": 2.352382073522565, + "learning_rate": 4.6792794710872446e-07, + "loss": 0.3235, + "step": 6170 + }, + { + "epoch": 0.8654978962131837, + "grad_norm": 3.1639075724029904, + "learning_rate": 4.6696906260485007e-07, + "loss": 0.3512, + "step": 6171 + }, + { + "epoch": 0.8656381486676017, + "grad_norm": 1.9639443980346738, + "learning_rate": 4.6601111346023963e-07, + "loss": 0.3574, + "step": 6172 + }, + { + "epoch": 0.8657784011220196, + "grad_norm": 1.892510789868173, + "learning_rate": 4.6505409987255833e-07, + "loss": 0.2837, + "step": 6173 + }, + { + "epoch": 0.8659186535764376, + "grad_norm": 2.603573500153803, + "learning_rate": 4.64098022039281e-07, + "loss": 0.3226, + "step": 6174 + }, + { + "epoch": 0.8660589060308556, + "grad_norm": 1.6795884214056205, + "learning_rate": 4.6314288015768595e-07, + "loss": 0.3549, + "step": 6175 + }, + { + "epoch": 0.8661991584852735, + "grad_norm": 1.9386218729974738, + "learning_rate": 4.621886744248605e-07, + "loss": 0.2914, + "step": 6176 + }, + { + "epoch": 0.8663394109396915, + "grad_norm": 2.5953144732610745, + "learning_rate": 4.612354050376977e-07, + "loss": 0.3244, + "step": 6177 + }, + { + "epoch": 0.8664796633941094, + "grad_norm": 1.8586149826215994, + "learning_rate": 4.602830721928997e-07, + "loss": 0.3248, + "step": 6178 + }, + { + "epoch": 0.8666199158485274, + "grad_norm": 1.9694344677991502, + "learning_rate": 4.5933167608697204e-07, + "loss": 0.357, + "step": 6179 + }, + { + "epoch": 0.8667601683029453, + "grad_norm": 1.7994479127443317, + "learning_rate": 4.5838121691622995e-07, + "loss": 0.3546, + "step": 6180 + }, + { + "epoch": 0.8669004207573633, + "grad_norm": 1.6335047664333426, + "learning_rate": 4.574316948767932e-07, + "loss": 0.3622, + "step": 6181 + }, + { + "epoch": 0.8670406732117812, + "grad_norm": 2.1867567707121363, + "learning_rate": 4.5648311016458943e-07, + "loss": 0.3325, + "step": 6182 + }, + { + "epoch": 0.8671809256661992, + "grad_norm": 1.6769049729303462, + "learning_rate": 4.555354629753533e-07, + "loss": 0.3434, + "step": 6183 + }, + { + "epoch": 0.8673211781206172, + "grad_norm": 1.623600210162833, + "learning_rate": 4.545887535046228e-07, + "loss": 0.3277, + "step": 6184 + }, + { + "epoch": 0.8674614305750351, + "grad_norm": 2.259558296418964, + "learning_rate": 4.536429819477478e-07, + "loss": 0.3524, + "step": 6185 + }, + { + "epoch": 0.8676016830294531, + "grad_norm": 1.7577016837899422, + "learning_rate": 4.526981484998788e-07, + "loss": 0.3456, + "step": 6186 + }, + { + "epoch": 0.867741935483871, + "grad_norm": 2.0092064438242847, + "learning_rate": 4.517542533559771e-07, + "loss": 0.3278, + "step": 6187 + }, + { + "epoch": 0.867882187938289, + "grad_norm": 2.0981223158695617, + "learning_rate": 4.508112967108091e-07, + "loss": 0.3477, + "step": 6188 + }, + { + "epoch": 0.8680224403927068, + "grad_norm": 2.3139637618858253, + "learning_rate": 4.4986927875894646e-07, + "loss": 0.3905, + "step": 6189 + }, + { + "epoch": 0.8681626928471248, + "grad_norm": 1.8357978930048868, + "learning_rate": 4.489281996947681e-07, + "loss": 0.3591, + "step": 6190 + }, + { + "epoch": 0.8683029453015427, + "grad_norm": 1.7530442054340942, + "learning_rate": 4.479880597124597e-07, + "loss": 0.315, + "step": 6191 + }, + { + "epoch": 0.8684431977559607, + "grad_norm": 1.9814538555985741, + "learning_rate": 4.4704885900601236e-07, + "loss": 0.3456, + "step": 6192 + }, + { + "epoch": 0.8685834502103786, + "grad_norm": 2.1454821425765793, + "learning_rate": 4.461105977692237e-07, + "loss": 0.3248, + "step": 6193 + }, + { + "epoch": 0.8687237026647966, + "grad_norm": 2.2922351709985165, + "learning_rate": 4.4517327619569784e-07, + "loss": 0.3454, + "step": 6194 + }, + { + "epoch": 0.8688639551192145, + "grad_norm": 1.8228991553055858, + "learning_rate": 4.442368944788428e-07, + "loss": 0.3862, + "step": 6195 + }, + { + "epoch": 0.8690042075736325, + "grad_norm": 2.088126873071465, + "learning_rate": 4.4330145281187566e-07, + "loss": 0.3468, + "step": 6196 + }, + { + "epoch": 0.8691444600280505, + "grad_norm": 2.232995200774267, + "learning_rate": 4.423669513878182e-07, + "loss": 0.3752, + "step": 6197 + }, + { + "epoch": 0.8692847124824684, + "grad_norm": 1.8777537151854016, + "learning_rate": 4.414333903994983e-07, + "loss": 0.3461, + "step": 6198 + }, + { + "epoch": 0.8694249649368864, + "grad_norm": 2.2544860622967557, + "learning_rate": 4.405007700395497e-07, + "loss": 0.3425, + "step": 6199 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 2.545593220011076, + "learning_rate": 4.3956909050041186e-07, + "loss": 0.341, + "step": 6200 + }, + { + "epoch": 0.8697054698457223, + "grad_norm": 1.9872264649838791, + "learning_rate": 4.3863835197433037e-07, + "loss": 0.3123, + "step": 6201 + }, + { + "epoch": 0.8698457223001402, + "grad_norm": 2.4011734755960483, + "learning_rate": 4.377085546533566e-07, + "loss": 0.3173, + "step": 6202 + }, + { + "epoch": 0.8699859747545582, + "grad_norm": 2.4892906644740282, + "learning_rate": 4.3677969872934824e-07, + "loss": 0.378, + "step": 6203 + }, + { + "epoch": 0.8701262272089761, + "grad_norm": 1.9302509427364138, + "learning_rate": 4.3585178439396856e-07, + "loss": 0.3327, + "step": 6204 + }, + { + "epoch": 0.8702664796633941, + "grad_norm": 2.1926549933704558, + "learning_rate": 4.349248118386851e-07, + "loss": 0.2987, + "step": 6205 + }, + { + "epoch": 0.870406732117812, + "grad_norm": 2.2739894760545907, + "learning_rate": 4.33998781254773e-07, + "loss": 0.3171, + "step": 6206 + }, + { + "epoch": 0.87054698457223, + "grad_norm": 1.9066204273957095, + "learning_rate": 4.330736928333107e-07, + "loss": 0.3185, + "step": 6207 + }, + { + "epoch": 0.870687237026648, + "grad_norm": 1.9608341508964438, + "learning_rate": 4.321495467651854e-07, + "loss": 0.3559, + "step": 6208 + }, + { + "epoch": 0.8708274894810659, + "grad_norm": 1.653961616061505, + "learning_rate": 4.312263432410868e-07, + "loss": 0.3138, + "step": 6209 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 2.027096066201325, + "learning_rate": 4.303040824515131e-07, + "loss": 0.3411, + "step": 6210 + }, + { + "epoch": 0.8711079943899018, + "grad_norm": 2.1758933457925744, + "learning_rate": 4.293827645867649e-07, + "loss": 0.3149, + "step": 6211 + }, + { + "epoch": 0.8712482468443198, + "grad_norm": 2.0769198946099747, + "learning_rate": 4.284623898369511e-07, + "loss": 0.31, + "step": 6212 + }, + { + "epoch": 0.8713884992987377, + "grad_norm": 1.7664437885167765, + "learning_rate": 4.2754295839198325e-07, + "loss": 0.3337, + "step": 6213 + }, + { + "epoch": 0.8715287517531557, + "grad_norm": 1.7721961293879005, + "learning_rate": 4.266244704415806e-07, + "loss": 0.3719, + "step": 6214 + }, + { + "epoch": 0.8716690042075736, + "grad_norm": 2.597400440441202, + "learning_rate": 4.2570692617526667e-07, + "loss": 0.3565, + "step": 6215 + }, + { + "epoch": 0.8718092566619916, + "grad_norm": 1.953520833912209, + "learning_rate": 4.2479032578236934e-07, + "loss": 0.3463, + "step": 6216 + }, + { + "epoch": 0.8719495091164096, + "grad_norm": 1.7699329854919152, + "learning_rate": 4.2387466945202347e-07, + "loss": 0.3422, + "step": 6217 + }, + { + "epoch": 0.8720897615708275, + "grad_norm": 1.8870084867867771, + "learning_rate": 4.2295995737316854e-07, + "loss": 0.3173, + "step": 6218 + }, + { + "epoch": 0.8722300140252455, + "grad_norm": 2.5841188057021762, + "learning_rate": 4.220461897345485e-07, + "loss": 0.31, + "step": 6219 + }, + { + "epoch": 0.8723702664796634, + "grad_norm": 2.425071231543078, + "learning_rate": 4.211333667247125e-07, + "loss": 0.366, + "step": 6220 + }, + { + "epoch": 0.8725105189340814, + "grad_norm": 2.653631198432156, + "learning_rate": 4.202214885320166e-07, + "loss": 0.3219, + "step": 6221 + }, + { + "epoch": 0.8726507713884993, + "grad_norm": 1.9855591252211922, + "learning_rate": 4.193105553446192e-07, + "loss": 0.3601, + "step": 6222 + }, + { + "epoch": 0.8727910238429173, + "grad_norm": 1.973098191715661, + "learning_rate": 4.184005673504854e-07, + "loss": 0.3467, + "step": 6223 + }, + { + "epoch": 0.8729312762973352, + "grad_norm": 1.5475179703859994, + "learning_rate": 4.174915247373862e-07, + "loss": 0.2777, + "step": 6224 + }, + { + "epoch": 0.8730715287517532, + "grad_norm": 2.216656409027153, + "learning_rate": 4.1658342769289374e-07, + "loss": 0.3515, + "step": 6225 + }, + { + "epoch": 0.8732117812061712, + "grad_norm": 1.7922433619951623, + "learning_rate": 4.156762764043898e-07, + "loss": 0.2931, + "step": 6226 + }, + { + "epoch": 0.8733520336605891, + "grad_norm": 2.1849540063133115, + "learning_rate": 4.147700710590563e-07, + "loss": 0.3102, + "step": 6227 + }, + { + "epoch": 0.8734922861150071, + "grad_norm": 2.1353028678034778, + "learning_rate": 4.1386481184388427e-07, + "loss": 0.345, + "step": 6228 + }, + { + "epoch": 0.8736325385694249, + "grad_norm": 1.9927665317755603, + "learning_rate": 4.1296049894566646e-07, + "loss": 0.3132, + "step": 6229 + }, + { + "epoch": 0.8737727910238429, + "grad_norm": 1.9397185759577098, + "learning_rate": 4.1205713255100253e-07, + "loss": 0.3408, + "step": 6230 + }, + { + "epoch": 0.8739130434782608, + "grad_norm": 2.2291248234419507, + "learning_rate": 4.1115471284629504e-07, + "loss": 0.3152, + "step": 6231 + }, + { + "epoch": 0.8740532959326788, + "grad_norm": 1.6399069531724892, + "learning_rate": 4.102532400177528e-07, + "loss": 0.3261, + "step": 6232 + }, + { + "epoch": 0.8741935483870967, + "grad_norm": 2.301290199968847, + "learning_rate": 4.0935271425138757e-07, + "loss": 0.3632, + "step": 6233 + }, + { + "epoch": 0.8743338008415147, + "grad_norm": 2.089099444887941, + "learning_rate": 4.0845313573301736e-07, + "loss": 0.3338, + "step": 6234 + }, + { + "epoch": 0.8744740532959326, + "grad_norm": 1.9892611084556335, + "learning_rate": 4.0755450464826375e-07, + "loss": 0.3539, + "step": 6235 + }, + { + "epoch": 0.8746143057503506, + "grad_norm": 1.961349808710393, + "learning_rate": 4.0665682118255225e-07, + "loss": 0.3653, + "step": 6236 + }, + { + "epoch": 0.8747545582047686, + "grad_norm": 2.0523057484675307, + "learning_rate": 4.0576008552111414e-07, + "loss": 0.3138, + "step": 6237 + }, + { + "epoch": 0.8748948106591865, + "grad_norm": 1.8497758446032617, + "learning_rate": 4.048642978489842e-07, + "loss": 0.3208, + "step": 6238 + }, + { + "epoch": 0.8750350631136045, + "grad_norm": 1.6028257272836108, + "learning_rate": 4.0396945835100286e-07, + "loss": 0.3179, + "step": 6239 + }, + { + "epoch": 0.8751753155680224, + "grad_norm": 3.575344821332715, + "learning_rate": 4.030755672118125e-07, + "loss": 0.3289, + "step": 6240 + }, + { + "epoch": 0.8753155680224404, + "grad_norm": 1.8369215904783278, + "learning_rate": 4.021826246158628e-07, + "loss": 0.3171, + "step": 6241 + }, + { + "epoch": 0.8754558204768583, + "grad_norm": 2.408774500723742, + "learning_rate": 4.012906307474057e-07, + "loss": 0.314, + "step": 6242 + }, + { + "epoch": 0.8755960729312763, + "grad_norm": 2.7633347160725497, + "learning_rate": 4.003995857904974e-07, + "loss": 0.2717, + "step": 6243 + }, + { + "epoch": 0.8757363253856942, + "grad_norm": 1.874794885224466, + "learning_rate": 3.9950948992899917e-07, + "loss": 0.2934, + "step": 6244 + }, + { + "epoch": 0.8758765778401122, + "grad_norm": 2.0450207414018253, + "learning_rate": 3.986203433465774e-07, + "loss": 0.3033, + "step": 6245 + }, + { + "epoch": 0.8760168302945301, + "grad_norm": 2.1962849373988607, + "learning_rate": 3.9773214622669974e-07, + "loss": 0.3606, + "step": 6246 + }, + { + "epoch": 0.8761570827489481, + "grad_norm": 1.925182442875088, + "learning_rate": 3.968448987526391e-07, + "loss": 0.297, + "step": 6247 + }, + { + "epoch": 0.8762973352033661, + "grad_norm": 3.0837439358160283, + "learning_rate": 3.959586011074729e-07, + "loss": 0.4101, + "step": 6248 + }, + { + "epoch": 0.876437587657784, + "grad_norm": 1.820047121800348, + "learning_rate": 3.9507325347408365e-07, + "loss": 0.3715, + "step": 6249 + }, + { + "epoch": 0.876577840112202, + "grad_norm": 2.123989538163392, + "learning_rate": 3.9418885603515535e-07, + "loss": 0.3285, + "step": 6250 + }, + { + "epoch": 0.8767180925666199, + "grad_norm": 1.9028326577334098, + "learning_rate": 3.9330540897317805e-07, + "loss": 0.3282, + "step": 6251 + }, + { + "epoch": 0.8768583450210379, + "grad_norm": 2.5625067920129596, + "learning_rate": 3.9242291247044484e-07, + "loss": 0.3355, + "step": 6252 + }, + { + "epoch": 0.8769985974754558, + "grad_norm": 1.9377393267084986, + "learning_rate": 3.9154136670905287e-07, + "loss": 0.3043, + "step": 6253 + }, + { + "epoch": 0.8771388499298738, + "grad_norm": 2.133249091432173, + "learning_rate": 3.9066077187090215e-07, + "loss": 0.3595, + "step": 6254 + }, + { + "epoch": 0.8772791023842917, + "grad_norm": 8.592033297994583, + "learning_rate": 3.8978112813769786e-07, + "loss": 0.2706, + "step": 6255 + }, + { + "epoch": 0.8774193548387097, + "grad_norm": 1.7053572698235018, + "learning_rate": 3.8890243569094874e-07, + "loss": 0.331, + "step": 6256 + }, + { + "epoch": 0.8775596072931277, + "grad_norm": 1.7764867707738832, + "learning_rate": 3.880246947119659e-07, + "loss": 0.4069, + "step": 6257 + }, + { + "epoch": 0.8776998597475456, + "grad_norm": 2.1733428605383747, + "learning_rate": 3.8714790538186553e-07, + "loss": 0.3234, + "step": 6258 + }, + { + "epoch": 0.8778401122019636, + "grad_norm": 2.723296912411615, + "learning_rate": 3.862720678815668e-07, + "loss": 0.3723, + "step": 6259 + }, + { + "epoch": 0.8779803646563815, + "grad_norm": 1.9761420846968247, + "learning_rate": 3.853971823917929e-07, + "loss": 0.3464, + "step": 6260 + }, + { + "epoch": 0.8781206171107995, + "grad_norm": 4.592249618790261, + "learning_rate": 3.845232490930706e-07, + "loss": 0.3224, + "step": 6261 + }, + { + "epoch": 0.8782608695652174, + "grad_norm": 1.7811874914076617, + "learning_rate": 3.836502681657289e-07, + "loss": 0.3361, + "step": 6262 + }, + { + "epoch": 0.8784011220196354, + "grad_norm": 1.7916467650735006, + "learning_rate": 3.827782397899021e-07, + "loss": 0.3057, + "step": 6263 + }, + { + "epoch": 0.8785413744740533, + "grad_norm": 1.819111571142397, + "learning_rate": 3.819071641455274e-07, + "loss": 0.336, + "step": 6264 + }, + { + "epoch": 0.8786816269284713, + "grad_norm": 2.0791107363876984, + "learning_rate": 3.810370414123454e-07, + "loss": 0.3953, + "step": 6265 + }, + { + "epoch": 0.8788218793828892, + "grad_norm": 2.1047932916120384, + "learning_rate": 3.801678717698987e-07, + "loss": 0.2798, + "step": 6266 + }, + { + "epoch": 0.8789621318373072, + "grad_norm": 1.8515435712788382, + "learning_rate": 3.792996553975359e-07, + "loss": 0.3328, + "step": 6267 + }, + { + "epoch": 0.8791023842917252, + "grad_norm": 1.5640742191938888, + "learning_rate": 3.7843239247440545e-07, + "loss": 0.3314, + "step": 6268 + }, + { + "epoch": 0.879242636746143, + "grad_norm": 1.838173401632113, + "learning_rate": 3.7756608317946144e-07, + "loss": 0.3628, + "step": 6269 + }, + { + "epoch": 0.879382889200561, + "grad_norm": 1.8346106746639579, + "learning_rate": 3.767007276914619e-07, + "loss": 0.3167, + "step": 6270 + }, + { + "epoch": 0.8795231416549789, + "grad_norm": 1.5852026528851482, + "learning_rate": 3.7583632618896635e-07, + "loss": 0.3313, + "step": 6271 + }, + { + "epoch": 0.8796633941093969, + "grad_norm": 2.039834296658605, + "learning_rate": 3.7497287885033763e-07, + "loss": 0.3324, + "step": 6272 + }, + { + "epoch": 0.8798036465638148, + "grad_norm": 1.9923923162752146, + "learning_rate": 3.7411038585374206e-07, + "loss": 0.3583, + "step": 6273 + }, + { + "epoch": 0.8799438990182328, + "grad_norm": 2.0758346901052915, + "learning_rate": 3.7324884737715003e-07, + "loss": 0.3352, + "step": 6274 + }, + { + "epoch": 0.8800841514726507, + "grad_norm": 2.649444028724977, + "learning_rate": 3.723882635983328e-07, + "loss": 0.3543, + "step": 6275 + }, + { + "epoch": 0.8802244039270687, + "grad_norm": 1.6356024019096354, + "learning_rate": 3.715286346948671e-07, + "loss": 0.3016, + "step": 6276 + }, + { + "epoch": 0.8803646563814866, + "grad_norm": 2.361852898354247, + "learning_rate": 3.7066996084413e-07, + "loss": 0.3336, + "step": 6277 + }, + { + "epoch": 0.8805049088359046, + "grad_norm": 2.416907550601449, + "learning_rate": 3.698122422233036e-07, + "loss": 0.3203, + "step": 6278 + }, + { + "epoch": 0.8806451612903226, + "grad_norm": 3.276667710462878, + "learning_rate": 3.6895547900937136e-07, + "loss": 0.3193, + "step": 6279 + }, + { + "epoch": 0.8807854137447405, + "grad_norm": 1.992461660631314, + "learning_rate": 3.6809967137912183e-07, + "loss": 0.3555, + "step": 6280 + }, + { + "epoch": 0.8809256661991585, + "grad_norm": 2.0779222661992085, + "learning_rate": 3.6724481950914326e-07, + "loss": 0.3006, + "step": 6281 + }, + { + "epoch": 0.8810659186535764, + "grad_norm": 3.5406671048494918, + "learning_rate": 3.663909235758295e-07, + "loss": 0.3716, + "step": 6282 + }, + { + "epoch": 0.8812061711079944, + "grad_norm": 1.6455470773783436, + "learning_rate": 3.6553798375537574e-07, + "loss": 0.3384, + "step": 6283 + }, + { + "epoch": 0.8813464235624123, + "grad_norm": 2.403501947848549, + "learning_rate": 3.646860002237801e-07, + "loss": 0.353, + "step": 6284 + }, + { + "epoch": 0.8814866760168303, + "grad_norm": 2.6385936259562617, + "learning_rate": 3.638349731568436e-07, + "loss": 0.2969, + "step": 6285 + }, + { + "epoch": 0.8816269284712482, + "grad_norm": 2.313372679310739, + "learning_rate": 3.6298490273017017e-07, + "loss": 0.3529, + "step": 6286 + }, + { + "epoch": 0.8817671809256662, + "grad_norm": 1.7654749713723603, + "learning_rate": 3.621357891191657e-07, + "loss": 0.37, + "step": 6287 + }, + { + "epoch": 0.8819074333800841, + "grad_norm": 2.0446533493182097, + "learning_rate": 3.612876324990372e-07, + "loss": 0.3335, + "step": 6288 + }, + { + "epoch": 0.8820476858345021, + "grad_norm": 1.959427210159347, + "learning_rate": 3.6044043304479745e-07, + "loss": 0.328, + "step": 6289 + }, + { + "epoch": 0.8821879382889201, + "grad_norm": 1.581465707171352, + "learning_rate": 3.595941909312595e-07, + "loss": 0.3288, + "step": 6290 + }, + { + "epoch": 0.882328190743338, + "grad_norm": 2.754262135553992, + "learning_rate": 3.587489063330402e-07, + "loss": 0.3232, + "step": 6291 + }, + { + "epoch": 0.882468443197756, + "grad_norm": 2.337862580273782, + "learning_rate": 3.5790457942455725e-07, + "loss": 0.3691, + "step": 6292 + }, + { + "epoch": 0.8826086956521739, + "grad_norm": 2.1880469592074623, + "learning_rate": 3.570612103800325e-07, + "loss": 0.3031, + "step": 6293 + }, + { + "epoch": 0.8827489481065919, + "grad_norm": 1.8746505411647414, + "learning_rate": 3.5621879937348836e-07, + "loss": 0.308, + "step": 6294 + }, + { + "epoch": 0.8828892005610098, + "grad_norm": 2.230027992004426, + "learning_rate": 3.5537734657875136e-07, + "loss": 0.3336, + "step": 6295 + }, + { + "epoch": 0.8830294530154278, + "grad_norm": 1.757850341389559, + "learning_rate": 3.545368521694487e-07, + "loss": 0.2911, + "step": 6296 + }, + { + "epoch": 0.8831697054698457, + "grad_norm": 2.000659833781798, + "learning_rate": 3.5369731631901214e-07, + "loss": 0.3766, + "step": 6297 + }, + { + "epoch": 0.8833099579242637, + "grad_norm": 2.2010871715207694, + "learning_rate": 3.528587392006716e-07, + "loss": 0.3526, + "step": 6298 + }, + { + "epoch": 0.8834502103786817, + "grad_norm": 1.734279045681108, + "learning_rate": 3.520211209874624e-07, + "loss": 0.2763, + "step": 6299 + }, + { + "epoch": 0.8835904628330996, + "grad_norm": 1.7584538624871486, + "learning_rate": 3.51184461852222e-07, + "loss": 0.3872, + "step": 6300 + }, + { + "epoch": 0.8837307152875176, + "grad_norm": 1.9306525778537398, + "learning_rate": 3.5034876196758825e-07, + "loss": 0.3531, + "step": 6301 + }, + { + "epoch": 0.8838709677419355, + "grad_norm": 2.088028220771579, + "learning_rate": 3.4951402150600275e-07, + "loss": 0.3196, + "step": 6302 + }, + { + "epoch": 0.8840112201963535, + "grad_norm": 2.7617141745177354, + "learning_rate": 3.486802406397083e-07, + "loss": 0.3291, + "step": 6303 + }, + { + "epoch": 0.8841514726507714, + "grad_norm": 1.826882094722809, + "learning_rate": 3.4784741954074884e-07, + "loss": 0.2972, + "step": 6304 + }, + { + "epoch": 0.8842917251051894, + "grad_norm": 1.9117986876716981, + "learning_rate": 3.470155583809726e-07, + "loss": 0.2747, + "step": 6305 + }, + { + "epoch": 0.8844319775596073, + "grad_norm": 2.161411575975973, + "learning_rate": 3.4618465733202765e-07, + "loss": 0.3381, + "step": 6306 + }, + { + "epoch": 0.8845722300140253, + "grad_norm": 2.6001967319005272, + "learning_rate": 3.453547165653642e-07, + "loss": 0.3306, + "step": 6307 + }, + { + "epoch": 0.8847124824684433, + "grad_norm": 1.8931393888473897, + "learning_rate": 3.4452573625223584e-07, + "loss": 0.3835, + "step": 6308 + }, + { + "epoch": 0.8848527349228611, + "grad_norm": 1.8680725553827058, + "learning_rate": 3.436977165636951e-07, + "loss": 0.3033, + "step": 6309 + }, + { + "epoch": 0.884992987377279, + "grad_norm": 2.0719808514052653, + "learning_rate": 3.428706576705992e-07, + "loss": 0.3403, + "step": 6310 + }, + { + "epoch": 0.885133239831697, + "grad_norm": 1.6961861745567095, + "learning_rate": 3.420445597436056e-07, + "loss": 0.3213, + "step": 6311 + }, + { + "epoch": 0.885273492286115, + "grad_norm": 1.7356935592369231, + "learning_rate": 3.41219422953174e-07, + "loss": 0.37, + "step": 6312 + }, + { + "epoch": 0.8854137447405329, + "grad_norm": 2.298424956846988, + "learning_rate": 3.4039524746956597e-07, + "loss": 0.3029, + "step": 6313 + }, + { + "epoch": 0.8855539971949509, + "grad_norm": 1.7647260554880972, + "learning_rate": 3.395720334628438e-07, + "loss": 0.3174, + "step": 6314 + }, + { + "epoch": 0.8856942496493688, + "grad_norm": 3.1359185781321424, + "learning_rate": 3.3874978110287224e-07, + "loss": 0.393, + "step": 6315 + }, + { + "epoch": 0.8858345021037868, + "grad_norm": 1.85676417053083, + "learning_rate": 3.3792849055931776e-07, + "loss": 0.341, + "step": 6316 + }, + { + "epoch": 0.8859747545582047, + "grad_norm": 1.834654924132413, + "learning_rate": 3.371081620016475e-07, + "loss": 0.337, + "step": 6317 + }, + { + "epoch": 0.8861150070126227, + "grad_norm": 3.3153277149726867, + "learning_rate": 3.362887955991301e-07, + "loss": 0.347, + "step": 6318 + }, + { + "epoch": 0.8862552594670406, + "grad_norm": 2.264503464493783, + "learning_rate": 3.354703915208363e-07, + "loss": 0.295, + "step": 6319 + }, + { + "epoch": 0.8863955119214586, + "grad_norm": 2.0317260184418746, + "learning_rate": 3.3465294993563826e-07, + "loss": 0.3375, + "step": 6320 + }, + { + "epoch": 0.8865357643758766, + "grad_norm": 1.9126458355520601, + "learning_rate": 3.338364710122094e-07, + "loss": 0.3271, + "step": 6321 + }, + { + "epoch": 0.8866760168302945, + "grad_norm": 1.96007388121654, + "learning_rate": 3.330209549190244e-07, + "loss": 0.3763, + "step": 6322 + }, + { + "epoch": 0.8868162692847125, + "grad_norm": 1.850327864797175, + "learning_rate": 3.322064018243587e-07, + "loss": 0.3581, + "step": 6323 + }, + { + "epoch": 0.8869565217391304, + "grad_norm": 1.8820289469285605, + "learning_rate": 3.313928118962906e-07, + "loss": 0.329, + "step": 6324 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 1.7805049074057675, + "learning_rate": 3.305801853026985e-07, + "loss": 0.2933, + "step": 6325 + }, + { + "epoch": 0.8872370266479663, + "grad_norm": 1.6507930468143621, + "learning_rate": 3.297685222112623e-07, + "loss": 0.2983, + "step": 6326 + }, + { + "epoch": 0.8873772791023843, + "grad_norm": 1.8365418433727847, + "learning_rate": 3.2895782278946244e-07, + "loss": 0.3528, + "step": 6327 + }, + { + "epoch": 0.8875175315568022, + "grad_norm": 1.8484968084009057, + "learning_rate": 3.2814808720458226e-07, + "loss": 0.3511, + "step": 6328 + }, + { + "epoch": 0.8876577840112202, + "grad_norm": 2.2172339189225623, + "learning_rate": 3.2733931562370257e-07, + "loss": 0.3376, + "step": 6329 + }, + { + "epoch": 0.8877980364656382, + "grad_norm": 1.805411519720349, + "learning_rate": 3.265315082137099e-07, + "loss": 0.2986, + "step": 6330 + }, + { + "epoch": 0.8879382889200561, + "grad_norm": 1.893011242385263, + "learning_rate": 3.2572466514128876e-07, + "loss": 0.3043, + "step": 6331 + }, + { + "epoch": 0.8880785413744741, + "grad_norm": 2.4741823329238963, + "learning_rate": 3.2491878657292643e-07, + "loss": 0.3805, + "step": 6332 + }, + { + "epoch": 0.888218793828892, + "grad_norm": 2.211128361420085, + "learning_rate": 3.2411387267490937e-07, + "loss": 0.327, + "step": 6333 + }, + { + "epoch": 0.88835904628331, + "grad_norm": 1.8169338977611755, + "learning_rate": 3.233099236133264e-07, + "loss": 0.2745, + "step": 6334 + }, + { + "epoch": 0.8884992987377279, + "grad_norm": 2.1614977239366913, + "learning_rate": 3.2250693955406697e-07, + "loss": 0.3514, + "step": 6335 + }, + { + "epoch": 0.8886395511921459, + "grad_norm": 5.061884901567189, + "learning_rate": 3.217049206628209e-07, + "loss": 0.3363, + "step": 6336 + }, + { + "epoch": 0.8887798036465638, + "grad_norm": 5.0219397796459715, + "learning_rate": 3.2090386710507906e-07, + "loss": 0.3207, + "step": 6337 + }, + { + "epoch": 0.8889200561009818, + "grad_norm": 2.380255055264623, + "learning_rate": 3.201037790461342e-07, + "loss": 0.3436, + "step": 6338 + }, + { + "epoch": 0.8890603085553997, + "grad_norm": 2.099317521503431, + "learning_rate": 3.193046566510777e-07, + "loss": 0.3419, + "step": 6339 + }, + { + "epoch": 0.8892005610098177, + "grad_norm": 4.856192868558223, + "learning_rate": 3.185065000848031e-07, + "loss": 0.3244, + "step": 6340 + }, + { + "epoch": 0.8893408134642357, + "grad_norm": 1.8512882077783777, + "learning_rate": 3.1770930951200483e-07, + "loss": 0.3461, + "step": 6341 + }, + { + "epoch": 0.8894810659186536, + "grad_norm": 1.7256280778403572, + "learning_rate": 3.16913085097178e-07, + "loss": 0.3362, + "step": 6342 + }, + { + "epoch": 0.8896213183730716, + "grad_norm": 2.257556412373256, + "learning_rate": 3.161178270046167e-07, + "loss": 0.3812, + "step": 6343 + }, + { + "epoch": 0.8897615708274895, + "grad_norm": 1.942270619953052, + "learning_rate": 3.15323535398418e-07, + "loss": 0.3305, + "step": 6344 + }, + { + "epoch": 0.8899018232819075, + "grad_norm": 2.513542118591529, + "learning_rate": 3.14530210442478e-07, + "loss": 0.3796, + "step": 6345 + }, + { + "epoch": 0.8900420757363254, + "grad_norm": 2.9497962234720156, + "learning_rate": 3.1373785230049356e-07, + "loss": 0.3605, + "step": 6346 + }, + { + "epoch": 0.8901823281907434, + "grad_norm": 2.4746160228612855, + "learning_rate": 3.129464611359634e-07, + "loss": 0.3978, + "step": 6347 + }, + { + "epoch": 0.8903225806451613, + "grad_norm": 2.0952153558137048, + "learning_rate": 3.12156037112184e-07, + "loss": 0.3059, + "step": 6348 + }, + { + "epoch": 0.8904628330995792, + "grad_norm": 1.708006556201296, + "learning_rate": 3.1136658039225497e-07, + "loss": 0.3166, + "step": 6349 + }, + { + "epoch": 0.8906030855539971, + "grad_norm": 1.800262575071448, + "learning_rate": 3.105780911390738e-07, + "loss": 0.3589, + "step": 6350 + }, + { + "epoch": 0.8907433380084151, + "grad_norm": 1.7566654786329698, + "learning_rate": 3.097905695153408e-07, + "loss": 0.2929, + "step": 6351 + }, + { + "epoch": 0.890883590462833, + "grad_norm": 1.8035065721561834, + "learning_rate": 3.090040156835555e-07, + "loss": 0.3514, + "step": 6352 + }, + { + "epoch": 0.891023842917251, + "grad_norm": 1.7010293368725664, + "learning_rate": 3.0821842980601756e-07, + "loss": 0.3043, + "step": 6353 + }, + { + "epoch": 0.891164095371669, + "grad_norm": 1.7516822903290543, + "learning_rate": 3.0743381204482726e-07, + "loss": 0.3359, + "step": 6354 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 1.7836326591535243, + "learning_rate": 3.066501625618851e-07, + "loss": 0.3246, + "step": 6355 + }, + { + "epoch": 0.8914446002805049, + "grad_norm": 1.7662744593563413, + "learning_rate": 3.058674815188917e-07, + "loss": 0.3267, + "step": 6356 + }, + { + "epoch": 0.8915848527349228, + "grad_norm": 2.1919827986272016, + "learning_rate": 3.0508576907734734e-07, + "loss": 0.3202, + "step": 6357 + }, + { + "epoch": 0.8917251051893408, + "grad_norm": 2.9177214940319165, + "learning_rate": 3.043050253985541e-07, + "loss": 0.3252, + "step": 6358 + }, + { + "epoch": 0.8918653576437587, + "grad_norm": 1.811432754426676, + "learning_rate": 3.0352525064361147e-07, + "loss": 0.3388, + "step": 6359 + }, + { + "epoch": 0.8920056100981767, + "grad_norm": 1.736775635591189, + "learning_rate": 3.0274644497342133e-07, + "loss": 0.3246, + "step": 6360 + }, + { + "epoch": 0.8921458625525946, + "grad_norm": 1.8262877402038598, + "learning_rate": 3.0196860854868447e-07, + "loss": 0.2895, + "step": 6361 + }, + { + "epoch": 0.8922861150070126, + "grad_norm": 1.7191176974850781, + "learning_rate": 3.0119174152990204e-07, + "loss": 0.3255, + "step": 6362 + }, + { + "epoch": 0.8924263674614306, + "grad_norm": 1.9905385092924774, + "learning_rate": 3.0041584407737577e-07, + "loss": 0.3416, + "step": 6363 + }, + { + "epoch": 0.8925666199158485, + "grad_norm": 2.838382392171113, + "learning_rate": 2.996409163512054e-07, + "loss": 0.2938, + "step": 6364 + }, + { + "epoch": 0.8927068723702665, + "grad_norm": 1.7577040575273946, + "learning_rate": 2.9886695851129297e-07, + "loss": 0.3255, + "step": 6365 + }, + { + "epoch": 0.8928471248246844, + "grad_norm": 1.705563680371963, + "learning_rate": 2.980939707173391e-07, + "loss": 0.3891, + "step": 6366 + }, + { + "epoch": 0.8929873772791024, + "grad_norm": 2.20109670353286, + "learning_rate": 2.9732195312884515e-07, + "loss": 0.3688, + "step": 6367 + }, + { + "epoch": 0.8931276297335203, + "grad_norm": 2.0139455970000237, + "learning_rate": 2.965509059051097e-07, + "loss": 0.3176, + "step": 6368 + }, + { + "epoch": 0.8932678821879383, + "grad_norm": 1.6841023466854168, + "learning_rate": 2.9578082920523387e-07, + "loss": 0.3734, + "step": 6369 + }, + { + "epoch": 0.8934081346423562, + "grad_norm": 2.105283824605893, + "learning_rate": 2.9501172318811834e-07, + "loss": 0.3286, + "step": 6370 + }, + { + "epoch": 0.8935483870967742, + "grad_norm": 2.508199026202168, + "learning_rate": 2.9424358801246167e-07, + "loss": 0.3455, + "step": 6371 + }, + { + "epoch": 0.8936886395511922, + "grad_norm": 1.7060103167750753, + "learning_rate": 2.934764238367632e-07, + "loss": 0.3388, + "step": 6372 + }, + { + "epoch": 0.8938288920056101, + "grad_norm": 2.207762794448052, + "learning_rate": 2.927102308193225e-07, + "loss": 0.3049, + "step": 6373 + }, + { + "epoch": 0.8939691444600281, + "grad_norm": 2.377933296968513, + "learning_rate": 2.91945009118238e-07, + "loss": 0.3223, + "step": 6374 + }, + { + "epoch": 0.894109396914446, + "grad_norm": 2.134356264895068, + "learning_rate": 2.911807588914078e-07, + "loss": 0.323, + "step": 6375 + }, + { + "epoch": 0.894249649368864, + "grad_norm": 3.4754981308274724, + "learning_rate": 2.904174802965293e-07, + "loss": 0.3468, + "step": 6376 + }, + { + "epoch": 0.8943899018232819, + "grad_norm": 1.6229265629764944, + "learning_rate": 2.8965517349110015e-07, + "loss": 0.3615, + "step": 6377 + }, + { + "epoch": 0.8945301542776999, + "grad_norm": 1.7426960976812025, + "learning_rate": 2.888938386324169e-07, + "loss": 0.3728, + "step": 6378 + }, + { + "epoch": 0.8946704067321178, + "grad_norm": 2.5721471667982874, + "learning_rate": 2.8813347587757667e-07, + "loss": 0.3275, + "step": 6379 + }, + { + "epoch": 0.8948106591865358, + "grad_norm": 7.014666859335583, + "learning_rate": 2.873740853834728e-07, + "loss": 0.364, + "step": 6380 + }, + { + "epoch": 0.8949509116409538, + "grad_norm": 1.8831372119546843, + "learning_rate": 2.866156673068016e-07, + "loss": 0.2918, + "step": 6381 + }, + { + "epoch": 0.8950911640953717, + "grad_norm": 1.6226080382691666, + "learning_rate": 2.858582218040573e-07, + "loss": 0.3077, + "step": 6382 + }, + { + "epoch": 0.8952314165497897, + "grad_norm": 1.7182186345701416, + "learning_rate": 2.851017490315333e-07, + "loss": 0.3249, + "step": 6383 + }, + { + "epoch": 0.8953716690042076, + "grad_norm": 1.6869435982538288, + "learning_rate": 2.843462491453219e-07, + "loss": 0.3067, + "step": 6384 + }, + { + "epoch": 0.8955119214586256, + "grad_norm": 2.1809606628018785, + "learning_rate": 2.8359172230131626e-07, + "loss": 0.3509, + "step": 6385 + }, + { + "epoch": 0.8956521739130435, + "grad_norm": 2.210615909763722, + "learning_rate": 2.828381686552073e-07, + "loss": 0.3623, + "step": 6386 + }, + { + "epoch": 0.8957924263674615, + "grad_norm": 1.5366265242595494, + "learning_rate": 2.820855883624857e-07, + "loss": 0.3078, + "step": 6387 + }, + { + "epoch": 0.8959326788218794, + "grad_norm": 1.74725473879402, + "learning_rate": 2.813339815784416e-07, + "loss": 0.3224, + "step": 6388 + }, + { + "epoch": 0.8960729312762973, + "grad_norm": 2.6935483604778945, + "learning_rate": 2.8058334845816214e-07, + "loss": 0.3646, + "step": 6389 + }, + { + "epoch": 0.8962131837307152, + "grad_norm": 1.9151870226856185, + "learning_rate": 2.7983368915653674e-07, + "loss": 0.2947, + "step": 6390 + }, + { + "epoch": 0.8963534361851332, + "grad_norm": 1.7127801039565111, + "learning_rate": 2.790850038282522e-07, + "loss": 0.3424, + "step": 6391 + }, + { + "epoch": 0.8964936886395511, + "grad_norm": 2.3244892379698694, + "learning_rate": 2.7833729262779383e-07, + "loss": 0.3343, + "step": 6392 + }, + { + "epoch": 0.8966339410939691, + "grad_norm": 3.043613037537069, + "learning_rate": 2.7759055570944715e-07, + "loss": 0.3596, + "step": 6393 + }, + { + "epoch": 0.896774193548387, + "grad_norm": 1.659154971490423, + "learning_rate": 2.768447932272955e-07, + "loss": 0.3176, + "step": 6394 + }, + { + "epoch": 0.896914446002805, + "grad_norm": 1.9829444835315149, + "learning_rate": 2.76100005335222e-07, + "loss": 0.2822, + "step": 6395 + }, + { + "epoch": 0.897054698457223, + "grad_norm": 2.3087549556726494, + "learning_rate": 2.753561921869091e-07, + "loss": 0.3237, + "step": 6396 + }, + { + "epoch": 0.8971949509116409, + "grad_norm": 2.4650936816135123, + "learning_rate": 2.74613353935837e-07, + "loss": 0.3091, + "step": 6397 + }, + { + "epoch": 0.8973352033660589, + "grad_norm": 2.315761019563859, + "learning_rate": 2.7387149073528464e-07, + "loss": 0.3411, + "step": 6398 + }, + { + "epoch": 0.8974754558204768, + "grad_norm": 1.9166401225477205, + "learning_rate": 2.731306027383318e-07, + "loss": 0.336, + "step": 6399 + }, + { + "epoch": 0.8976157082748948, + "grad_norm": 2.5515846751383022, + "learning_rate": 2.72390690097854e-07, + "loss": 0.3392, + "step": 6400 + }, + { + "epoch": 0.8977559607293127, + "grad_norm": 1.8463056277273768, + "learning_rate": 2.7165175296652746e-07, + "loss": 0.3794, + "step": 6401 + }, + { + "epoch": 0.8978962131837307, + "grad_norm": 2.2725232570284843, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.3407, + "step": 6402 + }, + { + "epoch": 0.8980364656381487, + "grad_norm": 1.8793436600026, + "learning_rate": 2.7017680584102537e-07, + "loss": 0.3409, + "step": 6403 + }, + { + "epoch": 0.8981767180925666, + "grad_norm": 2.0360867941067706, + "learning_rate": 2.694407961511947e-07, + "loss": 0.319, + "step": 6404 + }, + { + "epoch": 0.8983169705469846, + "grad_norm": 2.130367456538409, + "learning_rate": 2.6870576257920553e-07, + "loss": 0.3355, + "step": 6405 + }, + { + "epoch": 0.8984572230014025, + "grad_norm": 1.8509436108219248, + "learning_rate": 2.6797170527672723e-07, + "loss": 0.3405, + "step": 6406 + }, + { + "epoch": 0.8985974754558205, + "grad_norm": 2.442118841317848, + "learning_rate": 2.672386243952263e-07, + "loss": 0.362, + "step": 6407 + }, + { + "epoch": 0.8987377279102384, + "grad_norm": 2.011796020208985, + "learning_rate": 2.6650652008597067e-07, + "loss": 0.3043, + "step": 6408 + }, + { + "epoch": 0.8988779803646564, + "grad_norm": 1.8372185316350906, + "learning_rate": 2.657753925000228e-07, + "loss": 0.3341, + "step": 6409 + }, + { + "epoch": 0.8990182328190743, + "grad_norm": 1.5716785922496417, + "learning_rate": 2.6504524178824706e-07, + "loss": 0.3359, + "step": 6410 + }, + { + "epoch": 0.8991584852734923, + "grad_norm": 1.8070496132293217, + "learning_rate": 2.643160681013046e-07, + "loss": 0.3408, + "step": 6411 + }, + { + "epoch": 0.8992987377279102, + "grad_norm": 1.5685709695371164, + "learning_rate": 2.6358787158965616e-07, + "loss": 0.3508, + "step": 6412 + }, + { + "epoch": 0.8994389901823282, + "grad_norm": 2.749902896900691, + "learning_rate": 2.628606524035582e-07, + "loss": 0.3598, + "step": 6413 + }, + { + "epoch": 0.8995792426367462, + "grad_norm": 1.6787762676351194, + "learning_rate": 2.62134410693069e-07, + "loss": 0.2974, + "step": 6414 + }, + { + "epoch": 0.8997194950911641, + "grad_norm": 2.3596508546958583, + "learning_rate": 2.6140914660804205e-07, + "loss": 0.3483, + "step": 6415 + }, + { + "epoch": 0.8998597475455821, + "grad_norm": 1.7489755901903796, + "learning_rate": 2.6068486029813154e-07, + "loss": 0.3472, + "step": 6416 + }, + { + "epoch": 0.9, + "grad_norm": 3.43849764131923, + "learning_rate": 2.599615519127885e-07, + "loss": 0.3005, + "step": 6417 + }, + { + "epoch": 0.900140252454418, + "grad_norm": 2.081259636384117, + "learning_rate": 2.592392216012629e-07, + "loss": 0.3582, + "step": 6418 + }, + { + "epoch": 0.9002805049088359, + "grad_norm": 2.6991352780542623, + "learning_rate": 2.585178695126023e-07, + "loss": 0.3901, + "step": 6419 + }, + { + "epoch": 0.9004207573632539, + "grad_norm": 1.8602617857889976, + "learning_rate": 2.577974957956536e-07, + "loss": 0.35, + "step": 6420 + }, + { + "epoch": 0.9005610098176718, + "grad_norm": 1.876405443516473, + "learning_rate": 2.5707810059905914e-07, + "loss": 0.3109, + "step": 6421 + }, + { + "epoch": 0.9007012622720898, + "grad_norm": 1.7687461071173758, + "learning_rate": 2.5635968407126175e-07, + "loss": 0.364, + "step": 6422 + }, + { + "epoch": 0.9008415147265078, + "grad_norm": 1.875204456626101, + "learning_rate": 2.556422463605024e-07, + "loss": 0.3147, + "step": 6423 + }, + { + "epoch": 0.9009817671809257, + "grad_norm": 2.2041094589260966, + "learning_rate": 2.549257876148181e-07, + "loss": 0.3424, + "step": 6424 + }, + { + "epoch": 0.9011220196353437, + "grad_norm": 3.062173164210323, + "learning_rate": 2.542103079820463e-07, + "loss": 0.3089, + "step": 6425 + }, + { + "epoch": 0.9012622720897616, + "grad_norm": 1.8930857384112518, + "learning_rate": 2.534958076098204e-07, + "loss": 0.3855, + "step": 6426 + }, + { + "epoch": 0.9014025245441796, + "grad_norm": 2.0237503104536585, + "learning_rate": 2.5278228664557315e-07, + "loss": 0.2977, + "step": 6427 + }, + { + "epoch": 0.9015427769985975, + "grad_norm": 2.8737569692559544, + "learning_rate": 2.520697452365345e-07, + "loss": 0.3242, + "step": 6428 + }, + { + "epoch": 0.9016830294530155, + "grad_norm": 2.060874298260323, + "learning_rate": 2.513581835297324e-07, + "loss": 0.3887, + "step": 6429 + }, + { + "epoch": 0.9018232819074333, + "grad_norm": 4.4434439380689525, + "learning_rate": 2.506476016719922e-07, + "loss": 0.3178, + "step": 6430 + }, + { + "epoch": 0.9019635343618513, + "grad_norm": 2.0877908195714006, + "learning_rate": 2.499379998099377e-07, + "loss": 0.357, + "step": 6431 + }, + { + "epoch": 0.9021037868162692, + "grad_norm": 1.9384758245433105, + "learning_rate": 2.492293780899907e-07, + "loss": 0.3236, + "step": 6432 + }, + { + "epoch": 0.9022440392706872, + "grad_norm": 2.668964697177363, + "learning_rate": 2.4852173665837034e-07, + "loss": 0.3664, + "step": 6433 + }, + { + "epoch": 0.9023842917251051, + "grad_norm": 1.869608885208811, + "learning_rate": 2.478150756610925e-07, + "loss": 0.321, + "step": 6434 + }, + { + "epoch": 0.9025245441795231, + "grad_norm": 2.1069577530661516, + "learning_rate": 2.4710939524397235e-07, + "loss": 0.3268, + "step": 6435 + }, + { + "epoch": 0.9026647966339411, + "grad_norm": 1.9612765683963285, + "learning_rate": 2.4640469555262226e-07, + "loss": 0.3717, + "step": 6436 + }, + { + "epoch": 0.902805049088359, + "grad_norm": 2.0318256442817595, + "learning_rate": 2.4570097673245197e-07, + "loss": 0.3759, + "step": 6437 + }, + { + "epoch": 0.902945301542777, + "grad_norm": 1.7072184654247193, + "learning_rate": 2.4499823892866924e-07, + "loss": 0.3135, + "step": 6438 + }, + { + "epoch": 0.9030855539971949, + "grad_norm": 2.2673687768416793, + "learning_rate": 2.442964822862781e-07, + "loss": 0.3281, + "step": 6439 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 2.159538415226643, + "learning_rate": 2.4359570695008327e-07, + "loss": 0.3629, + "step": 6440 + }, + { + "epoch": 0.9033660589060308, + "grad_norm": 1.7782799571117813, + "learning_rate": 2.4289591306468244e-07, + "loss": 0.2931, + "step": 6441 + }, + { + "epoch": 0.9035063113604488, + "grad_norm": 2.531211475973964, + "learning_rate": 2.4219710077447446e-07, + "loss": 0.3785, + "step": 6442 + }, + { + "epoch": 0.9036465638148667, + "grad_norm": 1.6680236378556847, + "learning_rate": 2.4149927022365406e-07, + "loss": 0.3628, + "step": 6443 + }, + { + "epoch": 0.9037868162692847, + "grad_norm": 1.8115116581700923, + "learning_rate": 2.4080242155621327e-07, + "loss": 0.3502, + "step": 6444 + }, + { + "epoch": 0.9039270687237027, + "grad_norm": 3.7752446177470573, + "learning_rate": 2.401065549159426e-07, + "loss": 0.3296, + "step": 6445 + }, + { + "epoch": 0.9040673211781206, + "grad_norm": 2.0571705714130015, + "learning_rate": 2.394116704464294e-07, + "loss": 0.319, + "step": 6446 + }, + { + "epoch": 0.9042075736325386, + "grad_norm": 1.8337153845850283, + "learning_rate": 2.387177682910574e-07, + "loss": 0.3131, + "step": 6447 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 1.898020380608137, + "learning_rate": 2.3802484859300922e-07, + "loss": 0.3488, + "step": 6448 + }, + { + "epoch": 0.9044880785413745, + "grad_norm": 2.1345956356010043, + "learning_rate": 2.3733291149526495e-07, + "loss": 0.3383, + "step": 6449 + }, + { + "epoch": 0.9046283309957924, + "grad_norm": 1.8144939898435781, + "learning_rate": 2.366419571405981e-07, + "loss": 0.3612, + "step": 6450 + }, + { + "epoch": 0.9047685834502104, + "grad_norm": 1.9396023572841739, + "learning_rate": 2.3595198567158473e-07, + "loss": 0.37, + "step": 6451 + }, + { + "epoch": 0.9049088359046283, + "grad_norm": 1.9392519382645634, + "learning_rate": 2.352629972305942e-07, + "loss": 0.3211, + "step": 6452 + }, + { + "epoch": 0.9050490883590463, + "grad_norm": 1.7169912876981206, + "learning_rate": 2.3457499195979616e-07, + "loss": 0.3248, + "step": 6453 + }, + { + "epoch": 0.9051893408134642, + "grad_norm": 1.8409406303932152, + "learning_rate": 2.3388797000115427e-07, + "loss": 0.3319, + "step": 6454 + }, + { + "epoch": 0.9053295932678822, + "grad_norm": 2.7406971242324687, + "learning_rate": 2.3320193149643067e-07, + "loss": 0.3605, + "step": 6455 + }, + { + "epoch": 0.9054698457223002, + "grad_norm": 2.961669026296774, + "learning_rate": 2.325168765871849e-07, + "loss": 0.3153, + "step": 6456 + }, + { + "epoch": 0.9056100981767181, + "grad_norm": 1.6950753744915061, + "learning_rate": 2.318328054147734e-07, + "loss": 0.3574, + "step": 6457 + }, + { + "epoch": 0.9057503506311361, + "grad_norm": 1.9092387548807057, + "learning_rate": 2.3114971812034981e-07, + "loss": 0.3308, + "step": 6458 + }, + { + "epoch": 0.905890603085554, + "grad_norm": 1.7908026881707528, + "learning_rate": 2.304676148448637e-07, + "loss": 0.3726, + "step": 6459 + }, + { + "epoch": 0.906030855539972, + "grad_norm": 2.146073139704114, + "learning_rate": 2.2978649572906298e-07, + "loss": 0.3341, + "step": 6460 + }, + { + "epoch": 0.9061711079943899, + "grad_norm": 2.0290856200945084, + "learning_rate": 2.2910636091349192e-07, + "loss": 0.3498, + "step": 6461 + }, + { + "epoch": 0.9063113604488079, + "grad_norm": 2.0355555752143877, + "learning_rate": 2.2842721053849048e-07, + "loss": 0.3419, + "step": 6462 + }, + { + "epoch": 0.9064516129032258, + "grad_norm": 1.8168796373642095, + "learning_rate": 2.2774904474419768e-07, + "loss": 0.3271, + "step": 6463 + }, + { + "epoch": 0.9065918653576438, + "grad_norm": 2.452611104021504, + "learning_rate": 2.2707186367054767e-07, + "loss": 0.3328, + "step": 6464 + }, + { + "epoch": 0.9067321178120618, + "grad_norm": 2.234727951663096, + "learning_rate": 2.2639566745727203e-07, + "loss": 0.3079, + "step": 6465 + }, + { + "epoch": 0.9068723702664797, + "grad_norm": 1.9857965618938644, + "learning_rate": 2.2572045624389972e-07, + "loss": 0.3555, + "step": 6466 + }, + { + "epoch": 0.9070126227208977, + "grad_norm": 1.5458213223027557, + "learning_rate": 2.2504623016975536e-07, + "loss": 0.3482, + "step": 6467 + }, + { + "epoch": 0.9071528751753156, + "grad_norm": 1.9122035127091819, + "learning_rate": 2.24372989373961e-07, + "loss": 0.3457, + "step": 6468 + }, + { + "epoch": 0.9072931276297336, + "grad_norm": 2.4124207083365934, + "learning_rate": 2.23700733995435e-07, + "loss": 0.2914, + "step": 6469 + }, + { + "epoch": 0.9074333800841514, + "grad_norm": 1.971585789674584, + "learning_rate": 2.2302946417289305e-07, + "loss": 0.418, + "step": 6470 + }, + { + "epoch": 0.9075736325385694, + "grad_norm": 2.1671853789502733, + "learning_rate": 2.223591800448466e-07, + "loss": 0.3333, + "step": 6471 + }, + { + "epoch": 0.9077138849929873, + "grad_norm": 1.8622393304071339, + "learning_rate": 2.2168988174960382e-07, + "loss": 0.3354, + "step": 6472 + }, + { + "epoch": 0.9078541374474053, + "grad_norm": 2.2991299102968035, + "learning_rate": 2.2102156942526986e-07, + "loss": 0.364, + "step": 6473 + }, + { + "epoch": 0.9079943899018232, + "grad_norm": 2.0240534135909893, + "learning_rate": 2.203542432097472e-07, + "loss": 0.3136, + "step": 6474 + }, + { + "epoch": 0.9081346423562412, + "grad_norm": 1.3943806200927298, + "learning_rate": 2.1968790324073285e-07, + "loss": 0.295, + "step": 6475 + }, + { + "epoch": 0.9082748948106592, + "grad_norm": 2.145686493327701, + "learning_rate": 2.1902254965572134e-07, + "loss": 0.362, + "step": 6476 + }, + { + "epoch": 0.9084151472650771, + "grad_norm": 1.9714300896064172, + "learning_rate": 2.1835818259200448e-07, + "loss": 0.3827, + "step": 6477 + }, + { + "epoch": 0.9085553997194951, + "grad_norm": 1.7382667798072229, + "learning_rate": 2.1769480218666927e-07, + "loss": 0.3179, + "step": 6478 + }, + { + "epoch": 0.908695652173913, + "grad_norm": 3.221825620936998, + "learning_rate": 2.1703240857659958e-07, + "loss": 0.3752, + "step": 6479 + }, + { + "epoch": 0.908835904628331, + "grad_norm": 1.9515586981266762, + "learning_rate": 2.163710018984766e-07, + "loss": 0.2827, + "step": 6480 + }, + { + "epoch": 0.9089761570827489, + "grad_norm": 2.067412104554531, + "learning_rate": 2.1571058228877617e-07, + "loss": 0.3329, + "step": 6481 + }, + { + "epoch": 0.9091164095371669, + "grad_norm": 1.4986873287177127, + "learning_rate": 2.1505114988377096e-07, + "loss": 0.288, + "step": 6482 + }, + { + "epoch": 0.9092566619915848, + "grad_norm": 2.3155555127795298, + "learning_rate": 2.14392704819531e-07, + "loss": 0.3225, + "step": 6483 + }, + { + "epoch": 0.9093969144460028, + "grad_norm": 3.6750651035900033, + "learning_rate": 2.137352472319215e-07, + "loss": 0.3276, + "step": 6484 + }, + { + "epoch": 0.9095371669004207, + "grad_norm": 2.05581146944025, + "learning_rate": 2.1307877725660398e-07, + "loss": 0.3108, + "step": 6485 + }, + { + "epoch": 0.9096774193548387, + "grad_norm": 2.0669038169935225, + "learning_rate": 2.124232950290367e-07, + "loss": 0.3097, + "step": 6486 + }, + { + "epoch": 0.9098176718092567, + "grad_norm": 1.8422944740769531, + "learning_rate": 2.117688006844737e-07, + "loss": 0.3132, + "step": 6487 + }, + { + "epoch": 0.9099579242636746, + "grad_norm": 1.784975873319275, + "learning_rate": 2.1111529435796584e-07, + "loss": 0.3492, + "step": 6488 + }, + { + "epoch": 0.9100981767180926, + "grad_norm": 2.1534436525758776, + "learning_rate": 2.104627761843592e-07, + "loss": 0.3334, + "step": 6489 + }, + { + "epoch": 0.9102384291725105, + "grad_norm": 2.2759782679011353, + "learning_rate": 2.0981124629829651e-07, + "loss": 0.3185, + "step": 6490 + }, + { + "epoch": 0.9103786816269285, + "grad_norm": 2.0355962178880844, + "learning_rate": 2.0916070483421592e-07, + "loss": 0.3262, + "step": 6491 + }, + { + "epoch": 0.9105189340813464, + "grad_norm": 2.61705030394832, + "learning_rate": 2.0851115192635218e-07, + "loss": 0.3758, + "step": 6492 + }, + { + "epoch": 0.9106591865357644, + "grad_norm": 2.392020223538441, + "learning_rate": 2.0786258770873647e-07, + "loss": 0.3347, + "step": 6493 + }, + { + "epoch": 0.9107994389901823, + "grad_norm": 1.8840452092247766, + "learning_rate": 2.0721501231519558e-07, + "loss": 0.3383, + "step": 6494 + }, + { + "epoch": 0.9109396914446003, + "grad_norm": 1.5870277903959382, + "learning_rate": 2.065684258793521e-07, + "loss": 0.32, + "step": 6495 + }, + { + "epoch": 0.9110799438990183, + "grad_norm": 1.963483396385638, + "learning_rate": 2.0592282853462377e-07, + "loss": 0.3402, + "step": 6496 + }, + { + "epoch": 0.9112201963534362, + "grad_norm": 1.9601562889223791, + "learning_rate": 2.0527822041422563e-07, + "loss": 0.3095, + "step": 6497 + }, + { + "epoch": 0.9113604488078542, + "grad_norm": 1.6955009390879858, + "learning_rate": 2.04634601651168e-07, + "loss": 0.3416, + "step": 6498 + }, + { + "epoch": 0.9115007012622721, + "grad_norm": 3.94637654374392, + "learning_rate": 2.039919723782574e-07, + "loss": 0.335, + "step": 6499 + }, + { + "epoch": 0.9116409537166901, + "grad_norm": 2.609268460411564, + "learning_rate": 2.0335033272809612e-07, + "loss": 0.3162, + "step": 6500 + }, + { + "epoch": 0.911781206171108, + "grad_norm": 1.8922804843474608, + "learning_rate": 2.0270968283308102e-07, + "loss": 0.3167, + "step": 6501 + }, + { + "epoch": 0.911921458625526, + "grad_norm": 1.866661859687792, + "learning_rate": 2.0207002282540744e-07, + "loss": 0.3533, + "step": 6502 + }, + { + "epoch": 0.9120617110799439, + "grad_norm": 1.711954868176322, + "learning_rate": 2.014313528370626e-07, + "loss": 0.2689, + "step": 6503 + }, + { + "epoch": 0.9122019635343619, + "grad_norm": 2.61727367657796, + "learning_rate": 2.0079367299983276e-07, + "loss": 0.3634, + "step": 6504 + }, + { + "epoch": 0.9123422159887798, + "grad_norm": 1.664704391539879, + "learning_rate": 2.0015698344529877e-07, + "loss": 0.3155, + "step": 6505 + }, + { + "epoch": 0.9124824684431978, + "grad_norm": 1.683728984880033, + "learning_rate": 1.9952128430483718e-07, + "loss": 0.3319, + "step": 6506 + }, + { + "epoch": 0.9126227208976158, + "grad_norm": 2.766460917378963, + "learning_rate": 1.9888657570961924e-07, + "loss": 0.3634, + "step": 6507 + }, + { + "epoch": 0.9127629733520337, + "grad_norm": 2.0925759966869544, + "learning_rate": 1.9825285779061344e-07, + "loss": 0.283, + "step": 6508 + }, + { + "epoch": 0.9129032258064517, + "grad_norm": 2.6910608344032707, + "learning_rate": 1.9762013067858243e-07, + "loss": 0.3418, + "step": 6509 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 2.8313984341479577, + "learning_rate": 1.9698839450408568e-07, + "loss": 0.3192, + "step": 6510 + }, + { + "epoch": 0.9131837307152875, + "grad_norm": 1.5782042379598649, + "learning_rate": 1.9635764939747782e-07, + "loss": 0.3261, + "step": 6511 + }, + { + "epoch": 0.9133239831697054, + "grad_norm": 1.8616428169264159, + "learning_rate": 1.9572789548890748e-07, + "loss": 0.3028, + "step": 6512 + }, + { + "epoch": 0.9134642356241234, + "grad_norm": 2.494612897760707, + "learning_rate": 1.9509913290832073e-07, + "loss": 0.3678, + "step": 6513 + }, + { + "epoch": 0.9136044880785413, + "grad_norm": 2.3881481029967393, + "learning_rate": 1.9447136178545766e-07, + "loss": 0.3908, + "step": 6514 + }, + { + "epoch": 0.9137447405329593, + "grad_norm": 2.2436378387214755, + "learning_rate": 1.938445822498558e-07, + "loss": 0.3419, + "step": 6515 + }, + { + "epoch": 0.9138849929873772, + "grad_norm": 1.7663563926709596, + "learning_rate": 1.932187944308461e-07, + "loss": 0.3199, + "step": 6516 + }, + { + "epoch": 0.9140252454417952, + "grad_norm": 1.846807434085411, + "learning_rate": 1.925939984575548e-07, + "loss": 0.3448, + "step": 6517 + }, + { + "epoch": 0.9141654978962132, + "grad_norm": 1.7260625469239388, + "learning_rate": 1.919701944589042e-07, + "loss": 0.3109, + "step": 6518 + }, + { + "epoch": 0.9143057503506311, + "grad_norm": 2.143209081086744, + "learning_rate": 1.9134738256361306e-07, + "loss": 0.3181, + "step": 6519 + }, + { + "epoch": 0.9144460028050491, + "grad_norm": 4.028731394744626, + "learning_rate": 1.9072556290019362e-07, + "loss": 0.3243, + "step": 6520 + }, + { + "epoch": 0.914586255259467, + "grad_norm": 1.8834519309564963, + "learning_rate": 1.9010473559695376e-07, + "loss": 0.3135, + "step": 6521 + }, + { + "epoch": 0.914726507713885, + "grad_norm": 1.933139138772827, + "learning_rate": 1.8948490078199767e-07, + "loss": 0.3417, + "step": 6522 + }, + { + "epoch": 0.9148667601683029, + "grad_norm": 2.0987715215982483, + "learning_rate": 1.8886605858322304e-07, + "loss": 0.326, + "step": 6523 + }, + { + "epoch": 0.9150070126227209, + "grad_norm": 2.4846504106753216, + "learning_rate": 1.8824820912832387e-07, + "loss": 0.3308, + "step": 6524 + }, + { + "epoch": 0.9151472650771388, + "grad_norm": 1.9761975219051628, + "learning_rate": 1.8763135254478925e-07, + "loss": 0.303, + "step": 6525 + }, + { + "epoch": 0.9152875175315568, + "grad_norm": 1.8039433650200412, + "learning_rate": 1.8701548895990295e-07, + "loss": 0.3252, + "step": 6526 + }, + { + "epoch": 0.9154277699859747, + "grad_norm": 1.8482777233833467, + "learning_rate": 1.8640061850074443e-07, + "loss": 0.353, + "step": 6527 + }, + { + "epoch": 0.9155680224403927, + "grad_norm": 1.7612766971554672, + "learning_rate": 1.857867412941883e-07, + "loss": 0.3308, + "step": 6528 + }, + { + "epoch": 0.9157082748948107, + "grad_norm": 2.139668494600196, + "learning_rate": 1.8517385746690264e-07, + "loss": 0.34, + "step": 6529 + }, + { + "epoch": 0.9158485273492286, + "grad_norm": 2.7150719855444407, + "learning_rate": 1.8456196714535302e-07, + "loss": 0.3444, + "step": 6530 + }, + { + "epoch": 0.9159887798036466, + "grad_norm": 6.318024889716728, + "learning_rate": 1.839510704557984e-07, + "loss": 0.3432, + "step": 6531 + }, + { + "epoch": 0.9161290322580645, + "grad_norm": 1.4105254559091014, + "learning_rate": 1.8334116752429243e-07, + "loss": 0.3195, + "step": 6532 + }, + { + "epoch": 0.9162692847124825, + "grad_norm": 2.309538001062297, + "learning_rate": 1.8273225847668442e-07, + "loss": 0.3074, + "step": 6533 + }, + { + "epoch": 0.9164095371669004, + "grad_norm": 1.7117698421540013, + "learning_rate": 1.8212434343861886e-07, + "loss": 0.3415, + "step": 6534 + }, + { + "epoch": 0.9165497896213184, + "grad_norm": 2.245402725732361, + "learning_rate": 1.8151742253553483e-07, + "loss": 0.2881, + "step": 6535 + }, + { + "epoch": 0.9166900420757363, + "grad_norm": 1.8490478565589743, + "learning_rate": 1.8091149589266554e-07, + "loss": 0.2994, + "step": 6536 + }, + { + "epoch": 0.9168302945301543, + "grad_norm": 1.8075189218546368, + "learning_rate": 1.8030656363504152e-07, + "loss": 0.2875, + "step": 6537 + }, + { + "epoch": 0.9169705469845723, + "grad_norm": 1.7207027786029376, + "learning_rate": 1.79702625887484e-07, + "loss": 0.3254, + "step": 6538 + }, + { + "epoch": 0.9171107994389902, + "grad_norm": 1.7013413775118333, + "learning_rate": 1.7909968277461276e-07, + "loss": 0.2961, + "step": 6539 + }, + { + "epoch": 0.9172510518934082, + "grad_norm": 2.2446303401840373, + "learning_rate": 1.7849773442084051e-07, + "loss": 0.3078, + "step": 6540 + }, + { + "epoch": 0.9173913043478261, + "grad_norm": 1.7442635795083248, + "learning_rate": 1.7789678095037456e-07, + "loss": 0.3159, + "step": 6541 + }, + { + "epoch": 0.9175315568022441, + "grad_norm": 2.2332372369541384, + "learning_rate": 1.7729682248721848e-07, + "loss": 0.2935, + "step": 6542 + }, + { + "epoch": 0.917671809256662, + "grad_norm": 1.814182158605373, + "learning_rate": 1.7669785915516935e-07, + "loss": 0.3287, + "step": 6543 + }, + { + "epoch": 0.91781206171108, + "grad_norm": 1.8916335135200069, + "learning_rate": 1.7609989107781834e-07, + "loss": 0.3098, + "step": 6544 + }, + { + "epoch": 0.9179523141654979, + "grad_norm": 2.346933263063092, + "learning_rate": 1.7550291837855226e-07, + "loss": 0.3237, + "step": 6545 + }, + { + "epoch": 0.9180925666199159, + "grad_norm": 2.5337828220674905, + "learning_rate": 1.7490694118055263e-07, + "loss": 0.3159, + "step": 6546 + }, + { + "epoch": 0.9182328190743339, + "grad_norm": 1.8358312697823946, + "learning_rate": 1.7431195960679436e-07, + "loss": 0.3344, + "step": 6547 + }, + { + "epoch": 0.9183730715287518, + "grad_norm": 2.047690309243461, + "learning_rate": 1.7371797378004874e-07, + "loss": 0.3044, + "step": 6548 + }, + { + "epoch": 0.9185133239831698, + "grad_norm": 1.8988222587854253, + "learning_rate": 1.731249838228799e-07, + "loss": 0.3778, + "step": 6549 + }, + { + "epoch": 0.9186535764375876, + "grad_norm": 2.363687578059927, + "learning_rate": 1.7253298985764777e-07, + "loss": 0.2884, + "step": 6550 + }, + { + "epoch": 0.9187938288920056, + "grad_norm": 2.0565524032523457, + "learning_rate": 1.7194199200650518e-07, + "loss": 0.3357, + "step": 6551 + }, + { + "epoch": 0.9189340813464235, + "grad_norm": 1.9636810684975385, + "learning_rate": 1.7135199039140239e-07, + "loss": 0.3413, + "step": 6552 + }, + { + "epoch": 0.9190743338008415, + "grad_norm": 1.955582682223935, + "learning_rate": 1.7076298513407973e-07, + "loss": 0.3264, + "step": 6553 + }, + { + "epoch": 0.9192145862552594, + "grad_norm": 1.8403737465008818, + "learning_rate": 1.701749763560756e-07, + "loss": 0.3126, + "step": 6554 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 2.2001739649915883, + "learning_rate": 1.695879641787207e-07, + "loss": 0.3524, + "step": 6555 + }, + { + "epoch": 0.9194950911640953, + "grad_norm": 1.7826117279647893, + "learning_rate": 1.69001948723142e-07, + "loss": 0.2939, + "step": 6556 + }, + { + "epoch": 0.9196353436185133, + "grad_norm": 1.7222002944097243, + "learning_rate": 1.684169301102595e-07, + "loss": 0.3165, + "step": 6557 + }, + { + "epoch": 0.9197755960729312, + "grad_norm": 1.9297374147453752, + "learning_rate": 1.6783290846078714e-07, + "loss": 0.3266, + "step": 6558 + }, + { + "epoch": 0.9199158485273492, + "grad_norm": 1.7498777210566778, + "learning_rate": 1.6724988389523356e-07, + "loss": 0.3386, + "step": 6559 + }, + { + "epoch": 0.9200561009817672, + "grad_norm": 2.0468470447757037, + "learning_rate": 1.666678565339025e-07, + "loss": 0.3195, + "step": 6560 + }, + { + "epoch": 0.9201963534361851, + "grad_norm": 2.0537156972498933, + "learning_rate": 1.6608682649689068e-07, + "loss": 0.3448, + "step": 6561 + }, + { + "epoch": 0.9203366058906031, + "grad_norm": 2.138053407471259, + "learning_rate": 1.6550679390408998e-07, + "loss": 0.3311, + "step": 6562 + }, + { + "epoch": 0.920476858345021, + "grad_norm": 2.1394554340154373, + "learning_rate": 1.649277588751863e-07, + "loss": 0.3118, + "step": 6563 + }, + { + "epoch": 0.920617110799439, + "grad_norm": 2.02860275307692, + "learning_rate": 1.6434972152965855e-07, + "loss": 0.3199, + "step": 6564 + }, + { + "epoch": 0.9207573632538569, + "grad_norm": 3.0271142491133465, + "learning_rate": 1.6377268198678131e-07, + "loss": 0.3489, + "step": 6565 + }, + { + "epoch": 0.9208976157082749, + "grad_norm": 2.511646228176392, + "learning_rate": 1.6319664036562266e-07, + "loss": 0.3517, + "step": 6566 + }, + { + "epoch": 0.9210378681626928, + "grad_norm": 2.047690192810529, + "learning_rate": 1.6262159678504475e-07, + "loss": 0.3429, + "step": 6567 + }, + { + "epoch": 0.9211781206171108, + "grad_norm": 6.131734143350992, + "learning_rate": 1.620475513637032e-07, + "loss": 0.311, + "step": 6568 + }, + { + "epoch": 0.9213183730715288, + "grad_norm": 1.902329086466522, + "learning_rate": 1.614745042200494e-07, + "loss": 0.3442, + "step": 6569 + }, + { + "epoch": 0.9214586255259467, + "grad_norm": 1.7882286027363872, + "learning_rate": 1.6090245547232707e-07, + "loss": 0.3331, + "step": 6570 + }, + { + "epoch": 0.9215988779803647, + "grad_norm": 1.9829275904952604, + "learning_rate": 1.6033140523857405e-07, + "loss": 0.3329, + "step": 6571 + }, + { + "epoch": 0.9217391304347826, + "grad_norm": 1.646009274828564, + "learning_rate": 1.5976135363662383e-07, + "loss": 0.3373, + "step": 6572 + }, + { + "epoch": 0.9218793828892006, + "grad_norm": 1.879228401720701, + "learning_rate": 1.5919230078410064e-07, + "loss": 0.3128, + "step": 6573 + }, + { + "epoch": 0.9220196353436185, + "grad_norm": 2.382185556450233, + "learning_rate": 1.5862424679842614e-07, + "loss": 0.3585, + "step": 6574 + }, + { + "epoch": 0.9221598877980365, + "grad_norm": 1.6139994724372086, + "learning_rate": 1.5805719179681377e-07, + "loss": 0.3096, + "step": 6575 + }, + { + "epoch": 0.9223001402524544, + "grad_norm": 2.164650406836998, + "learning_rate": 1.5749113589627108e-07, + "loss": 0.3076, + "step": 6576 + }, + { + "epoch": 0.9224403927068724, + "grad_norm": 1.9358338606143035, + "learning_rate": 1.5692607921360014e-07, + "loss": 0.3191, + "step": 6577 + }, + { + "epoch": 0.9225806451612903, + "grad_norm": 1.9165922924483476, + "learning_rate": 1.5636202186539663e-07, + "loss": 0.3283, + "step": 6578 + }, + { + "epoch": 0.9227208976157083, + "grad_norm": 1.711457961537682, + "learning_rate": 1.557989639680496e-07, + "loss": 0.3365, + "step": 6579 + }, + { + "epoch": 0.9228611500701263, + "grad_norm": 2.067580122219716, + "learning_rate": 1.5523690563774175e-07, + "loss": 0.3176, + "step": 6580 + }, + { + "epoch": 0.9230014025245442, + "grad_norm": 1.844478980313012, + "learning_rate": 1.5467584699045024e-07, + "loss": 0.3226, + "step": 6581 + }, + { + "epoch": 0.9231416549789622, + "grad_norm": 1.7999907307915974, + "learning_rate": 1.5411578814194583e-07, + "loss": 0.332, + "step": 6582 + }, + { + "epoch": 0.9232819074333801, + "grad_norm": 2.083644106256067, + "learning_rate": 1.535567292077922e-07, + "loss": 0.332, + "step": 6583 + }, + { + "epoch": 0.9234221598877981, + "grad_norm": 2.34832398729741, + "learning_rate": 1.5299867030334815e-07, + "loss": 0.3739, + "step": 6584 + }, + { + "epoch": 0.923562412342216, + "grad_norm": 1.806899659428255, + "learning_rate": 1.5244161154376437e-07, + "loss": 0.3198, + "step": 6585 + }, + { + "epoch": 0.923702664796634, + "grad_norm": 2.1579342221478157, + "learning_rate": 1.518855530439861e-07, + "loss": 0.3349, + "step": 6586 + }, + { + "epoch": 0.9238429172510519, + "grad_norm": 1.8378865386885033, + "learning_rate": 1.5133049491875275e-07, + "loss": 0.3376, + "step": 6587 + }, + { + "epoch": 0.9239831697054699, + "grad_norm": 2.114911534688992, + "learning_rate": 1.5077643728259594e-07, + "loss": 0.3455, + "step": 6588 + }, + { + "epoch": 0.9241234221598879, + "grad_norm": 2.6276515103256743, + "learning_rate": 1.502233802498415e-07, + "loss": 0.3219, + "step": 6589 + }, + { + "epoch": 0.9242636746143057, + "grad_norm": 1.6685174758585788, + "learning_rate": 1.4967132393460983e-07, + "loss": 0.3253, + "step": 6590 + }, + { + "epoch": 0.9244039270687237, + "grad_norm": 1.5947954825221315, + "learning_rate": 1.491202684508136e-07, + "loss": 0.3282, + "step": 6591 + }, + { + "epoch": 0.9245441795231416, + "grad_norm": 2.08878352286138, + "learning_rate": 1.4857021391215865e-07, + "loss": 0.2955, + "step": 6592 + }, + { + "epoch": 0.9246844319775596, + "grad_norm": 1.7592033478662086, + "learning_rate": 1.4802116043214575e-07, + "loss": 0.3638, + "step": 6593 + }, + { + "epoch": 0.9248246844319775, + "grad_norm": 1.888648584295217, + "learning_rate": 1.4747310812406768e-07, + "loss": 0.2916, + "step": 6594 + }, + { + "epoch": 0.9249649368863955, + "grad_norm": 2.429384267841834, + "learning_rate": 1.4692605710101116e-07, + "loss": 0.2917, + "step": 6595 + }, + { + "epoch": 0.9251051893408134, + "grad_norm": 1.8467883275280157, + "learning_rate": 1.4638000747585646e-07, + "loss": 0.3678, + "step": 6596 + }, + { + "epoch": 0.9252454417952314, + "grad_norm": 1.9553472465170143, + "learning_rate": 1.4583495936127678e-07, + "loss": 0.3524, + "step": 6597 + }, + { + "epoch": 0.9253856942496493, + "grad_norm": 1.7344101223525308, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.3559, + "step": 6598 + }, + { + "epoch": 0.9255259467040673, + "grad_norm": 2.2156352356323206, + "learning_rate": 1.447478681135056e-07, + "loss": 0.3386, + "step": 6599 + }, + { + "epoch": 0.9256661991584852, + "grad_norm": 1.813486291287178, + "learning_rate": 1.442058252046269e-07, + "loss": 0.3612, + "step": 6600 + }, + { + "epoch": 0.9258064516129032, + "grad_norm": 2.0140367504808836, + "learning_rate": 1.43664784254951e-07, + "loss": 0.3665, + "step": 6601 + }, + { + "epoch": 0.9259467040673212, + "grad_norm": 2.1154893208980035, + "learning_rate": 1.4312474537611752e-07, + "loss": 0.3432, + "step": 6602 + }, + { + "epoch": 0.9260869565217391, + "grad_norm": 1.420607032914001, + "learning_rate": 1.425857086795601e-07, + "loss": 0.3033, + "step": 6603 + }, + { + "epoch": 0.9262272089761571, + "grad_norm": 3.13226054744572, + "learning_rate": 1.420476742765059e-07, + "loss": 0.3281, + "step": 6604 + }, + { + "epoch": 0.926367461430575, + "grad_norm": 1.9455346895966856, + "learning_rate": 1.415106422779733e-07, + "loss": 0.3429, + "step": 6605 + }, + { + "epoch": 0.926507713884993, + "grad_norm": 1.8218811218263558, + "learning_rate": 1.409746127947753e-07, + "loss": 0.3259, + "step": 6606 + }, + { + "epoch": 0.9266479663394109, + "grad_norm": 2.150655371560418, + "learning_rate": 1.4043958593751794e-07, + "loss": 0.3263, + "step": 6607 + }, + { + "epoch": 0.9267882187938289, + "grad_norm": 1.6566892077539037, + "learning_rate": 1.3990556181660065e-07, + "loss": 0.3049, + "step": 6608 + }, + { + "epoch": 0.9269284712482468, + "grad_norm": 3.342102215192838, + "learning_rate": 1.3937254054221526e-07, + "loss": 0.3078, + "step": 6609 + }, + { + "epoch": 0.9270687237026648, + "grad_norm": 1.871233845286181, + "learning_rate": 1.388405222243472e-07, + "loss": 0.3534, + "step": 6610 + }, + { + "epoch": 0.9272089761570828, + "grad_norm": 1.6744606487699778, + "learning_rate": 1.3830950697277468e-07, + "loss": 0.3173, + "step": 6611 + }, + { + "epoch": 0.9273492286115007, + "grad_norm": 1.871660501102518, + "learning_rate": 1.3777949489706898e-07, + "loss": 0.3467, + "step": 6612 + }, + { + "epoch": 0.9274894810659187, + "grad_norm": 2.239072756267944, + "learning_rate": 1.3725048610659487e-07, + "loss": 0.3303, + "step": 6613 + }, + { + "epoch": 0.9276297335203366, + "grad_norm": 1.5782304483064018, + "learning_rate": 1.367224807105083e-07, + "loss": 0.2848, + "step": 6614 + }, + { + "epoch": 0.9277699859747546, + "grad_norm": 2.122793061386278, + "learning_rate": 1.3619547881776052e-07, + "loss": 0.2982, + "step": 6615 + }, + { + "epoch": 0.9279102384291725, + "grad_norm": 1.9083302513108775, + "learning_rate": 1.356694805370945e-07, + "loss": 0.3444, + "step": 6616 + }, + { + "epoch": 0.9280504908835905, + "grad_norm": 2.4186182582571405, + "learning_rate": 1.3514448597704623e-07, + "loss": 0.3216, + "step": 6617 + }, + { + "epoch": 0.9281907433380084, + "grad_norm": 2.068188538669029, + "learning_rate": 1.3462049524594456e-07, + "loss": 0.331, + "step": 6618 + }, + { + "epoch": 0.9283309957924264, + "grad_norm": 2.415676429270153, + "learning_rate": 1.3409750845191138e-07, + "loss": 0.3587, + "step": 6619 + }, + { + "epoch": 0.9284712482468443, + "grad_norm": 1.6002750339156029, + "learning_rate": 1.335755257028626e-07, + "loss": 0.2921, + "step": 6620 + }, + { + "epoch": 0.9286115007012623, + "grad_norm": 2.794741536099729, + "learning_rate": 1.330545471065031e-07, + "loss": 0.3061, + "step": 6621 + }, + { + "epoch": 0.9287517531556803, + "grad_norm": 1.9764234772895763, + "learning_rate": 1.3253457277033533e-07, + "loss": 0.3683, + "step": 6622 + }, + { + "epoch": 0.9288920056100982, + "grad_norm": 2.168578136131443, + "learning_rate": 1.3201560280165117e-07, + "loss": 0.3605, + "step": 6623 + }, + { + "epoch": 0.9290322580645162, + "grad_norm": 2.134183338323099, + "learning_rate": 1.3149763730753772e-07, + "loss": 0.338, + "step": 6624 + }, + { + "epoch": 0.9291725105189341, + "grad_norm": 1.782302679239299, + "learning_rate": 1.3098067639487232e-07, + "loss": 0.2931, + "step": 6625 + }, + { + "epoch": 0.9293127629733521, + "grad_norm": 1.9152591001162227, + "learning_rate": 1.3046472017032685e-07, + "loss": 0.3095, + "step": 6626 + }, + { + "epoch": 0.92945301542777, + "grad_norm": 1.9278451255823978, + "learning_rate": 1.2994976874036503e-07, + "loss": 0.2964, + "step": 6627 + }, + { + "epoch": 0.929593267882188, + "grad_norm": 1.8916573346393755, + "learning_rate": 1.2943582221124296e-07, + "loss": 0.3704, + "step": 6628 + }, + { + "epoch": 0.929733520336606, + "grad_norm": 3.2316006911023827, + "learning_rate": 1.2892288068901136e-07, + "loss": 0.3516, + "step": 6629 + }, + { + "epoch": 0.9298737727910238, + "grad_norm": 2.1125142791084883, + "learning_rate": 1.284109442795106e-07, + "loss": 0.337, + "step": 6630 + }, + { + "epoch": 0.9300140252454417, + "grad_norm": 2.1681403014895877, + "learning_rate": 1.2790001308837618e-07, + "loss": 0.3384, + "step": 6631 + }, + { + "epoch": 0.9301542776998597, + "grad_norm": 1.4015100067412902, + "learning_rate": 1.2739008722103486e-07, + "loss": 0.3236, + "step": 6632 + }, + { + "epoch": 0.9302945301542777, + "grad_norm": 2.4628169557714794, + "learning_rate": 1.2688116678270636e-07, + "loss": 0.3647, + "step": 6633 + }, + { + "epoch": 0.9304347826086956, + "grad_norm": 2.7381745016973635, + "learning_rate": 1.2637325187840332e-07, + "loss": 0.3959, + "step": 6634 + }, + { + "epoch": 0.9305750350631136, + "grad_norm": 1.8364509027005826, + "learning_rate": 1.2586634261292918e-07, + "loss": 0.319, + "step": 6635 + }, + { + "epoch": 0.9307152875175315, + "grad_norm": 1.6682706108586483, + "learning_rate": 1.253604390908819e-07, + "loss": 0.3287, + "step": 6636 + }, + { + "epoch": 0.9308555399719495, + "grad_norm": 2.8326916996594993, + "learning_rate": 1.2485554141665134e-07, + "loss": 0.3121, + "step": 6637 + }, + { + "epoch": 0.9309957924263674, + "grad_norm": 2.032210020468175, + "learning_rate": 1.2435164969441915e-07, + "loss": 0.2921, + "step": 6638 + }, + { + "epoch": 0.9311360448807854, + "grad_norm": 1.9140064464847182, + "learning_rate": 1.2384876402815993e-07, + "loss": 0.3429, + "step": 6639 + }, + { + "epoch": 0.9312762973352033, + "grad_norm": 2.026400131509312, + "learning_rate": 1.2334688452164122e-07, + "loss": 0.3125, + "step": 6640 + }, + { + "epoch": 0.9314165497896213, + "grad_norm": 1.953947885733953, + "learning_rate": 1.2284601127842187e-07, + "loss": 0.337, + "step": 6641 + }, + { + "epoch": 0.9315568022440393, + "grad_norm": 1.870401847827674, + "learning_rate": 1.2234614440185365e-07, + "loss": 0.3916, + "step": 6642 + }, + { + "epoch": 0.9316970546984572, + "grad_norm": 1.7539224942480975, + "learning_rate": 1.2184728399508016e-07, + "loss": 0.2934, + "step": 6643 + }, + { + "epoch": 0.9318373071528752, + "grad_norm": 1.8106569587960397, + "learning_rate": 1.2134943016103794e-07, + "loss": 0.3192, + "step": 6644 + }, + { + "epoch": 0.9319775596072931, + "grad_norm": 1.7074997038722306, + "learning_rate": 1.208525830024565e-07, + "loss": 0.3364, + "step": 6645 + }, + { + "epoch": 0.9321178120617111, + "grad_norm": 2.174307688882564, + "learning_rate": 1.2035674262185603e-07, + "loss": 0.3045, + "step": 6646 + }, + { + "epoch": 0.932258064516129, + "grad_norm": 1.4831958303405273, + "learning_rate": 1.198619091215497e-07, + "loss": 0.334, + "step": 6647 + }, + { + "epoch": 0.932398316970547, + "grad_norm": 2.7656068962927067, + "learning_rate": 1.1936808260364252e-07, + "loss": 0.3343, + "step": 6648 + }, + { + "epoch": 0.9325385694249649, + "grad_norm": 1.731182355970807, + "learning_rate": 1.1887526317003351e-07, + "loss": 0.3103, + "step": 6649 + }, + { + "epoch": 0.9326788218793829, + "grad_norm": 2.330214004839386, + "learning_rate": 1.1838345092241132e-07, + "loss": 0.3474, + "step": 6650 + }, + { + "epoch": 0.9328190743338008, + "grad_norm": 1.5974586569819582, + "learning_rate": 1.1789264596225814e-07, + "loss": 0.309, + "step": 6651 + }, + { + "epoch": 0.9329593267882188, + "grad_norm": 1.91959177247065, + "learning_rate": 1.1740284839084848e-07, + "loss": 0.3632, + "step": 6652 + }, + { + "epoch": 0.9330995792426368, + "grad_norm": 1.8530330049732362, + "learning_rate": 1.1691405830924873e-07, + "loss": 0.31, + "step": 6653 + }, + { + "epoch": 0.9332398316970547, + "grad_norm": 1.7005358272278972, + "learning_rate": 1.1642627581831767e-07, + "loss": 0.3176, + "step": 6654 + }, + { + "epoch": 0.9333800841514727, + "grad_norm": 2.117864971932148, + "learning_rate": 1.1593950101870422e-07, + "loss": 0.3399, + "step": 6655 + }, + { + "epoch": 0.9335203366058906, + "grad_norm": 1.7568738677768594, + "learning_rate": 1.1545373401085247e-07, + "loss": 0.3267, + "step": 6656 + }, + { + "epoch": 0.9336605890603086, + "grad_norm": 1.8403935673911973, + "learning_rate": 1.149689748949967e-07, + "loss": 0.324, + "step": 6657 + }, + { + "epoch": 0.9338008415147265, + "grad_norm": 1.8586898952019615, + "learning_rate": 1.14485223771163e-07, + "loss": 0.3058, + "step": 6658 + }, + { + "epoch": 0.9339410939691445, + "grad_norm": 2.1421602410182725, + "learning_rate": 1.1400248073917042e-07, + "loss": 0.2896, + "step": 6659 + }, + { + "epoch": 0.9340813464235624, + "grad_norm": 1.8791247456929219, + "learning_rate": 1.1352074589862983e-07, + "loss": 0.3249, + "step": 6660 + }, + { + "epoch": 0.9342215988779804, + "grad_norm": 2.6790799266800605, + "learning_rate": 1.1304001934894393e-07, + "loss": 0.2742, + "step": 6661 + }, + { + "epoch": 0.9343618513323984, + "grad_norm": 1.7939718262151678, + "learning_rate": 1.1256030118930727e-07, + "loss": 0.3381, + "step": 6662 + }, + { + "epoch": 0.9345021037868163, + "grad_norm": 1.650901652114266, + "learning_rate": 1.1208159151870567e-07, + "loss": 0.3402, + "step": 6663 + }, + { + "epoch": 0.9346423562412343, + "grad_norm": 1.7405745904769603, + "learning_rate": 1.116038904359179e-07, + "loss": 0.294, + "step": 6664 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 1.9748711869930793, + "learning_rate": 1.1112719803951455e-07, + "loss": 0.3481, + "step": 6665 + }, + { + "epoch": 0.9349228611500702, + "grad_norm": 2.0185098038672256, + "learning_rate": 1.1065151442785749e-07, + "loss": 0.3115, + "step": 6666 + }, + { + "epoch": 0.9350631136044881, + "grad_norm": 2.0087902493767764, + "learning_rate": 1.1017683969910042e-07, + "loss": 0.3115, + "step": 6667 + }, + { + "epoch": 0.9352033660589061, + "grad_norm": 2.3521472292640966, + "learning_rate": 1.0970317395119001e-07, + "loss": 0.3321, + "step": 6668 + }, + { + "epoch": 0.935343618513324, + "grad_norm": 2.2172591882666493, + "learning_rate": 1.0923051728186251e-07, + "loss": 0.3421, + "step": 6669 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 2.1276466493011066, + "learning_rate": 1.0875886978864881e-07, + "loss": 0.3583, + "step": 6670 + }, + { + "epoch": 0.9356241234221598, + "grad_norm": 1.8247800041227011, + "learning_rate": 1.0828823156886881e-07, + "loss": 0.3307, + "step": 6671 + }, + { + "epoch": 0.9357643758765778, + "grad_norm": 1.9219088667700115, + "learning_rate": 1.0781860271963651e-07, + "loss": 0.3331, + "step": 6672 + }, + { + "epoch": 0.9359046283309957, + "grad_norm": 1.9015074346589105, + "learning_rate": 1.0734998333785607e-07, + "loss": 0.3184, + "step": 6673 + }, + { + "epoch": 0.9360448807854137, + "grad_norm": 2.6042317395027363, + "learning_rate": 1.0688237352022346e-07, + "loss": 0.4029, + "step": 6674 + }, + { + "epoch": 0.9361851332398317, + "grad_norm": 1.5007378035238323, + "learning_rate": 1.0641577336322761e-07, + "loss": 0.3433, + "step": 6675 + }, + { + "epoch": 0.9363253856942496, + "grad_norm": 2.0792788407129343, + "learning_rate": 1.0595018296314763e-07, + "loss": 0.321, + "step": 6676 + }, + { + "epoch": 0.9364656381486676, + "grad_norm": 2.0656938965542486, + "learning_rate": 1.0548560241605444e-07, + "loss": 0.3342, + "step": 6677 + }, + { + "epoch": 0.9366058906030855, + "grad_norm": 1.480984157737675, + "learning_rate": 1.0502203181781135e-07, + "loss": 0.2905, + "step": 6678 + }, + { + "epoch": 0.9367461430575035, + "grad_norm": 2.0277413215437385, + "learning_rate": 1.0455947126407296e-07, + "loss": 0.3149, + "step": 6679 + }, + { + "epoch": 0.9368863955119214, + "grad_norm": 2.0502269758781364, + "learning_rate": 1.040979208502857e-07, + "loss": 0.3087, + "step": 6680 + }, + { + "epoch": 0.9370266479663394, + "grad_norm": 2.528514184219136, + "learning_rate": 1.0363738067168672e-07, + "loss": 0.3182, + "step": 6681 + }, + { + "epoch": 0.9371669004207573, + "grad_norm": 1.968403437553339, + "learning_rate": 1.0317785082330555e-07, + "loss": 0.3744, + "step": 6682 + }, + { + "epoch": 0.9373071528751753, + "grad_norm": 2.2301071060871407, + "learning_rate": 1.027193313999636e-07, + "loss": 0.2949, + "step": 6683 + }, + { + "epoch": 0.9374474053295933, + "grad_norm": 1.750376660819731, + "learning_rate": 1.0226182249627181e-07, + "loss": 0.3229, + "step": 6684 + }, + { + "epoch": 0.9375876577840112, + "grad_norm": 7.738464137635623, + "learning_rate": 1.0180532420663525e-07, + "loss": 0.3512, + "step": 6685 + }, + { + "epoch": 0.9377279102384292, + "grad_norm": 2.8562025804986537, + "learning_rate": 1.0134983662524856e-07, + "loss": 0.3003, + "step": 6686 + }, + { + "epoch": 0.9378681626928471, + "grad_norm": 2.171436292950088, + "learning_rate": 1.0089535984609766e-07, + "loss": 0.2919, + "step": 6687 + }, + { + "epoch": 0.9380084151472651, + "grad_norm": 1.5988770836017374, + "learning_rate": 1.0044189396296144e-07, + "loss": 0.3163, + "step": 6688 + }, + { + "epoch": 0.938148667601683, + "grad_norm": 2.519233248334775, + "learning_rate": 9.998943906941005e-08, + "loss": 0.3628, + "step": 6689 + }, + { + "epoch": 0.938288920056101, + "grad_norm": 1.7104082377602061, + "learning_rate": 9.953799525880325e-08, + "loss": 0.2953, + "step": 6690 + }, + { + "epoch": 0.9384291725105189, + "grad_norm": 1.9062556282335585, + "learning_rate": 9.908756262429376e-08, + "loss": 0.3263, + "step": 6691 + }, + { + "epoch": 0.9385694249649369, + "grad_norm": 1.4382548630763323, + "learning_rate": 9.863814125882498e-08, + "loss": 0.3036, + "step": 6692 + }, + { + "epoch": 0.9387096774193548, + "grad_norm": 2.4152025413482123, + "learning_rate": 9.818973125513275e-08, + "loss": 0.3388, + "step": 6693 + }, + { + "epoch": 0.9388499298737728, + "grad_norm": 2.0095498253690725, + "learning_rate": 9.774233270574252e-08, + "loss": 0.34, + "step": 6694 + }, + { + "epoch": 0.9389901823281908, + "grad_norm": 1.9968226346834501, + "learning_rate": 9.729594570297207e-08, + "loss": 0.3456, + "step": 6695 + }, + { + "epoch": 0.9391304347826087, + "grad_norm": 2.3881906319379778, + "learning_rate": 9.685057033892998e-08, + "loss": 0.3246, + "step": 6696 + }, + { + "epoch": 0.9392706872370267, + "grad_norm": 2.515793883536749, + "learning_rate": 9.640620670551659e-08, + "loss": 0.3152, + "step": 6697 + }, + { + "epoch": 0.9394109396914446, + "grad_norm": 1.6561729125311107, + "learning_rate": 9.596285489442359e-08, + "loss": 0.365, + "step": 6698 + }, + { + "epoch": 0.9395511921458626, + "grad_norm": 1.8681782284731467, + "learning_rate": 9.552051499713278e-08, + "loss": 0.3052, + "step": 6699 + }, + { + "epoch": 0.9396914446002805, + "grad_norm": 2.5307647805155846, + "learning_rate": 9.507918710491838e-08, + "loss": 0.4002, + "step": 6700 + }, + { + "epoch": 0.9398316970546985, + "grad_norm": 1.705043656917329, + "learning_rate": 9.46388713088453e-08, + "loss": 0.3678, + "step": 6701 + }, + { + "epoch": 0.9399719495091164, + "grad_norm": 1.6747395583409663, + "learning_rate": 9.419956769976979e-08, + "loss": 0.3711, + "step": 6702 + }, + { + "epoch": 0.9401122019635344, + "grad_norm": 1.8401993005042068, + "learning_rate": 9.376127636833876e-08, + "loss": 0.3355, + "step": 6703 + }, + { + "epoch": 0.9402524544179524, + "grad_norm": 1.5423021639138137, + "learning_rate": 9.332399740499043e-08, + "loss": 0.3463, + "step": 6704 + }, + { + "epoch": 0.9403927068723703, + "grad_norm": 1.5704777568953194, + "learning_rate": 9.288773089995484e-08, + "loss": 0.2712, + "step": 6705 + }, + { + "epoch": 0.9405329593267883, + "grad_norm": 2.2328915239662432, + "learning_rate": 9.245247694325166e-08, + "loss": 0.3356, + "step": 6706 + }, + { + "epoch": 0.9406732117812062, + "grad_norm": 1.823704649143701, + "learning_rate": 9.201823562469347e-08, + "loss": 0.3364, + "step": 6707 + }, + { + "epoch": 0.9408134642356242, + "grad_norm": 3.1361528886194145, + "learning_rate": 9.158500703388252e-08, + "loss": 0.3633, + "step": 6708 + }, + { + "epoch": 0.9409537166900421, + "grad_norm": 1.9773325268303703, + "learning_rate": 9.115279126021226e-08, + "loss": 0.2843, + "step": 6709 + }, + { + "epoch": 0.94109396914446, + "grad_norm": 1.685644047523068, + "learning_rate": 9.072158839286748e-08, + "loss": 0.3549, + "step": 6710 + }, + { + "epoch": 0.9412342215988779, + "grad_norm": 1.6767550397882214, + "learning_rate": 9.029139852082425e-08, + "loss": 0.2785, + "step": 6711 + }, + { + "epoch": 0.9413744740532959, + "grad_norm": 1.8905452364655115, + "learning_rate": 8.986222173284876e-08, + "loss": 0.3298, + "step": 6712 + }, + { + "epoch": 0.9415147265077138, + "grad_norm": 2.266881870357177, + "learning_rate": 8.94340581174985e-08, + "loss": 0.3333, + "step": 6713 + }, + { + "epoch": 0.9416549789621318, + "grad_norm": 1.9823558476919985, + "learning_rate": 8.900690776312282e-08, + "loss": 0.3121, + "step": 6714 + }, + { + "epoch": 0.9417952314165497, + "grad_norm": 2.0148965864882356, + "learning_rate": 8.85807707578612e-08, + "loss": 0.2502, + "step": 6715 + }, + { + "epoch": 0.9419354838709677, + "grad_norm": 1.6007710059505142, + "learning_rate": 8.815564718964331e-08, + "loss": 0.3006, + "step": 6716 + }, + { + "epoch": 0.9420757363253857, + "grad_norm": 1.899377597958946, + "learning_rate": 8.773153714619064e-08, + "loss": 0.3231, + "step": 6717 + }, + { + "epoch": 0.9422159887798036, + "grad_norm": 1.8475743733303898, + "learning_rate": 8.730844071501599e-08, + "loss": 0.3441, + "step": 6718 + }, + { + "epoch": 0.9423562412342216, + "grad_norm": 1.6455136805579718, + "learning_rate": 8.688635798342116e-08, + "loss": 0.3173, + "step": 6719 + }, + { + "epoch": 0.9424964936886395, + "grad_norm": 1.6979893237835335, + "learning_rate": 8.646528903850093e-08, + "loss": 0.3171, + "step": 6720 + }, + { + "epoch": 0.9426367461430575, + "grad_norm": 2.4574258605701567, + "learning_rate": 8.604523396713915e-08, + "loss": 0.3302, + "step": 6721 + }, + { + "epoch": 0.9427769985974754, + "grad_norm": 2.1320135275892906, + "learning_rate": 8.562619285601259e-08, + "loss": 0.3699, + "step": 6722 + }, + { + "epoch": 0.9429172510518934, + "grad_norm": 1.8041230346948087, + "learning_rate": 8.520816579158598e-08, + "loss": 0.3006, + "step": 6723 + }, + { + "epoch": 0.9430575035063113, + "grad_norm": 1.741294871653738, + "learning_rate": 8.479115286011752e-08, + "loss": 0.3084, + "step": 6724 + }, + { + "epoch": 0.9431977559607293, + "grad_norm": 2.2473719401468952, + "learning_rate": 8.437515414765341e-08, + "loss": 0.3175, + "step": 6725 + }, + { + "epoch": 0.9433380084151473, + "grad_norm": 5.341308214337735, + "learning_rate": 8.396016974003385e-08, + "loss": 0.3475, + "step": 6726 + }, + { + "epoch": 0.9434782608695652, + "grad_norm": 2.320098147782597, + "learning_rate": 8.354619972288703e-08, + "loss": 0.3404, + "step": 6727 + }, + { + "epoch": 0.9436185133239832, + "grad_norm": 1.706750632818319, + "learning_rate": 8.313324418163238e-08, + "loss": 0.3166, + "step": 6728 + }, + { + "epoch": 0.9437587657784011, + "grad_norm": 1.4860137727337914, + "learning_rate": 8.272130320148063e-08, + "loss": 0.2915, + "step": 6729 + }, + { + "epoch": 0.9438990182328191, + "grad_norm": 2.1778567703133076, + "learning_rate": 8.231037686743326e-08, + "loss": 0.3054, + "step": 6730 + }, + { + "epoch": 0.944039270687237, + "grad_norm": 2.0111889187752587, + "learning_rate": 8.190046526428241e-08, + "loss": 0.2894, + "step": 6731 + }, + { + "epoch": 0.944179523141655, + "grad_norm": 2.107421875, + "learning_rate": 8.149156847660933e-08, + "loss": 0.3369, + "step": 6732 + }, + { + "epoch": 0.9443197755960729, + "grad_norm": 1.9532220434875733, + "learning_rate": 8.108368658878818e-08, + "loss": 0.3248, + "step": 6733 + }, + { + "epoch": 0.9444600280504909, + "grad_norm": 5.560519090749869, + "learning_rate": 8.067681968498164e-08, + "loss": 0.3306, + "step": 6734 + }, + { + "epoch": 0.9446002805049089, + "grad_norm": 1.8531577404918933, + "learning_rate": 8.027096784914479e-08, + "loss": 0.3191, + "step": 6735 + }, + { + "epoch": 0.9447405329593268, + "grad_norm": 2.5112560552027507, + "learning_rate": 7.986613116502173e-08, + "loss": 0.3602, + "step": 6736 + }, + { + "epoch": 0.9448807854137448, + "grad_norm": 2.0638125318068186, + "learning_rate": 7.946230971614732e-08, + "loss": 0.3504, + "step": 6737 + }, + { + "epoch": 0.9450210378681627, + "grad_norm": 1.9234644202324795, + "learning_rate": 7.905950358584768e-08, + "loss": 0.3603, + "step": 6738 + }, + { + "epoch": 0.9451612903225807, + "grad_norm": 1.640491371162321, + "learning_rate": 7.865771285723911e-08, + "loss": 0.3496, + "step": 6739 + }, + { + "epoch": 0.9453015427769986, + "grad_norm": 2.5441573920415648, + "learning_rate": 7.825693761322861e-08, + "loss": 0.3406, + "step": 6740 + }, + { + "epoch": 0.9454417952314166, + "grad_norm": 2.2453493161723728, + "learning_rate": 7.785717793651282e-08, + "loss": 0.3872, + "step": 6741 + }, + { + "epoch": 0.9455820476858345, + "grad_norm": 1.8699859334039226, + "learning_rate": 7.745843390957908e-08, + "loss": 0.3319, + "step": 6742 + }, + { + "epoch": 0.9457223001402525, + "grad_norm": 2.010658236469907, + "learning_rate": 7.706070561470657e-08, + "loss": 0.3546, + "step": 6743 + }, + { + "epoch": 0.9458625525946704, + "grad_norm": 1.8456122481967792, + "learning_rate": 7.666399313396245e-08, + "loss": 0.3384, + "step": 6744 + }, + { + "epoch": 0.9460028050490884, + "grad_norm": 1.9208561127057369, + "learning_rate": 7.626829654920732e-08, + "loss": 0.3176, + "step": 6745 + }, + { + "epoch": 0.9461430575035064, + "grad_norm": 2.0927113476192423, + "learning_rate": 7.587361594208808e-08, + "loss": 0.3233, + "step": 6746 + }, + { + "epoch": 0.9462833099579243, + "grad_norm": 2.2825182172167366, + "learning_rate": 7.54799513940463e-08, + "loss": 0.32, + "step": 6747 + }, + { + "epoch": 0.9464235624123423, + "grad_norm": 1.4624331564992614, + "learning_rate": 7.508730298631084e-08, + "loss": 0.2874, + "step": 6748 + }, + { + "epoch": 0.9465638148667602, + "grad_norm": 1.6642034050113415, + "learning_rate": 7.469567079990248e-08, + "loss": 0.353, + "step": 6749 + }, + { + "epoch": 0.9467040673211781, + "grad_norm": 1.8105449655928105, + "learning_rate": 7.430505491563101e-08, + "loss": 0.3307, + "step": 6750 + }, + { + "epoch": 0.946844319775596, + "grad_norm": 2.0276922907583823, + "learning_rate": 7.391545541409806e-08, + "loss": 0.2869, + "step": 6751 + }, + { + "epoch": 0.946984572230014, + "grad_norm": 2.3169712451900644, + "learning_rate": 7.352687237569489e-08, + "loss": 0.2957, + "step": 6752 + }, + { + "epoch": 0.9471248246844319, + "grad_norm": 1.8697647758682472, + "learning_rate": 7.31393058806018e-08, + "loss": 0.3187, + "step": 6753 + }, + { + "epoch": 0.9472650771388499, + "grad_norm": 1.7739001103461847, + "learning_rate": 7.275275600879206e-08, + "loss": 0.3277, + "step": 6754 + }, + { + "epoch": 0.9474053295932678, + "grad_norm": 2.775332467035578, + "learning_rate": 7.236722284002573e-08, + "loss": 0.3454, + "step": 6755 + }, + { + "epoch": 0.9475455820476858, + "grad_norm": 1.9544554794522409, + "learning_rate": 7.198270645385641e-08, + "loss": 0.3635, + "step": 6756 + }, + { + "epoch": 0.9476858345021038, + "grad_norm": 2.2120813437020765, + "learning_rate": 7.159920692962563e-08, + "loss": 0.342, + "step": 6757 + }, + { + "epoch": 0.9478260869565217, + "grad_norm": 3.272276315060281, + "learning_rate": 7.12167243464662e-08, + "loss": 0.2782, + "step": 6758 + }, + { + "epoch": 0.9479663394109397, + "grad_norm": 2.099498089347273, + "learning_rate": 7.08352587833e-08, + "loss": 0.325, + "step": 6759 + }, + { + "epoch": 0.9481065918653576, + "grad_norm": 1.919260992044781, + "learning_rate": 7.045481031884071e-08, + "loss": 0.3272, + "step": 6760 + }, + { + "epoch": 0.9482468443197756, + "grad_norm": 1.8680636852076475, + "learning_rate": 7.007537903159057e-08, + "loss": 0.3586, + "step": 6761 + }, + { + "epoch": 0.9483870967741935, + "grad_norm": 2.195413281716921, + "learning_rate": 6.969696499984246e-08, + "loss": 0.3797, + "step": 6762 + }, + { + "epoch": 0.9485273492286115, + "grad_norm": 1.8759226754038234, + "learning_rate": 6.931956830168007e-08, + "loss": 0.3632, + "step": 6763 + }, + { + "epoch": 0.9486676016830294, + "grad_norm": 2.071448939087052, + "learning_rate": 6.894318901497665e-08, + "loss": 0.3281, + "step": 6764 + }, + { + "epoch": 0.9488078541374474, + "grad_norm": 1.8984849711946763, + "learning_rate": 6.856782721739452e-08, + "loss": 0.31, + "step": 6765 + }, + { + "epoch": 0.9489481065918653, + "grad_norm": 1.8518359147375056, + "learning_rate": 6.819348298638839e-08, + "loss": 0.3042, + "step": 6766 + }, + { + "epoch": 0.9490883590462833, + "grad_norm": 2.034163044143987, + "learning_rate": 6.782015639919982e-08, + "loss": 0.3073, + "step": 6767 + }, + { + "epoch": 0.9492286115007013, + "grad_norm": 1.9289667428265918, + "learning_rate": 6.744784753286382e-08, + "loss": 0.317, + "step": 6768 + }, + { + "epoch": 0.9493688639551192, + "grad_norm": 1.9879177038440172, + "learning_rate": 6.70765564642023e-08, + "loss": 0.345, + "step": 6769 + }, + { + "epoch": 0.9495091164095372, + "grad_norm": 1.8874829878893036, + "learning_rate": 6.670628326982953e-08, + "loss": 0.3215, + "step": 6770 + }, + { + "epoch": 0.9496493688639551, + "grad_norm": 1.822936561112609, + "learning_rate": 6.633702802614828e-08, + "loss": 0.323, + "step": 6771 + }, + { + "epoch": 0.9497896213183731, + "grad_norm": 1.9085431298731017, + "learning_rate": 6.596879080935203e-08, + "loss": 0.3265, + "step": 6772 + }, + { + "epoch": 0.949929873772791, + "grad_norm": 2.0445116682794837, + "learning_rate": 6.560157169542391e-08, + "loss": 0.3244, + "step": 6773 + }, + { + "epoch": 0.950070126227209, + "grad_norm": 2.3498078450310222, + "learning_rate": 6.52353707601372e-08, + "loss": 0.3152, + "step": 6774 + }, + { + "epoch": 0.9502103786816269, + "grad_norm": 2.849720650082794, + "learning_rate": 6.487018807905421e-08, + "loss": 0.3192, + "step": 6775 + }, + { + "epoch": 0.9503506311360449, + "grad_norm": 2.029606315510344, + "learning_rate": 6.450602372752912e-08, + "loss": 0.3417, + "step": 6776 + }, + { + "epoch": 0.9504908835904629, + "grad_norm": 2.4383671514113265, + "learning_rate": 6.414287778070404e-08, + "loss": 0.3406, + "step": 6777 + }, + { + "epoch": 0.9506311360448808, + "grad_norm": 1.958311960090176, + "learning_rate": 6.378075031351072e-08, + "loss": 0.2919, + "step": 6778 + }, + { + "epoch": 0.9507713884992988, + "grad_norm": 1.6491996124328188, + "learning_rate": 6.34196414006727e-08, + "loss": 0.3129, + "step": 6779 + }, + { + "epoch": 0.9509116409537167, + "grad_norm": 1.8340383243446128, + "learning_rate": 6.305955111670204e-08, + "loss": 0.3265, + "step": 6780 + }, + { + "epoch": 0.9510518934081347, + "grad_norm": 2.1330844580002353, + "learning_rate": 6.270047953590097e-08, + "loss": 0.301, + "step": 6781 + }, + { + "epoch": 0.9511921458625526, + "grad_norm": 1.9498437403157618, + "learning_rate": 6.234242673236079e-08, + "loss": 0.2775, + "step": 6782 + }, + { + "epoch": 0.9513323983169706, + "grad_norm": 2.071992012965888, + "learning_rate": 6.198539277996407e-08, + "loss": 0.3433, + "step": 6783 + }, + { + "epoch": 0.9514726507713885, + "grad_norm": 1.7451211494866188, + "learning_rate": 6.162937775238187e-08, + "loss": 0.3545, + "step": 6784 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 1.754293148467464, + "learning_rate": 6.127438172307487e-08, + "loss": 0.2823, + "step": 6785 + }, + { + "epoch": 0.9517531556802244, + "grad_norm": 2.6037076520108573, + "learning_rate": 6.092040476529504e-08, + "loss": 0.3298, + "step": 6786 + }, + { + "epoch": 0.9518934081346424, + "grad_norm": 2.2871887036906866, + "learning_rate": 6.056744695208283e-08, + "loss": 0.3013, + "step": 6787 + }, + { + "epoch": 0.9520336605890604, + "grad_norm": 1.8678150479122384, + "learning_rate": 6.021550835626777e-08, + "loss": 0.3013, + "step": 6788 + }, + { + "epoch": 0.9521739130434783, + "grad_norm": 1.8589458451162781, + "learning_rate": 5.986458905047066e-08, + "loss": 0.3179, + "step": 6789 + }, + { + "epoch": 0.9523141654978962, + "grad_norm": 2.5860782826988875, + "learning_rate": 5.9514689107101345e-08, + "loss": 0.3439, + "step": 6790 + }, + { + "epoch": 0.9524544179523141, + "grad_norm": 2.005740272678113, + "learning_rate": 5.9165808598358745e-08, + "loss": 0.3183, + "step": 6791 + }, + { + "epoch": 0.9525946704067321, + "grad_norm": 1.9104044934310975, + "learning_rate": 5.881794759623194e-08, + "loss": 0.3752, + "step": 6792 + }, + { + "epoch": 0.95273492286115, + "grad_norm": 1.8990240878384865, + "learning_rate": 5.8471106172499625e-08, + "loss": 0.3657, + "step": 6793 + }, + { + "epoch": 0.952875175315568, + "grad_norm": 1.7161652984055515, + "learning_rate": 5.8125284398730666e-08, + "loss": 0.3108, + "step": 6794 + }, + { + "epoch": 0.9530154277699859, + "grad_norm": 1.8565069566632817, + "learning_rate": 5.778048234628242e-08, + "loss": 0.3812, + "step": 6795 + }, + { + "epoch": 0.9531556802244039, + "grad_norm": 1.7967060838998146, + "learning_rate": 5.743670008630298e-08, + "loss": 0.3461, + "step": 6796 + }, + { + "epoch": 0.9532959326788218, + "grad_norm": 1.8520707989995513, + "learning_rate": 5.709393768972837e-08, + "loss": 0.2878, + "step": 6797 + }, + { + "epoch": 0.9534361851332398, + "grad_norm": 3.3518715029102, + "learning_rate": 5.675219522728648e-08, + "loss": 0.3628, + "step": 6798 + }, + { + "epoch": 0.9535764375876578, + "grad_norm": 2.1697678258071105, + "learning_rate": 5.6411472769492547e-08, + "loss": 0.323, + "step": 6799 + }, + { + "epoch": 0.9537166900420757, + "grad_norm": 1.7234031805251036, + "learning_rate": 5.607177038665257e-08, + "loss": 0.3337, + "step": 6800 + }, + { + "epoch": 0.9538569424964937, + "grad_norm": 1.8849138745128529, + "learning_rate": 5.573308814886158e-08, + "loss": 0.3142, + "step": 6801 + }, + { + "epoch": 0.9539971949509116, + "grad_norm": 2.0951984007489544, + "learning_rate": 5.539542612600479e-08, + "loss": 0.2973, + "step": 6802 + }, + { + "epoch": 0.9541374474053296, + "grad_norm": 1.8343039529744198, + "learning_rate": 5.5058784387755915e-08, + "loss": 0.3276, + "step": 6803 + }, + { + "epoch": 0.9542776998597475, + "grad_norm": 1.6309108337689948, + "learning_rate": 5.472316300357883e-08, + "loss": 0.3331, + "step": 6804 + }, + { + "epoch": 0.9544179523141655, + "grad_norm": 2.203490098502096, + "learning_rate": 5.438856204272647e-08, + "loss": 0.3487, + "step": 6805 + }, + { + "epoch": 0.9545582047685834, + "grad_norm": 1.9693165524016785, + "learning_rate": 5.405498157424194e-08, + "loss": 0.308, + "step": 6806 + }, + { + "epoch": 0.9546984572230014, + "grad_norm": 1.6907367746912394, + "learning_rate": 5.372242166695685e-08, + "loss": 0.3226, + "step": 6807 + }, + { + "epoch": 0.9548387096774194, + "grad_norm": 2.3080101760163916, + "learning_rate": 5.339088238949186e-08, + "loss": 0.3229, + "step": 6808 + }, + { + "epoch": 0.9549789621318373, + "grad_norm": 2.465026268345145, + "learning_rate": 5.3060363810259475e-08, + "loss": 0.3406, + "step": 6809 + }, + { + "epoch": 0.9551192145862553, + "grad_norm": 2.2878749227481587, + "learning_rate": 5.273086599745847e-08, + "loss": 0.3091, + "step": 6810 + }, + { + "epoch": 0.9552594670406732, + "grad_norm": 1.6893987922531368, + "learning_rate": 5.2402389019078904e-08, + "loss": 0.3075, + "step": 6811 + }, + { + "epoch": 0.9553997194950912, + "grad_norm": 2.1230025159593606, + "learning_rate": 5.207493294289989e-08, + "loss": 0.3184, + "step": 6812 + }, + { + "epoch": 0.9555399719495091, + "grad_norm": 1.8442982004141253, + "learning_rate": 5.174849783648905e-08, + "loss": 0.3197, + "step": 6813 + }, + { + "epoch": 0.9556802244039271, + "grad_norm": 1.830772829869067, + "learning_rate": 5.142308376720473e-08, + "loss": 0.3005, + "step": 6814 + }, + { + "epoch": 0.955820476858345, + "grad_norm": 1.9777450341229956, + "learning_rate": 5.109869080219376e-08, + "loss": 0.3054, + "step": 6815 + }, + { + "epoch": 0.955960729312763, + "grad_norm": 1.7234996711812538, + "learning_rate": 5.0775319008392054e-08, + "loss": 0.3606, + "step": 6816 + }, + { + "epoch": 0.956100981767181, + "grad_norm": 1.7733903786200882, + "learning_rate": 5.045296845252512e-08, + "loss": 0.3299, + "step": 6817 + }, + { + "epoch": 0.9562412342215989, + "grad_norm": 1.7217231697394833, + "learning_rate": 5.013163920110864e-08, + "loss": 0.3107, + "step": 6818 + }, + { + "epoch": 0.9563814866760169, + "grad_norm": 2.526474769404399, + "learning_rate": 4.9811331320445135e-08, + "loss": 0.2892, + "step": 6819 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 2.4136077551312853, + "learning_rate": 4.9492044876628396e-08, + "loss": 0.3401, + "step": 6820 + }, + { + "epoch": 0.9566619915848528, + "grad_norm": 1.9785941316444133, + "learning_rate": 4.917377993554184e-08, + "loss": 0.352, + "step": 6821 + }, + { + "epoch": 0.9568022440392707, + "grad_norm": 1.9318916612818058, + "learning_rate": 4.885653656285627e-08, + "loss": 0.3556, + "step": 6822 + }, + { + "epoch": 0.9569424964936887, + "grad_norm": 2.1368134807111447, + "learning_rate": 4.854031482403321e-08, + "loss": 0.2664, + "step": 6823 + }, + { + "epoch": 0.9570827489481066, + "grad_norm": 1.8959370539312435, + "learning_rate": 4.822511478432212e-08, + "loss": 0.3424, + "step": 6824 + }, + { + "epoch": 0.9572230014025246, + "grad_norm": 1.882591772793428, + "learning_rate": 4.791093650876322e-08, + "loss": 0.339, + "step": 6825 + }, + { + "epoch": 0.9573632538569425, + "grad_norm": 1.6518307874855795, + "learning_rate": 4.759778006218407e-08, + "loss": 0.3265, + "step": 6826 + }, + { + "epoch": 0.9575035063113605, + "grad_norm": 1.5995230738796147, + "learning_rate": 4.7285645509203e-08, + "loss": 0.3134, + "step": 6827 + }, + { + "epoch": 0.9576437587657785, + "grad_norm": 2.7837506314548004, + "learning_rate": 4.6974532914226825e-08, + "loss": 0.3061, + "step": 6828 + }, + { + "epoch": 0.9577840112201964, + "grad_norm": 3.8986960652134877, + "learning_rate": 4.666444234145084e-08, + "loss": 0.3608, + "step": 6829 + }, + { + "epoch": 0.9579242636746143, + "grad_norm": 1.789151718380436, + "learning_rate": 4.635537385486111e-08, + "loss": 0.3235, + "step": 6830 + }, + { + "epoch": 0.9580645161290322, + "grad_norm": 2.003017771405416, + "learning_rate": 4.604732751823049e-08, + "loss": 0.3241, + "step": 6831 + }, + { + "epoch": 0.9582047685834502, + "grad_norm": 1.6175367927199098, + "learning_rate": 4.5740303395122585e-08, + "loss": 0.3639, + "step": 6832 + }, + { + "epoch": 0.9583450210378681, + "grad_norm": 1.8295555712195632, + "learning_rate": 4.543430154889006e-08, + "loss": 0.3156, + "step": 6833 + }, + { + "epoch": 0.9584852734922861, + "grad_norm": 1.8207387138707252, + "learning_rate": 4.512932204267406e-08, + "loss": 0.3598, + "step": 6834 + }, + { + "epoch": 0.958625525946704, + "grad_norm": 2.015989048928081, + "learning_rate": 4.482536493940537e-08, + "loss": 0.351, + "step": 6835 + }, + { + "epoch": 0.958765778401122, + "grad_norm": 2.0607370876364484, + "learning_rate": 4.45224303018027e-08, + "loss": 0.3272, + "step": 6836 + }, + { + "epoch": 0.9589060308555399, + "grad_norm": 2.244964474947079, + "learning_rate": 4.422051819237494e-08, + "loss": 0.3191, + "step": 6837 + }, + { + "epoch": 0.9590462833099579, + "grad_norm": 2.6168137012970836, + "learning_rate": 4.3919628673418926e-08, + "loss": 0.3311, + "step": 6838 + }, + { + "epoch": 0.9591865357643758, + "grad_norm": 2.081305686961756, + "learning_rate": 4.361976180702221e-08, + "loss": 0.2902, + "step": 6839 + }, + { + "epoch": 0.9593267882187938, + "grad_norm": 1.8400436908261635, + "learning_rate": 4.3320917655059744e-08, + "loss": 0.335, + "step": 6840 + }, + { + "epoch": 0.9594670406732118, + "grad_norm": 2.861753547992175, + "learning_rate": 4.3023096279195544e-08, + "loss": 0.315, + "step": 6841 + }, + { + "epoch": 0.9596072931276297, + "grad_norm": 1.7847379031484063, + "learning_rate": 4.2726297740883215e-08, + "loss": 0.3287, + "step": 6842 + }, + { + "epoch": 0.9597475455820477, + "grad_norm": 2.209708475416086, + "learning_rate": 4.2430522101364894e-08, + "loss": 0.3106, + "step": 6843 + }, + { + "epoch": 0.9598877980364656, + "grad_norm": 1.878955356646157, + "learning_rate": 4.21357694216723e-08, + "loss": 0.3633, + "step": 6844 + }, + { + "epoch": 0.9600280504908836, + "grad_norm": 1.6570793090930034, + "learning_rate": 4.184203976262513e-08, + "loss": 0.3415, + "step": 6845 + }, + { + "epoch": 0.9601683029453015, + "grad_norm": 2.73784500108004, + "learning_rate": 4.1549333184832675e-08, + "loss": 0.3163, + "step": 6846 + }, + { + "epoch": 0.9603085553997195, + "grad_norm": 1.6604915504688564, + "learning_rate": 4.1257649748693284e-08, + "loss": 0.3373, + "step": 6847 + }, + { + "epoch": 0.9604488078541374, + "grad_norm": 1.9855255035745194, + "learning_rate": 4.0966989514392705e-08, + "loss": 0.2942, + "step": 6848 + }, + { + "epoch": 0.9605890603085554, + "grad_norm": 1.9093395273562788, + "learning_rate": 4.0677352541907963e-08, + "loss": 0.3283, + "step": 6849 + }, + { + "epoch": 0.9607293127629734, + "grad_norm": 2.351842413683376, + "learning_rate": 4.038873889100237e-08, + "loss": 0.3659, + "step": 6850 + }, + { + "epoch": 0.9608695652173913, + "grad_norm": 3.192796980771688, + "learning_rate": 4.010114862123049e-08, + "loss": 0.3652, + "step": 6851 + }, + { + "epoch": 0.9610098176718093, + "grad_norm": 2.9252670101006677, + "learning_rate": 3.981458179193321e-08, + "loss": 0.2928, + "step": 6852 + }, + { + "epoch": 0.9611500701262272, + "grad_norm": 2.4607695022642666, + "learning_rate": 3.952903846224265e-08, + "loss": 0.3353, + "step": 6853 + }, + { + "epoch": 0.9612903225806452, + "grad_norm": 1.8055313581491765, + "learning_rate": 3.9244518691078925e-08, + "loss": 0.2715, + "step": 6854 + }, + { + "epoch": 0.9614305750350631, + "grad_norm": 2.114058318207997, + "learning_rate": 3.8961022537149505e-08, + "loss": 0.3114, + "step": 6855 + }, + { + "epoch": 0.9615708274894811, + "grad_norm": 2.245820189692927, + "learning_rate": 3.86785500589526e-08, + "loss": 0.3534, + "step": 6856 + }, + { + "epoch": 0.961711079943899, + "grad_norm": 2.5238872871260427, + "learning_rate": 3.839710131477492e-08, + "loss": 0.3688, + "step": 6857 + }, + { + "epoch": 0.961851332398317, + "grad_norm": 1.7384740829664376, + "learning_rate": 3.811667636269001e-08, + "loss": 0.3305, + "step": 6858 + }, + { + "epoch": 0.961991584852735, + "grad_norm": 1.9439769281226988, + "learning_rate": 3.7837275260563244e-08, + "loss": 0.3543, + "step": 6859 + }, + { + "epoch": 0.9621318373071529, + "grad_norm": 2.030804277080956, + "learning_rate": 3.755889806604629e-08, + "loss": 0.3141, + "step": 6860 + }, + { + "epoch": 0.9622720897615709, + "grad_norm": 2.164263113480276, + "learning_rate": 3.728154483657986e-08, + "loss": 0.3333, + "step": 6861 + }, + { + "epoch": 0.9624123422159888, + "grad_norm": 2.285130792459692, + "learning_rate": 3.700521562939485e-08, + "loss": 0.3338, + "step": 6862 + }, + { + "epoch": 0.9625525946704068, + "grad_norm": 1.7627436665589045, + "learning_rate": 3.672991050150898e-08, + "loss": 0.3134, + "step": 6863 + }, + { + "epoch": 0.9626928471248247, + "grad_norm": 2.022295655441503, + "learning_rate": 3.645562950973014e-08, + "loss": 0.3168, + "step": 6864 + }, + { + "epoch": 0.9628330995792427, + "grad_norm": 2.4641930738454705, + "learning_rate": 3.618237271065417e-08, + "loss": 0.3211, + "step": 6865 + }, + { + "epoch": 0.9629733520336606, + "grad_norm": 2.177571462516267, + "learning_rate": 3.591014016066541e-08, + "loss": 0.3516, + "step": 6866 + }, + { + "epoch": 0.9631136044880786, + "grad_norm": 1.61777644098987, + "learning_rate": 3.563893191593726e-08, + "loss": 0.3126, + "step": 6867 + }, + { + "epoch": 0.9632538569424965, + "grad_norm": 1.372134430496963, + "learning_rate": 3.5368748032431624e-08, + "loss": 0.2821, + "step": 6868 + }, + { + "epoch": 0.9633941093969145, + "grad_norm": 2.1865836267675984, + "learning_rate": 3.509958856590001e-08, + "loss": 0.3502, + "step": 6869 + }, + { + "epoch": 0.9635343618513323, + "grad_norm": 1.674688307385927, + "learning_rate": 3.483145357187967e-08, + "loss": 0.3405, + "step": 6870 + }, + { + "epoch": 0.9636746143057503, + "grad_norm": 4.775680589356111, + "learning_rate": 3.456434310570023e-08, + "loss": 0.3143, + "step": 6871 + }, + { + "epoch": 0.9638148667601683, + "grad_norm": 1.509979509252502, + "learning_rate": 3.429825722247704e-08, + "loss": 0.3187, + "step": 6872 + }, + { + "epoch": 0.9639551192145862, + "grad_norm": 1.7459413602754554, + "learning_rate": 3.403319597711563e-08, + "loss": 0.3514, + "step": 6873 + }, + { + "epoch": 0.9640953716690042, + "grad_norm": 2.086018192859291, + "learning_rate": 3.3769159424308917e-08, + "loss": 0.3305, + "step": 6874 + }, + { + "epoch": 0.9642356241234221, + "grad_norm": 1.8846004117194253, + "learning_rate": 3.3506147618538874e-08, + "loss": 0.3638, + "step": 6875 + }, + { + "epoch": 0.9643758765778401, + "grad_norm": 2.634913028244453, + "learning_rate": 3.324416061407709e-08, + "loss": 0.3372, + "step": 6876 + }, + { + "epoch": 0.964516129032258, + "grad_norm": 2.1519258789287607, + "learning_rate": 3.298319846498254e-08, + "loss": 0.3397, + "step": 6877 + }, + { + "epoch": 0.964656381486676, + "grad_norm": 1.7700900443928804, + "learning_rate": 3.2723261225102164e-08, + "loss": 0.3803, + "step": 6878 + }, + { + "epoch": 0.9647966339410939, + "grad_norm": 2.3261941160841126, + "learning_rate": 3.246434894807304e-08, + "loss": 0.335, + "step": 6879 + }, + { + "epoch": 0.9649368863955119, + "grad_norm": 2.317850161438424, + "learning_rate": 3.2206461687319666e-08, + "loss": 0.2882, + "step": 6880 + }, + { + "epoch": 0.9650771388499298, + "grad_norm": 1.9930923501346247, + "learning_rate": 3.1949599496054475e-08, + "loss": 0.3407, + "step": 6881 + }, + { + "epoch": 0.9652173913043478, + "grad_norm": 2.071194097562228, + "learning_rate": 3.169376242728062e-08, + "loss": 0.3636, + "step": 6882 + }, + { + "epoch": 0.9653576437587658, + "grad_norm": 2.075505606397217, + "learning_rate": 3.143895053378698e-08, + "loss": 0.2992, + "step": 6883 + }, + { + "epoch": 0.9654978962131837, + "grad_norm": 1.908366919587051, + "learning_rate": 3.118516386815318e-08, + "loss": 0.3265, + "step": 6884 + }, + { + "epoch": 0.9656381486676017, + "grad_norm": 2.0316261530094657, + "learning_rate": 3.093240248274565e-08, + "loss": 0.3436, + "step": 6885 + }, + { + "epoch": 0.9657784011220196, + "grad_norm": 1.9161664959753528, + "learning_rate": 3.068066642972045e-08, + "loss": 0.3113, + "step": 6886 + }, + { + "epoch": 0.9659186535764376, + "grad_norm": 1.8475174640041743, + "learning_rate": 3.042995576102104e-08, + "loss": 0.3669, + "step": 6887 + }, + { + "epoch": 0.9660589060308555, + "grad_norm": 3.1987404192072564, + "learning_rate": 3.018027052838046e-08, + "loss": 0.3377, + "step": 6888 + }, + { + "epoch": 0.9661991584852735, + "grad_norm": 1.7068394044665627, + "learning_rate": 2.993161078331919e-08, + "loss": 0.323, + "step": 6889 + }, + { + "epoch": 0.9663394109396914, + "grad_norm": 2.2114007147818984, + "learning_rate": 2.9683976577146166e-08, + "loss": 0.3307, + "step": 6890 + }, + { + "epoch": 0.9664796633941094, + "grad_norm": 2.061199471783769, + "learning_rate": 2.9437367960959417e-08, + "loss": 0.3147, + "step": 6891 + }, + { + "epoch": 0.9666199158485274, + "grad_norm": 2.848375352609453, + "learning_rate": 2.9191784985644345e-08, + "loss": 0.3783, + "step": 6892 + }, + { + "epoch": 0.9667601683029453, + "grad_norm": 1.8116456353032202, + "learning_rate": 2.894722770187597e-08, + "loss": 0.3364, + "step": 6893 + }, + { + "epoch": 0.9669004207573633, + "grad_norm": 1.7995207174962906, + "learning_rate": 2.8703696160116146e-08, + "loss": 0.3783, + "step": 6894 + }, + { + "epoch": 0.9670406732117812, + "grad_norm": 3.52820949132285, + "learning_rate": 2.8461190410616347e-08, + "loss": 0.3397, + "step": 6895 + }, + { + "epoch": 0.9671809256661992, + "grad_norm": 1.8297594377170325, + "learning_rate": 2.8219710503416543e-08, + "loss": 0.349, + "step": 6896 + }, + { + "epoch": 0.9673211781206171, + "grad_norm": 1.8074482200455293, + "learning_rate": 2.7979256488343542e-08, + "loss": 0.3322, + "step": 6897 + }, + { + "epoch": 0.9674614305750351, + "grad_norm": 1.6312315373453812, + "learning_rate": 2.773982841501377e-08, + "loss": 0.2954, + "step": 6898 + }, + { + "epoch": 0.967601683029453, + "grad_norm": 3.3095476921346325, + "learning_rate": 2.7501426332831594e-08, + "loss": 0.3172, + "step": 6899 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 1.6852966335008814, + "learning_rate": 2.726405029098933e-08, + "loss": 0.309, + "step": 6900 + }, + { + "epoch": 0.967882187938289, + "grad_norm": 1.7513922874921226, + "learning_rate": 2.70277003384678e-08, + "loss": 0.3215, + "step": 6901 + }, + { + "epoch": 0.9680224403927069, + "grad_norm": 1.7986701291227465, + "learning_rate": 2.6792376524036878e-08, + "loss": 0.2972, + "step": 6902 + }, + { + "epoch": 0.9681626928471249, + "grad_norm": 1.6948014996631606, + "learning_rate": 2.6558078896252725e-08, + "loss": 0.3355, + "step": 6903 + }, + { + "epoch": 0.9683029453015428, + "grad_norm": 1.9089437119391899, + "learning_rate": 2.6324807503462223e-08, + "loss": 0.3451, + "step": 6904 + }, + { + "epoch": 0.9684431977559608, + "grad_norm": 1.9048835985159995, + "learning_rate": 2.6092562393799094e-08, + "loss": 0.2934, + "step": 6905 + }, + { + "epoch": 0.9685834502103787, + "grad_norm": 1.4797856105999352, + "learning_rate": 2.5861343615184997e-08, + "loss": 0.3346, + "step": 6906 + }, + { + "epoch": 0.9687237026647967, + "grad_norm": 2.089001409169512, + "learning_rate": 2.5631151215330107e-08, + "loss": 0.3516, + "step": 6907 + }, + { + "epoch": 0.9688639551192146, + "grad_norm": 2.568520799792851, + "learning_rate": 2.5401985241734207e-08, + "loss": 0.321, + "step": 6908 + }, + { + "epoch": 0.9690042075736326, + "grad_norm": 2.6566199549928315, + "learning_rate": 2.5173845741682802e-08, + "loss": 0.2762, + "step": 6909 + }, + { + "epoch": 0.9691444600280504, + "grad_norm": 1.769372928770437, + "learning_rate": 2.4946732762252125e-08, + "loss": 0.3486, + "step": 6910 + }, + { + "epoch": 0.9692847124824684, + "grad_norm": 1.7618534011301776, + "learning_rate": 2.4720646350304134e-08, + "loss": 0.3407, + "step": 6911 + }, + { + "epoch": 0.9694249649368863, + "grad_norm": 2.016468078776808, + "learning_rate": 2.4495586552490958e-08, + "loss": 0.3668, + "step": 6912 + }, + { + "epoch": 0.9695652173913043, + "grad_norm": 2.3641300411119186, + "learning_rate": 2.427155341525156e-08, + "loss": 0.3168, + "step": 6913 + }, + { + "epoch": 0.9697054698457223, + "grad_norm": 1.8476393458941975, + "learning_rate": 2.4048546984813957e-08, + "loss": 0.3249, + "step": 6914 + }, + { + "epoch": 0.9698457223001402, + "grad_norm": 2.1510117523354686, + "learning_rate": 2.3826567307194127e-08, + "loss": 0.327, + "step": 6915 + }, + { + "epoch": 0.9699859747545582, + "grad_norm": 1.711104223759044, + "learning_rate": 2.360561442819598e-08, + "loss": 0.3117, + "step": 6916 + }, + { + "epoch": 0.9701262272089761, + "grad_norm": 1.9177661174941776, + "learning_rate": 2.338568839341082e-08, + "loss": 0.3283, + "step": 6917 + }, + { + "epoch": 0.9702664796633941, + "grad_norm": 2.1546403709607382, + "learning_rate": 2.3166789248220134e-08, + "loss": 0.3457, + "step": 6918 + }, + { + "epoch": 0.970406732117812, + "grad_norm": 1.6086490067657344, + "learning_rate": 2.294891703779112e-08, + "loss": 0.335, + "step": 6919 + }, + { + "epoch": 0.97054698457223, + "grad_norm": 2.495590135276828, + "learning_rate": 2.2732071807081147e-08, + "loss": 0.3171, + "step": 6920 + }, + { + "epoch": 0.9706872370266479, + "grad_norm": 2.228933256769876, + "learning_rate": 2.251625360083387e-08, + "loss": 0.3326, + "step": 6921 + }, + { + "epoch": 0.9708274894810659, + "grad_norm": 1.739643720237338, + "learning_rate": 2.230146246358256e-08, + "loss": 0.3429, + "step": 6922 + }, + { + "epoch": 0.9709677419354839, + "grad_norm": 2.144460986155397, + "learning_rate": 2.2087698439646756e-08, + "loss": 0.3543, + "step": 6923 + }, + { + "epoch": 0.9711079943899018, + "grad_norm": 1.7617003446239223, + "learning_rate": 2.1874961573136734e-08, + "loss": 0.3433, + "step": 6924 + }, + { + "epoch": 0.9712482468443198, + "grad_norm": 2.106882388662975, + "learning_rate": 2.1663251907947935e-08, + "loss": 0.3186, + "step": 6925 + }, + { + "epoch": 0.9713884992987377, + "grad_norm": 2.9334674508327536, + "learning_rate": 2.1452569487765973e-08, + "loss": 0.3249, + "step": 6926 + }, + { + "epoch": 0.9715287517531557, + "grad_norm": 1.9021588809444159, + "learning_rate": 2.1242914356063292e-08, + "loss": 0.3506, + "step": 6927 + }, + { + "epoch": 0.9716690042075736, + "grad_norm": 2.178758867763347, + "learning_rate": 2.1034286556100847e-08, + "loss": 0.3197, + "step": 6928 + }, + { + "epoch": 0.9718092566619916, + "grad_norm": 1.7642679316174432, + "learning_rate": 2.082668613092753e-08, + "loss": 0.3546, + "step": 6929 + }, + { + "epoch": 0.9719495091164095, + "grad_norm": 2.024741205703396, + "learning_rate": 2.0620113123380746e-08, + "loss": 0.2879, + "step": 6930 + }, + { + "epoch": 0.9720897615708275, + "grad_norm": 2.2417654244981438, + "learning_rate": 2.0414567576084176e-08, + "loss": 0.3765, + "step": 6931 + }, + { + "epoch": 0.9722300140252454, + "grad_norm": 1.6519353555773142, + "learning_rate": 2.021004953145167e-08, + "loss": 0.2957, + "step": 6932 + }, + { + "epoch": 0.9723702664796634, + "grad_norm": 2.480118950636816, + "learning_rate": 2.000655903168447e-08, + "loss": 0.3146, + "step": 6933 + }, + { + "epoch": 0.9725105189340814, + "grad_norm": 1.548173147784194, + "learning_rate": 1.98040961187701e-08, + "loss": 0.3555, + "step": 6934 + }, + { + "epoch": 0.9726507713884993, + "grad_norm": 3.8052676030821924, + "learning_rate": 1.9602660834486253e-08, + "loss": 0.3122, + "step": 6935 + }, + { + "epoch": 0.9727910238429173, + "grad_norm": 2.0996884069428017, + "learning_rate": 1.9402253220398014e-08, + "loss": 0.3338, + "step": 6936 + }, + { + "epoch": 0.9729312762973352, + "grad_norm": 3.0433162936359985, + "learning_rate": 1.9202873317856752e-08, + "loss": 0.3611, + "step": 6937 + }, + { + "epoch": 0.9730715287517532, + "grad_norm": 1.726733445210379, + "learning_rate": 1.900452116800455e-08, + "loss": 0.3212, + "step": 6938 + }, + { + "epoch": 0.9732117812061711, + "grad_norm": 2.168199791496611, + "learning_rate": 1.8807196811769236e-08, + "loss": 0.345, + "step": 6939 + }, + { + "epoch": 0.9733520336605891, + "grad_norm": 2.0695186604526943, + "learning_rate": 1.8610900289867673e-08, + "loss": 0.3747, + "step": 6940 + }, + { + "epoch": 0.973492286115007, + "grad_norm": 2.525995900372027, + "learning_rate": 1.841563164280413e-08, + "loss": 0.3202, + "step": 6941 + }, + { + "epoch": 0.973632538569425, + "grad_norm": 2.2008566575678308, + "learning_rate": 1.822139091087083e-08, + "loss": 0.3425, + "step": 6942 + }, + { + "epoch": 0.973772791023843, + "grad_norm": 1.9939880611632366, + "learning_rate": 1.802817813414792e-08, + "loss": 0.301, + "step": 6943 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 2.395165812062486, + "learning_rate": 1.7835993352503524e-08, + "loss": 0.3518, + "step": 6944 + }, + { + "epoch": 0.9740532959326789, + "grad_norm": 1.9435400797345095, + "learning_rate": 1.764483660559424e-08, + "loss": 0.3246, + "step": 6945 + }, + { + "epoch": 0.9741935483870968, + "grad_norm": 1.6021597888232368, + "learning_rate": 1.745470793286297e-08, + "loss": 0.3261, + "step": 6946 + }, + { + "epoch": 0.9743338008415148, + "grad_norm": 1.76142248128104, + "learning_rate": 1.726560737354166e-08, + "loss": 0.3349, + "step": 6947 + }, + { + "epoch": 0.9744740532959327, + "grad_norm": 2.1330115815754516, + "learning_rate": 1.7077534966650767e-08, + "loss": 0.3714, + "step": 6948 + }, + { + "epoch": 0.9746143057503507, + "grad_norm": 2.1109746448289806, + "learning_rate": 1.6890490750997025e-08, + "loss": 0.3255, + "step": 6949 + }, + { + "epoch": 0.9747545582047685, + "grad_norm": 2.1459100456475872, + "learning_rate": 1.6704474765175115e-08, + "loss": 0.3629, + "step": 6950 + }, + { + "epoch": 0.9748948106591865, + "grad_norm": 1.7336263329502652, + "learning_rate": 1.6519487047569338e-08, + "loss": 0.3143, + "step": 6951 + }, + { + "epoch": 0.9750350631136044, + "grad_norm": 1.8197520952848103, + "learning_rate": 1.6335527636350267e-08, + "loss": 0.3112, + "step": 6952 + }, + { + "epoch": 0.9751753155680224, + "grad_norm": 2.1264879963399466, + "learning_rate": 1.6152596569475877e-08, + "loss": 0.3272, + "step": 6953 + }, + { + "epoch": 0.9753155680224403, + "grad_norm": 2.5653568253118935, + "learning_rate": 1.5970693884693745e-08, + "loss": 0.3567, + "step": 6954 + }, + { + "epoch": 0.9754558204768583, + "grad_norm": 1.9539150013626614, + "learning_rate": 1.5789819619537182e-08, + "loss": 0.331, + "step": 6955 + }, + { + "epoch": 0.9755960729312763, + "grad_norm": 1.639369593272163, + "learning_rate": 1.5609973811329116e-08, + "loss": 0.3282, + "step": 6956 + }, + { + "epoch": 0.9757363253856942, + "grad_norm": 2.0810067982670297, + "learning_rate": 1.5431156497179856e-08, + "loss": 0.3238, + "step": 6957 + }, + { + "epoch": 0.9758765778401122, + "grad_norm": 2.2223468692577684, + "learning_rate": 1.5253367713985444e-08, + "loss": 0.355, + "step": 6958 + }, + { + "epoch": 0.9760168302945301, + "grad_norm": 2.022645183958781, + "learning_rate": 1.5076607498433203e-08, + "loss": 0.3448, + "step": 6959 + }, + { + "epoch": 0.9761570827489481, + "grad_norm": 2.1896754755098566, + "learning_rate": 1.490087588699507e-08, + "loss": 0.3126, + "step": 6960 + }, + { + "epoch": 0.976297335203366, + "grad_norm": 2.0119698673289803, + "learning_rate": 1.4726172915933146e-08, + "loss": 0.3229, + "step": 6961 + }, + { + "epoch": 0.976437587657784, + "grad_norm": 1.5223322062116849, + "learning_rate": 1.4552498621295264e-08, + "loss": 0.3171, + "step": 6962 + }, + { + "epoch": 0.976577840112202, + "grad_norm": 2.180829258430162, + "learning_rate": 1.4379853038917757e-08, + "loss": 0.3321, + "step": 6963 + }, + { + "epoch": 0.9767180925666199, + "grad_norm": 1.7928534576647364, + "learning_rate": 1.4208236204426018e-08, + "loss": 0.3371, + "step": 6964 + }, + { + "epoch": 0.9768583450210379, + "grad_norm": 1.9880872230814195, + "learning_rate": 1.403764815323061e-08, + "loss": 0.2968, + "step": 6965 + }, + { + "epoch": 0.9769985974754558, + "grad_norm": 3.1878461462824963, + "learning_rate": 1.3868088920532263e-08, + "loss": 0.294, + "step": 6966 + }, + { + "epoch": 0.9771388499298738, + "grad_norm": 1.9792679677101683, + "learning_rate": 1.3699558541317437e-08, + "loss": 0.3215, + "step": 6967 + }, + { + "epoch": 0.9772791023842917, + "grad_norm": 2.201612345535296, + "learning_rate": 1.3532057050361646e-08, + "loss": 0.4144, + "step": 6968 + }, + { + "epoch": 0.9774193548387097, + "grad_norm": 3.0719044370589335, + "learning_rate": 1.3365584482228356e-08, + "loss": 0.3379, + "step": 6969 + }, + { + "epoch": 0.9775596072931276, + "grad_norm": 1.7739490325857143, + "learning_rate": 1.3200140871266754e-08, + "loss": 0.3445, + "step": 6970 + }, + { + "epoch": 0.9776998597475456, + "grad_norm": 1.971182637245946, + "learning_rate": 1.3035726251615644e-08, + "loss": 0.3734, + "step": 6971 + }, + { + "epoch": 0.9778401122019635, + "grad_norm": 2.864127290433025, + "learning_rate": 1.2872340657200666e-08, + "loss": 0.3259, + "step": 6972 + }, + { + "epoch": 0.9779803646563815, + "grad_norm": 2.7583103324953018, + "learning_rate": 1.2709984121735407e-08, + "loss": 0.3531, + "step": 6973 + }, + { + "epoch": 0.9781206171107995, + "grad_norm": 1.6661653082487407, + "learning_rate": 1.2548656678721404e-08, + "loss": 0.2938, + "step": 6974 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 1.6411081238529122, + "learning_rate": 1.2388358361446473e-08, + "loss": 0.3215, + "step": 6975 + }, + { + "epoch": 0.9784011220196354, + "grad_norm": 1.8096072691897056, + "learning_rate": 1.2229089202987487e-08, + "loss": 0.3257, + "step": 6976 + }, + { + "epoch": 0.9785413744740533, + "grad_norm": 1.7345780262379857, + "learning_rate": 1.2070849236208716e-08, + "loss": 0.361, + "step": 6977 + }, + { + "epoch": 0.9786816269284713, + "grad_norm": 2.003078832704314, + "learning_rate": 1.1913638493762369e-08, + "loss": 0.3515, + "step": 6978 + }, + { + "epoch": 0.9788218793828892, + "grad_norm": 2.1185393012054208, + "learning_rate": 1.1757457008086393e-08, + "loss": 0.3649, + "step": 6979 + }, + { + "epoch": 0.9789621318373072, + "grad_norm": 2.318011237649609, + "learning_rate": 1.1602304811408893e-08, + "loss": 0.3725, + "step": 6980 + }, + { + "epoch": 0.9791023842917251, + "grad_norm": 2.4755974944256867, + "learning_rate": 1.1448181935744262e-08, + "loss": 0.3353, + "step": 6981 + }, + { + "epoch": 0.9792426367461431, + "grad_norm": 1.932705021848148, + "learning_rate": 1.1295088412894285e-08, + "loss": 0.3427, + "step": 6982 + }, + { + "epoch": 0.979382889200561, + "grad_norm": 1.843193616808991, + "learning_rate": 1.1143024274448689e-08, + "loss": 0.3462, + "step": 6983 + }, + { + "epoch": 0.979523141654979, + "grad_norm": 1.7363907381145973, + "learning_rate": 1.0991989551785708e-08, + "loss": 0.3384, + "step": 6984 + }, + { + "epoch": 0.979663394109397, + "grad_norm": 1.7355447209782942, + "learning_rate": 1.0841984276069306e-08, + "loss": 0.3551, + "step": 6985 + }, + { + "epoch": 0.9798036465638149, + "grad_norm": 2.5687334491725586, + "learning_rate": 1.0693008478252498e-08, + "loss": 0.3355, + "step": 6986 + }, + { + "epoch": 0.9799438990182329, + "grad_norm": 2.0444300368856387, + "learning_rate": 1.0545062189075139e-08, + "loss": 0.3522, + "step": 6987 + }, + { + "epoch": 0.9800841514726508, + "grad_norm": 1.888648142463353, + "learning_rate": 1.0398145439065588e-08, + "loss": 0.3266, + "step": 6988 + }, + { + "epoch": 0.9802244039270688, + "grad_norm": 2.2163758806602636, + "learning_rate": 1.0252258258537929e-08, + "loss": 0.3318, + "step": 6989 + }, + { + "epoch": 0.9803646563814866, + "grad_norm": 1.4781148510961541, + "learning_rate": 1.0107400677596413e-08, + "loss": 0.3236, + "step": 6990 + }, + { + "epoch": 0.9805049088359046, + "grad_norm": 1.8701176974542386, + "learning_rate": 9.963572726129911e-09, + "loss": 0.3438, + "step": 6991 + }, + { + "epoch": 0.9806451612903225, + "grad_norm": 2.3022615757863583, + "learning_rate": 9.82077443381746e-09, + "loss": 0.3299, + "step": 6992 + }, + { + "epoch": 0.9807854137447405, + "grad_norm": 1.9215475904500783, + "learning_rate": 9.679005830124376e-09, + "loss": 0.3164, + "step": 6993 + }, + { + "epoch": 0.9809256661991584, + "grad_norm": 2.354190713765256, + "learning_rate": 9.53826694430282e-09, + "loss": 0.3163, + "step": 6994 + }, + { + "epoch": 0.9810659186535764, + "grad_norm": 1.8962011155206373, + "learning_rate": 9.398557805394003e-09, + "loss": 0.3212, + "step": 6995 + }, + { + "epoch": 0.9812061711079944, + "grad_norm": 1.737051333673081, + "learning_rate": 9.259878442225422e-09, + "loss": 0.2991, + "step": 6996 + }, + { + "epoch": 0.9813464235624123, + "grad_norm": 2.0739370036181923, + "learning_rate": 9.12222888341252e-09, + "loss": 0.337, + "step": 6997 + }, + { + "epoch": 0.9814866760168303, + "grad_norm": 3.0198056983336654, + "learning_rate": 8.985609157359243e-09, + "loss": 0.3078, + "step": 6998 + }, + { + "epoch": 0.9816269284712482, + "grad_norm": 2.4857989375102427, + "learning_rate": 8.850019292255263e-09, + "loss": 0.3402, + "step": 6999 + }, + { + "epoch": 0.9817671809256662, + "grad_norm": 2.239733160167448, + "learning_rate": 8.715459316078756e-09, + "loss": 0.3638, + "step": 7000 + }, + { + "epoch": 0.9819074333800841, + "grad_norm": 1.844160260607244, + "learning_rate": 8.581929256595844e-09, + "loss": 0.3495, + "step": 7001 + }, + { + "epoch": 0.9820476858345021, + "grad_norm": 1.7637742732031654, + "learning_rate": 8.449429141358378e-09, + "loss": 0.3513, + "step": 7002 + }, + { + "epoch": 0.98218793828892, + "grad_norm": 1.8230067276278605, + "learning_rate": 8.317958997708374e-09, + "loss": 0.3179, + "step": 7003 + }, + { + "epoch": 0.982328190743338, + "grad_norm": 3.797825195974302, + "learning_rate": 8.187518852771914e-09, + "loss": 0.3378, + "step": 7004 + }, + { + "epoch": 0.982468443197756, + "grad_norm": 2.6197214186687647, + "learning_rate": 8.058108733465797e-09, + "loss": 0.3196, + "step": 7005 + }, + { + "epoch": 0.9826086956521739, + "grad_norm": 2.2768127974066763, + "learning_rate": 7.929728666492553e-09, + "loss": 0.3388, + "step": 7006 + }, + { + "epoch": 0.9827489481065919, + "grad_norm": 2.1874182549597934, + "learning_rate": 7.802378678342105e-09, + "loss": 0.3082, + "step": 7007 + }, + { + "epoch": 0.9828892005610098, + "grad_norm": 1.8473112336641073, + "learning_rate": 7.676058795292873e-09, + "loss": 0.2986, + "step": 7008 + }, + { + "epoch": 0.9830294530154278, + "grad_norm": 1.7340333619301749, + "learning_rate": 7.550769043409567e-09, + "loss": 0.3024, + "step": 7009 + }, + { + "epoch": 0.9831697054698457, + "grad_norm": 2.7785773133276166, + "learning_rate": 7.426509448545394e-09, + "loss": 0.2922, + "step": 7010 + }, + { + "epoch": 0.9833099579242637, + "grad_norm": 1.9353620977959036, + "learning_rate": 7.3032800363398435e-09, + "loss": 0.3093, + "step": 7011 + }, + { + "epoch": 0.9834502103786816, + "grad_norm": 1.8281876642947563, + "learning_rate": 7.18108083222091e-09, + "loss": 0.3305, + "step": 7012 + }, + { + "epoch": 0.9835904628330996, + "grad_norm": 1.8850985376319274, + "learning_rate": 7.0599118614034235e-09, + "loss": 0.3445, + "step": 7013 + }, + { + "epoch": 0.9837307152875175, + "grad_norm": 2.069942340508495, + "learning_rate": 6.939773148889051e-09, + "loss": 0.3376, + "step": 7014 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 2.1912792801410412, + "learning_rate": 6.820664719469072e-09, + "loss": 0.3395, + "step": 7015 + }, + { + "epoch": 0.9840112201963535, + "grad_norm": 2.1815345075817105, + "learning_rate": 6.702586597719385e-09, + "loss": 0.3542, + "step": 7016 + }, + { + "epoch": 0.9841514726507714, + "grad_norm": 1.9681281061229745, + "learning_rate": 6.585538808004943e-09, + "loss": 0.3164, + "step": 7017 + }, + { + "epoch": 0.9842917251051894, + "grad_norm": 2.2809307972445914, + "learning_rate": 6.469521374477539e-09, + "loss": 0.3031, + "step": 7018 + }, + { + "epoch": 0.9844319775596073, + "grad_norm": 2.81271598834278, + "learning_rate": 6.354534321077465e-09, + "loss": 0.3586, + "step": 7019 + }, + { + "epoch": 0.9845722300140253, + "grad_norm": 1.766811343632893, + "learning_rate": 6.24057767153019e-09, + "loss": 0.3094, + "step": 7020 + }, + { + "epoch": 0.9847124824684432, + "grad_norm": 1.9827992347892807, + "learning_rate": 6.1276514493513466e-09, + "loss": 0.3841, + "step": 7021 + }, + { + "epoch": 0.9848527349228612, + "grad_norm": 2.0834922602752113, + "learning_rate": 6.0157556778411844e-09, + "loss": 0.3589, + "step": 7022 + }, + { + "epoch": 0.9849929873772791, + "grad_norm": 1.6604575927106815, + "learning_rate": 5.904890380089012e-09, + "loss": 0.3535, + "step": 7023 + }, + { + "epoch": 0.9851332398316971, + "grad_norm": 1.849249638636689, + "learning_rate": 5.795055578971531e-09, + "loss": 0.2979, + "step": 7024 + }, + { + "epoch": 0.985273492286115, + "grad_norm": 2.1086277521196757, + "learning_rate": 5.686251297151724e-09, + "loss": 0.3452, + "step": 7025 + }, + { + "epoch": 0.985413744740533, + "grad_norm": 2.017633191785424, + "learning_rate": 5.578477557081074e-09, + "loss": 0.3144, + "step": 7026 + }, + { + "epoch": 0.985553997194951, + "grad_norm": 1.7342644046197875, + "learning_rate": 5.471734380997906e-09, + "loss": 0.3361, + "step": 7027 + }, + { + "epoch": 0.9856942496493689, + "grad_norm": 1.4852054882766985, + "learning_rate": 5.366021790927378e-09, + "loss": 0.3106, + "step": 7028 + }, + { + "epoch": 0.9858345021037869, + "grad_norm": 2.3414887773634625, + "learning_rate": 5.261339808683707e-09, + "loss": 0.3364, + "step": 7029 + }, + { + "epoch": 0.9859747545582047, + "grad_norm": 1.5287347493693741, + "learning_rate": 5.157688455865728e-09, + "loss": 0.3157, + "step": 7030 + }, + { + "epoch": 0.9861150070126227, + "grad_norm": 1.6782467362843645, + "learning_rate": 5.055067753862442e-09, + "loss": 0.3145, + "step": 7031 + }, + { + "epoch": 0.9862552594670406, + "grad_norm": 2.2836597204711975, + "learning_rate": 4.9534777238485764e-09, + "loss": 0.3618, + "step": 7032 + }, + { + "epoch": 0.9863955119214586, + "grad_norm": 2.2912733838708577, + "learning_rate": 4.852918386786254e-09, + "loss": 0.3465, + "step": 7033 + }, + { + "epoch": 0.9865357643758765, + "grad_norm": 2.123225873965707, + "learning_rate": 4.753389763425542e-09, + "loss": 0.3562, + "step": 7034 + }, + { + "epoch": 0.9866760168302945, + "grad_norm": 2.6014407934181696, + "learning_rate": 4.654891874303347e-09, + "loss": 0.2982, + "step": 7035 + }, + { + "epoch": 0.9868162692847124, + "grad_norm": 2.0661023210297103, + "learning_rate": 4.55742473974341e-09, + "loss": 0.324, + "step": 7036 + }, + { + "epoch": 0.9869565217391304, + "grad_norm": 3.987655665309898, + "learning_rate": 4.460988379858533e-09, + "loss": 0.3653, + "step": 7037 + }, + { + "epoch": 0.9870967741935484, + "grad_norm": 2.324958747579836, + "learning_rate": 4.365582814546687e-09, + "loss": 0.315, + "step": 7038 + }, + { + "epoch": 0.9872370266479663, + "grad_norm": 2.3145150349380637, + "learning_rate": 4.2712080634949024e-09, + "loss": 0.3015, + "step": 7039 + }, + { + "epoch": 0.9873772791023843, + "grad_norm": 3.197096795137812, + "learning_rate": 4.1778641461764916e-09, + "loss": 0.3249, + "step": 7040 + }, + { + "epoch": 0.9875175315568022, + "grad_norm": 1.724405866390223, + "learning_rate": 4.085551081851602e-09, + "loss": 0.3078, + "step": 7041 + }, + { + "epoch": 0.9876577840112202, + "grad_norm": 2.0545719244764054, + "learning_rate": 3.994268889569442e-09, + "loss": 0.3362, + "step": 7042 + }, + { + "epoch": 0.9877980364656381, + "grad_norm": 3.784429553242738, + "learning_rate": 3.904017588164943e-09, + "loss": 0.3165, + "step": 7043 + }, + { + "epoch": 0.9879382889200561, + "grad_norm": 3.121939419224, + "learning_rate": 3.814797196261544e-09, + "loss": 0.3187, + "step": 7044 + }, + { + "epoch": 0.988078541374474, + "grad_norm": 1.918127671007153, + "learning_rate": 3.726607732267851e-09, + "loss": 0.317, + "step": 7045 + }, + { + "epoch": 0.988218793828892, + "grad_norm": 2.122370495479186, + "learning_rate": 3.6394492143820847e-09, + "loss": 0.4136, + "step": 7046 + }, + { + "epoch": 0.98835904628331, + "grad_norm": 1.8414429924349593, + "learning_rate": 3.553321660588749e-09, + "loss": 0.2722, + "step": 7047 + }, + { + "epoch": 0.9884992987377279, + "grad_norm": 1.6685789184882778, + "learning_rate": 3.468225088659738e-09, + "loss": 0.3365, + "step": 7048 + }, + { + "epoch": 0.9886395511921459, + "grad_norm": 2.0828852235155924, + "learning_rate": 3.3841595161537842e-09, + "loss": 0.3409, + "step": 7049 + }, + { + "epoch": 0.9887798036465638, + "grad_norm": 2.35572643989733, + "learning_rate": 3.3011249604170124e-09, + "loss": 0.3671, + "step": 7050 + }, + { + "epoch": 0.9889200561009818, + "grad_norm": 1.8969862300431353, + "learning_rate": 3.2191214385840498e-09, + "loss": 0.3206, + "step": 7051 + }, + { + "epoch": 0.9890603085553997, + "grad_norm": 1.9554634438601601, + "learning_rate": 3.1381489675746946e-09, + "loss": 0.3491, + "step": 7052 + }, + { + "epoch": 0.9892005610098177, + "grad_norm": 1.6366118985715643, + "learning_rate": 3.0582075640972487e-09, + "loss": 0.3415, + "step": 7053 + }, + { + "epoch": 0.9893408134642356, + "grad_norm": 2.0319114195071677, + "learning_rate": 2.9792972446479605e-09, + "loss": 0.3512, + "step": 7054 + }, + { + "epoch": 0.9894810659186536, + "grad_norm": 3.765417955966864, + "learning_rate": 2.9014180255082515e-09, + "loss": 0.3231, + "step": 7055 + }, + { + "epoch": 0.9896213183730715, + "grad_norm": 2.068452741120409, + "learning_rate": 2.824569922748599e-09, + "loss": 0.2907, + "step": 7056 + }, + { + "epoch": 0.9897615708274895, + "grad_norm": 2.6674607604554064, + "learning_rate": 2.7487529522257637e-09, + "loss": 0.3414, + "step": 7057 + }, + { + "epoch": 0.9899018232819075, + "grad_norm": 2.4476430104199265, + "learning_rate": 2.6739671295838986e-09, + "loss": 0.2938, + "step": 7058 + }, + { + "epoch": 0.9900420757363254, + "grad_norm": 1.8463221566492758, + "learning_rate": 2.6002124702556585e-09, + "loss": 0.3251, + "step": 7059 + }, + { + "epoch": 0.9901823281907434, + "grad_norm": 2.1455342599786627, + "learning_rate": 2.5274889894583156e-09, + "loss": 0.319, + "step": 7060 + }, + { + "epoch": 0.9903225806451613, + "grad_norm": 2.0769030197783556, + "learning_rate": 2.455796702198754e-09, + "loss": 0.3236, + "step": 7061 + }, + { + "epoch": 0.9904628330995793, + "grad_norm": 2.9287772023284573, + "learning_rate": 2.3851356232695855e-09, + "loss": 0.3464, + "step": 7062 + }, + { + "epoch": 0.9906030855539972, + "grad_norm": 3.632268130626663, + "learning_rate": 2.3155057672519244e-09, + "loss": 0.3418, + "step": 7063 + }, + { + "epoch": 0.9907433380084152, + "grad_norm": 2.0404546085273374, + "learning_rate": 2.246907148513167e-09, + "loss": 0.2938, + "step": 7064 + }, + { + "epoch": 0.9908835904628331, + "grad_norm": 2.1601198272669633, + "learning_rate": 2.179339781208101e-09, + "loss": 0.3437, + "step": 7065 + }, + { + "epoch": 0.9910238429172511, + "grad_norm": 2.5819283838600797, + "learning_rate": 2.1128036792783526e-09, + "loss": 0.3113, + "step": 7066 + }, + { + "epoch": 0.991164095371669, + "grad_norm": 2.5702515568798536, + "learning_rate": 2.0472988564540496e-09, + "loss": 0.3398, + "step": 7067 + }, + { + "epoch": 0.991304347826087, + "grad_norm": 2.073958385958656, + "learning_rate": 1.982825326250493e-09, + "loss": 0.3553, + "step": 7068 + }, + { + "epoch": 0.991444600280505, + "grad_norm": 2.0484930733621116, + "learning_rate": 1.919383101972594e-09, + "loss": 0.3446, + "step": 7069 + }, + { + "epoch": 0.9915848527349228, + "grad_norm": 2.0164375737272437, + "learning_rate": 1.856972196710438e-09, + "loss": 0.312, + "step": 7070 + }, + { + "epoch": 0.9917251051893408, + "grad_norm": 2.294530073768904, + "learning_rate": 1.7955926233420573e-09, + "loss": 0.2994, + "step": 7071 + }, + { + "epoch": 0.9918653576437587, + "grad_norm": 1.7331397666811066, + "learning_rate": 1.7352443945334308e-09, + "loss": 0.3197, + "step": 7072 + }, + { + "epoch": 0.9920056100981767, + "grad_norm": 2.1116657392965967, + "learning_rate": 1.6759275227357098e-09, + "loss": 0.35, + "step": 7073 + }, + { + "epoch": 0.9921458625525946, + "grad_norm": 2.626575814417055, + "learning_rate": 1.6176420201902132e-09, + "loss": 0.2825, + "step": 7074 + }, + { + "epoch": 0.9922861150070126, + "grad_norm": 3.197740597594557, + "learning_rate": 1.560387898922322e-09, + "loss": 0.329, + "step": 7075 + }, + { + "epoch": 0.9924263674614305, + "grad_norm": 1.8922180526119, + "learning_rate": 1.5041651707464744e-09, + "loss": 0.3594, + "step": 7076 + }, + { + "epoch": 0.9925666199158485, + "grad_norm": 2.3169031237492144, + "learning_rate": 1.4489738472639459e-09, + "loss": 0.3294, + "step": 7077 + }, + { + "epoch": 0.9927068723702664, + "grad_norm": 1.7995159478483842, + "learning_rate": 1.3948139398628492e-09, + "loss": 0.3397, + "step": 7078 + }, + { + "epoch": 0.9928471248246844, + "grad_norm": 1.9404430187364463, + "learning_rate": 1.3416854597192441e-09, + "loss": 0.3219, + "step": 7079 + }, + { + "epoch": 0.9929873772791024, + "grad_norm": 2.070890641011154, + "learning_rate": 1.2895884177954732e-09, + "loss": 0.3431, + "step": 7080 + }, + { + "epoch": 0.9931276297335203, + "grad_norm": 2.2443074163567185, + "learning_rate": 1.2385228248407155e-09, + "loss": 0.3546, + "step": 7081 + }, + { + "epoch": 0.9932678821879383, + "grad_norm": 1.982793944071344, + "learning_rate": 1.1884886913932082e-09, + "loss": 0.3221, + "step": 7082 + }, + { + "epoch": 0.9934081346423562, + "grad_norm": 2.742405287742413, + "learning_rate": 1.1394860277763597e-09, + "loss": 0.3109, + "step": 7083 + }, + { + "epoch": 0.9935483870967742, + "grad_norm": 2.510864777096434, + "learning_rate": 1.0915148441020817e-09, + "loss": 0.3595, + "step": 7084 + }, + { + "epoch": 0.9936886395511921, + "grad_norm": 1.7686502650957845, + "learning_rate": 1.0445751502685676e-09, + "loss": 0.2833, + "step": 7085 + }, + { + "epoch": 0.9938288920056101, + "grad_norm": 1.913805076260073, + "learning_rate": 9.986669559614027e-10, + "loss": 0.2992, + "step": 7086 + }, + { + "epoch": 0.993969144460028, + "grad_norm": 2.764108878049031, + "learning_rate": 9.537902706535652e-10, + "loss": 0.3131, + "step": 7087 + }, + { + "epoch": 0.994109396914446, + "grad_norm": 1.8324669683791863, + "learning_rate": 9.099451036048701e-10, + "loss": 0.2991, + "step": 7088 + }, + { + "epoch": 0.994249649368864, + "grad_norm": 1.903760566506771, + "learning_rate": 8.67131463862525e-10, + "loss": 0.3324, + "step": 7089 + }, + { + "epoch": 0.9943899018232819, + "grad_norm": 2.18812737323945, + "learning_rate": 8.253493602611295e-10, + "loss": 0.3727, + "step": 7090 + }, + { + "epoch": 0.9945301542776999, + "grad_norm": 2.7696960141949445, + "learning_rate": 7.845988014215655e-10, + "loss": 0.2828, + "step": 7091 + }, + { + "epoch": 0.9946704067321178, + "grad_norm": 2.1376590647266593, + "learning_rate": 7.448797957526621e-10, + "loss": 0.3226, + "step": 7092 + }, + { + "epoch": 0.9948106591865358, + "grad_norm": 1.8907727270499315, + "learning_rate": 7.061923514506409e-10, + "loss": 0.3257, + "step": 7093 + }, + { + "epoch": 0.9949509116409537, + "grad_norm": 1.7812308594863207, + "learning_rate": 6.685364764980051e-10, + "loss": 0.3105, + "step": 7094 + }, + { + "epoch": 0.9950911640953717, + "grad_norm": 1.9195794763589107, + "learning_rate": 6.319121786646509e-10, + "loss": 0.3249, + "step": 7095 + }, + { + "epoch": 0.9952314165497896, + "grad_norm": 1.9496716296208418, + "learning_rate": 5.963194655078663e-10, + "loss": 0.2845, + "step": 7096 + }, + { + "epoch": 0.9953716690042076, + "grad_norm": 2.1652600662108448, + "learning_rate": 5.617583443717767e-10, + "loss": 0.3012, + "step": 7097 + }, + { + "epoch": 0.9955119214586255, + "grad_norm": 1.7504815392783346, + "learning_rate": 5.282288223884546e-10, + "loss": 0.3593, + "step": 7098 + }, + { + "epoch": 0.9956521739130435, + "grad_norm": 1.8551711958163708, + "learning_rate": 4.957309064756998e-10, + "loss": 0.3236, + "step": 7099 + }, + { + "epoch": 0.9957924263674615, + "grad_norm": 2.1420189989460185, + "learning_rate": 4.642646033398146e-10, + "loss": 0.3509, + "step": 7100 + }, + { + "epoch": 0.9959326788218794, + "grad_norm": 1.6297799747084554, + "learning_rate": 4.3382991947338306e-10, + "loss": 0.298, + "step": 7101 + }, + { + "epoch": 0.9960729312762974, + "grad_norm": 1.8351976567806327, + "learning_rate": 4.0442686115582665e-10, + "loss": 0.3478, + "step": 7102 + }, + { + "epoch": 0.9962131837307153, + "grad_norm": 1.7828118687861776, + "learning_rate": 3.760554344556244e-10, + "loss": 0.2857, + "step": 7103 + }, + { + "epoch": 0.9963534361851333, + "grad_norm": 1.5356729429009237, + "learning_rate": 3.487156452258722e-10, + "loss": 0.3211, + "step": 7104 + }, + { + "epoch": 0.9964936886395512, + "grad_norm": 1.7639418149335162, + "learning_rate": 3.2240749910816825e-10, + "loss": 0.3706, + "step": 7105 + }, + { + "epoch": 0.9966339410939692, + "grad_norm": 1.8902128416641317, + "learning_rate": 2.971310015315032e-10, + "loss": 0.3144, + "step": 7106 + }, + { + "epoch": 0.9967741935483871, + "grad_norm": 2.4859166190602946, + "learning_rate": 2.7288615771114966e-10, + "loss": 0.3374, + "step": 7107 + }, + { + "epoch": 0.9969144460028051, + "grad_norm": 1.7699716453651817, + "learning_rate": 2.496729726497726e-10, + "loss": 0.3233, + "step": 7108 + }, + { + "epoch": 0.997054698457223, + "grad_norm": 3.8033556477201067, + "learning_rate": 2.274914511374293e-10, + "loss": 0.3465, + "step": 7109 + }, + { + "epoch": 0.9971949509116409, + "grad_norm": 2.006456444121876, + "learning_rate": 2.0634159775045904e-10, + "loss": 0.316, + "step": 7110 + }, + { + "epoch": 0.9973352033660589, + "grad_norm": 1.948212511032593, + "learning_rate": 1.8622341685425872e-10, + "loss": 0.3587, + "step": 7111 + }, + { + "epoch": 0.9974754558204768, + "grad_norm": 2.259944344907112, + "learning_rate": 1.6713691259939713e-10, + "loss": 0.3204, + "step": 7112 + }, + { + "epoch": 0.9976157082748948, + "grad_norm": 1.8447136381230682, + "learning_rate": 1.4908208892383536e-10, + "loss": 0.3639, + "step": 7113 + }, + { + "epoch": 0.9977559607293127, + "grad_norm": 1.8138665933814442, + "learning_rate": 1.3205894955348187e-10, + "loss": 0.3683, + "step": 7114 + }, + { + "epoch": 0.9978962131837307, + "grad_norm": 1.771872077414711, + "learning_rate": 1.160674980010823e-10, + "loss": 0.3283, + "step": 7115 + }, + { + "epoch": 0.9980364656381486, + "grad_norm": 1.932355881384, + "learning_rate": 1.011077375662195e-10, + "loss": 0.3214, + "step": 7116 + }, + { + "epoch": 0.9981767180925666, + "grad_norm": 1.8956369850290455, + "learning_rate": 8.717967133586857e-11, + "loss": 0.3423, + "step": 7117 + }, + { + "epoch": 0.9983169705469845, + "grad_norm": 1.566992899563982, + "learning_rate": 7.428330218384183e-11, + "loss": 0.2764, + "step": 7118 + }, + { + "epoch": 0.9984572230014025, + "grad_norm": 2.4810578855613885, + "learning_rate": 6.241863277078874e-11, + "loss": 0.3159, + "step": 7119 + }, + { + "epoch": 0.9985974754558204, + "grad_norm": 1.6707840684713395, + "learning_rate": 5.1585665545861305e-11, + "loss": 0.2777, + "step": 7120 + }, + { + "epoch": 0.9987377279102384, + "grad_norm": 3.090766624237503, + "learning_rate": 4.1784402743383357e-11, + "loss": 0.3819, + "step": 7121 + }, + { + "epoch": 0.9988779803646564, + "grad_norm": 2.2281827686412914, + "learning_rate": 3.301484638618124e-11, + "loss": 0.343, + "step": 7122 + }, + { + "epoch": 0.9990182328190743, + "grad_norm": 1.6322343158526365, + "learning_rate": 2.5276998284473608e-11, + "loss": 0.3062, + "step": 7123 + }, + { + "epoch": 0.9991584852734923, + "grad_norm": 1.5780935001062246, + "learning_rate": 1.857086003365094e-11, + "loss": 0.3153, + "step": 7124 + }, + { + "epoch": 0.9992987377279102, + "grad_norm": 2.536575836864684, + "learning_rate": 1.2896433018161348e-11, + "loss": 0.2997, + "step": 7125 + }, + { + "epoch": 0.9994389901823282, + "grad_norm": 2.1491291944065365, + "learning_rate": 8.253718408735013e-12, + "loss": 0.331, + "step": 7126 + }, + { + "epoch": 0.9995792426367461, + "grad_norm": 1.9257449133116333, + "learning_rate": 4.642717164049515e-12, + "loss": 0.2941, + "step": 7127 + }, + { + "epoch": 0.9997194950911641, + "grad_norm": 2.142527938714024, + "learning_rate": 2.063430027954283e-12, + "loss": 0.2826, + "step": 7128 + }, + { + "epoch": 0.999859747545582, + "grad_norm": 1.8741560626150489, + "learning_rate": 5.158575333563675e-13, + "loss": 0.3203, + "step": 7129 + }, + { + "epoch": 1.0, + "grad_norm": 2.348740008598381, + "learning_rate": 0.0, + "loss": 0.3445, + "step": 7130 + }, + { + "epoch": 1.0, + "step": 7130, + "total_flos": 2.3623676556921012e+19, + "train_loss": 0.0059655218313318815, + "train_runtime": 3517.247, + "train_samples_per_second": 583.832, + "train_steps_per_second": 2.027 + } + ], + "logging_steps": 1.0, + "max_steps": 7130, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.3623676556921012e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}