{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995795432403884,
  "eval_steps": 500,
  "global_step": 1523,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000656322746710695,
      "grad_norm": 1.0,
      "learning_rate": 1.3071895424836602e-06,
      "loss": 2.0939,
      "step": 1
    },
    {
      "epoch": 0.003281613733553475,
      "grad_norm": 0.9765625,
      "learning_rate": 6.535947712418301e-06,
      "loss": 2.0618,
      "step": 5
    },
    {
      "epoch": 0.00656322746710695,
      "grad_norm": 0.91796875,
      "learning_rate": 1.3071895424836602e-05,
      "loss": 2.0432,
      "step": 10
    },
    {
      "epoch": 0.009844841200660424,
      "grad_norm": 0.71484375,
      "learning_rate": 1.9607843137254903e-05,
      "loss": 2.001,
      "step": 15
    },
    {
      "epoch": 0.0131264549342139,
      "grad_norm": 0.578125,
      "learning_rate": 2.6143790849673204e-05,
      "loss": 1.9571,
      "step": 20
    },
    {
      "epoch": 0.016408068667767374,
      "grad_norm": 0.5625,
      "learning_rate": 3.2679738562091506e-05,
      "loss": 1.8829,
      "step": 25
    },
    {
      "epoch": 0.019689682401320848,
      "grad_norm": 0.90234375,
      "learning_rate": 3.9215686274509805e-05,
      "loss": 1.8114,
      "step": 30
    },
    {
      "epoch": 0.022971296134874326,
      "grad_norm": 0.490234375,
      "learning_rate": 4.5751633986928104e-05,
      "loss": 1.6748,
      "step": 35
    },
    {
      "epoch": 0.0262529098684278,
      "grad_norm": 0.3828125,
      "learning_rate": 5.228758169934641e-05,
      "loss": 1.665,
      "step": 40
    },
    {
      "epoch": 0.029534523601981274,
      "grad_norm": 0.2099609375,
      "learning_rate": 5.882352941176471e-05,
      "loss": 1.6295,
      "step": 45
    },
    {
      "epoch": 0.03281613733553475,
      "grad_norm": 0.2314453125,
      "learning_rate": 6.535947712418301e-05,
      "loss": 1.6194,
      "step": 50
    },
    {
      "epoch": 0.036097751069088226,
      "grad_norm": 0.18359375,
      "learning_rate": 7.189542483660131e-05,
      "loss": 1.5773,
      "step": 55
    },
    {
      "epoch": 0.039379364802641696,
      "grad_norm": 0.140625,
      "learning_rate": 7.843137254901961e-05,
      "loss": 1.542,
      "step": 60
    },
    {
      "epoch": 0.042660978536195174,
      "grad_norm": 0.126953125,
      "learning_rate": 8.496732026143791e-05,
      "loss": 1.5303,
      "step": 65
    },
    {
      "epoch": 0.04594259226974865,
      "grad_norm": 0.1123046875,
      "learning_rate": 9.150326797385621e-05,
      "loss": 1.5031,
      "step": 70
    },
    {
      "epoch": 0.04922420600330212,
      "grad_norm": 0.10595703125,
      "learning_rate": 9.80392156862745e-05,
      "loss": 1.5177,
      "step": 75
    },
    {
      "epoch": 0.0525058197368556,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00010457516339869282,
      "loss": 1.5038,
      "step": 80
    },
    {
      "epoch": 0.05578743347040908,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.00011111111111111112,
      "loss": 1.4847,
      "step": 85
    },
    {
      "epoch": 0.05906904720396255,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.00011764705882352942,
      "loss": 1.5054,
      "step": 90
    },
    {
      "epoch": 0.062350660937516025,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.00012418300653594771,
      "loss": 1.4672,
      "step": 95
    },
    {
      "epoch": 0.0656322746710695,
      "grad_norm": 0.10205078125,
      "learning_rate": 0.00013071895424836603,
      "loss": 1.4626,
      "step": 100
    },
    {
      "epoch": 0.06891388840462297,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.0001372549019607843,
      "loss": 1.451,
      "step": 105
    },
    {
      "epoch": 0.07219550213817645,
      "grad_norm": 0.111328125,
      "learning_rate": 0.00014379084967320262,
      "loss": 1.4631,
      "step": 110
    },
    {
      "epoch": 0.07547711587172992,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.0001503267973856209,
      "loss": 1.4423,
      "step": 115
    },
    {
      "epoch": 0.07875872960528339,
      "grad_norm": 0.1064453125,
      "learning_rate": 0.00015686274509803922,
      "loss": 1.4301,
      "step": 120
    },
    {
      "epoch": 0.08204034333883688,
      "grad_norm": 0.1123046875,
      "learning_rate": 0.00016339869281045753,
      "loss": 1.4287,
      "step": 125
    },
    {
      "epoch": 0.08532195707239035,
      "grad_norm": 0.162109375,
      "learning_rate": 0.00016993464052287582,
      "loss": 1.4294,
      "step": 130
    },
    {
      "epoch": 0.08860357080594382,
      "grad_norm": 0.1240234375,
      "learning_rate": 0.00017647058823529413,
      "loss": 1.4201,
      "step": 135
    },
    {
      "epoch": 0.0918851845394973,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.00018300653594771241,
      "loss": 1.4044,
      "step": 140
    },
    {
      "epoch": 0.09516679827305077,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00018954248366013073,
      "loss": 1.4114,
      "step": 145
    },
    {
      "epoch": 0.09844841200660424,
      "grad_norm": 0.1552734375,
      "learning_rate": 0.000196078431372549,
      "loss": 1.3928,
      "step": 150
    },
    {
      "epoch": 0.10173002574015773,
      "grad_norm": 0.162109375,
      "learning_rate": 0.0001999989483097553,
      "loss": 1.4054,
      "step": 155
    },
    {
      "epoch": 0.1050116394737112,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.00019998711704854725,
      "loss": 1.3996,
      "step": 160
    },
    {
      "epoch": 0.10829325320726467,
      "grad_norm": 0.15625,
      "learning_rate": 0.0001999621414738484,
      "loss": 1.4015,
      "step": 165
    },
    {
      "epoch": 0.11157486694081815,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.0001999240248689495,
      "loss": 1.387,
      "step": 170
    },
    {
      "epoch": 0.11485648067437162,
      "grad_norm": 0.140625,
      "learning_rate": 0.00019987277224466215,
      "loss": 1.4057,
      "step": 175
    },
    {
      "epoch": 0.1181380944079251,
      "grad_norm": 0.142578125,
      "learning_rate": 0.00019980839033865994,
      "loss": 1.3844,
      "step": 180
    },
    {
      "epoch": 0.12141970814147857,
      "grad_norm": 0.146484375,
      "learning_rate": 0.00019973088761459287,
      "loss": 1.3654,
      "step": 185
    },
    {
      "epoch": 0.12470132187503205,
      "grad_norm": 0.1591796875,
      "learning_rate": 0.00019964027426097448,
      "loss": 1.3632,
      "step": 190
    },
    {
      "epoch": 0.12798293560858554,
      "grad_norm": 0.154296875,
      "learning_rate": 0.00019953656218984263,
      "loss": 1.3982,
      "step": 195
    },
    {
      "epoch": 0.131264549342139,
      "grad_norm": 0.146484375,
      "learning_rate": 0.0001994197650351936,
      "loss": 1.3842,
      "step": 200
    },
    {
      "epoch": 0.13454616307569248,
      "grad_norm": 0.1640625,
      "learning_rate": 0.0001992898981511896,
      "loss": 1.3528,
      "step": 205
    },
    {
      "epoch": 0.13782777680924593,
      "grad_norm": 0.173828125,
      "learning_rate": 0.0001991469786101404,
      "loss": 1.3811,
      "step": 210
    },
    {
      "epoch": 0.14110939054279942,
      "grad_norm": 0.162109375,
      "learning_rate": 0.00019899102520025896,
      "loss": 1.3594,
      "step": 215
    },
    {
      "epoch": 0.1443910042763529,
      "grad_norm": 0.1865234375,
      "learning_rate": 0.0001988220584231916,
      "loss": 1.3675,
      "step": 220
    },
    {
      "epoch": 0.14767261800990636,
      "grad_norm": 0.15234375,
      "learning_rate": 0.00019864010049132287,
      "loss": 1.3532,
      "step": 225
    },
    {
      "epoch": 0.15095423174345984,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.0001984451753248553,
      "loss": 1.3672,
      "step": 230
    },
    {
      "epoch": 0.15423584547701333,
      "grad_norm": 0.1689453125,
      "learning_rate": 0.0001982373085486651,
      "loss": 1.3553,
      "step": 235
    },
    {
      "epoch": 0.15751745921056678,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.00019801652748893347,
      "loss": 1.3303,
      "step": 240
    },
    {
      "epoch": 0.16079907294412027,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.00019778286116955407,
      "loss": 1.35,
      "step": 245
    },
    {
      "epoch": 0.16408068667767375,
      "grad_norm": 0.1484375,
      "learning_rate": 0.00019753634030831782,
      "loss": 1.3451,
      "step": 250
    },
    {
      "epoch": 0.1673623004112272,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00019727699731287465,
      "loss": 1.3604,
      "step": 255
    },
    {
      "epoch": 0.1706439141447807,
      "grad_norm": 0.15625,
      "learning_rate": 0.00019700486627647305,
      "loss": 1.3418,
      "step": 260
    },
    {
      "epoch": 0.17392552787833418,
      "grad_norm": 0.171875,
      "learning_rate": 0.0001967199829734784,
      "loss": 1.3463,
      "step": 265
    },
    {
      "epoch": 0.17720714161188764,
      "grad_norm": 0.1533203125,
      "learning_rate": 0.00019642238485466989,
      "loss": 1.357,
      "step": 270
    },
    {
      "epoch": 0.18048875534544112,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00019611211104231724,
      "loss": 1.3678,
      "step": 275
    },
    {
      "epoch": 0.1837703690789946,
      "grad_norm": 0.134765625,
      "learning_rate": 0.0001957892023250379,
      "loss": 1.3296,
      "step": 280
    },
    {
      "epoch": 0.18705198281254806,
      "grad_norm": 0.154296875,
      "learning_rate": 0.00019545370115243462,
      "loss": 1.3445,
      "step": 285
    },
    {
      "epoch": 0.19033359654610155,
      "grad_norm": 0.146484375,
      "learning_rate": 0.00019510565162951537,
      "loss": 1.3407,
      "step": 290
    },
    {
      "epoch": 0.19361521027965503,
      "grad_norm": 0.1484375,
      "learning_rate": 0.00019474509951089507,
      "loss": 1.3419,
      "step": 295
    },
    {
      "epoch": 0.1968968240132085,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00019437209219478084,
      "loss": 1.349,
      "step": 300
    },
    {
      "epoch": 0.20017843774676197,
      "grad_norm": 0.1728515625,
      "learning_rate": 0.00019398667871674082,
      "loss": 1.3282,
      "step": 305
    },
    {
      "epoch": 0.20346005148031546,
      "grad_norm": 0.1669921875,
      "learning_rate": 0.00019358890974325817,
      "loss": 1.3115,
      "step": 310
    },
    {
      "epoch": 0.2067416652138689,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.00019317883756507026,
      "loss": 1.339,
      "step": 315
    },
    {
      "epoch": 0.2100232789474224,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.0001927565160902948,
      "loss": 1.3186,
      "step": 320
    },
    {
      "epoch": 0.21330489268097588,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.00019232200083734265,
      "loss": 1.3434,
      "step": 325
    },
    {
      "epoch": 0.21658650641452934,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00019187534892761986,
      "loss": 1.3186,
      "step": 330
    },
    {
      "epoch": 0.21986812014808282,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.0001914166190780181,
      "loss": 1.3394,
      "step": 335
    },
    {
      "epoch": 0.2231497338816363,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.00019094587159319585,
      "loss": 1.317,
      "step": 340
    },
    {
      "epoch": 0.22643134761518977,
      "grad_norm": 0.142578125,
      "learning_rate": 0.00019046316835765083,
      "loss": 1.3344,
      "step": 345
    },
    {
      "epoch": 0.22971296134874325,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.00018996857282758462,
      "loss": 1.3402,
      "step": 350
    },
    {
      "epoch": 0.2329945750822967,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00018946215002256061,
      "loss": 1.3396,
      "step": 355
    },
    {
      "epoch": 0.2362761888158502,
      "grad_norm": 0.1591796875,
      "learning_rate": 0.00018894396651695662,
      "loss": 1.3289,
      "step": 360
    },
    {
      "epoch": 0.23955780254940368,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.00018841409043121306,
      "loss": 1.3274,
      "step": 365
    },
    {
      "epoch": 0.24283941628295713,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.0001878725914228776,
      "loss": 1.3312,
      "step": 370
    },
    {
      "epoch": 0.24612103001651062,
      "grad_norm": 0.14453125,
      "learning_rate": 0.00018731954067744834,
      "loss": 1.3258,
      "step": 375
    },
    {
      "epoch": 0.2494026437500641,
      "grad_norm": 0.150390625,
      "learning_rate": 0.00018675501089901542,
      "loss": 1.3224,
      "step": 380
    },
    {
      "epoch": 0.25268425748361756,
      "grad_norm": 0.1474609375,
      "learning_rate": 0.00018617907630070352,
      "loss": 1.3219,
      "step": 385
    },
    {
      "epoch": 0.25596587121717107,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.0001855918125949157,
      "loss": 1.3191,
      "step": 390
    },
    {
      "epoch": 0.2592474849507245,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.00018499329698338035,
      "loss": 1.3183,
      "step": 395
    },
    {
      "epoch": 0.262529098684278,
      "grad_norm": 0.1474609375,
      "learning_rate": 0.0001843836081470022,
      "loss": 1.3154,
      "step": 400
    },
    {
      "epoch": 0.2658107124178315,
      "grad_norm": 0.142578125,
      "learning_rate": 0.0001837628262355188,
      "loss": 1.3131,
      "step": 405
    },
    {
      "epoch": 0.26909232615138495,
      "grad_norm": 0.14453125,
      "learning_rate": 0.00018313103285696425,
      "loss": 1.3211,
      "step": 410
    },
    {
      "epoch": 0.2723739398849384,
      "grad_norm": 0.150390625,
      "learning_rate": 0.00018248831106694086,
      "loss": 1.3082,
      "step": 415
    },
    {
      "epoch": 0.27565555361849187,
      "grad_norm": 0.15234375,
      "learning_rate": 0.00018183474535770068,
      "loss": 1.3046,
      "step": 420
    },
    {
      "epoch": 0.2789371673520454,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.00018117042164703814,
      "loss": 1.3026,
      "step": 425
    },
    {
      "epoch": 0.28221878108559884,
      "grad_norm": 0.1484375,
      "learning_rate": 0.00018049542726699533,
      "loss": 1.3265,
      "step": 430
    },
    {
      "epoch": 0.2855003948191523,
      "grad_norm": 0.142578125,
      "learning_rate": 0.00017980985095238124,
      "loss": 1.2958,
      "step": 435
    },
    {
      "epoch": 0.2887820085527058,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.00017911378282910675,
      "loss": 1.311,
      "step": 440
    },
    {
      "epoch": 0.29206362228625926,
      "grad_norm": 0.138671875,
      "learning_rate": 0.00017840731440233674,
      "loss": 1.3126,
      "step": 445
    },
    {
      "epoch": 0.2953452360198127,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.00017769053854446053,
      "loss": 1.303,
      "step": 450
    },
    {
      "epoch": 0.29862684975336623,
      "grad_norm": 0.1484375,
      "learning_rate": 0.00017696354948288327,
      "loss": 1.3106,
      "step": 455
    },
    {
      "epoch": 0.3019084634869197,
      "grad_norm": 0.14453125,
      "learning_rate": 0.00017622644278763843,
      "loss": 1.2993,
      "step": 460
    },
    {
      "epoch": 0.30519007722047314,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00017547931535882445,
      "loss": 1.3031,
      "step": 465
    },
    {
      "epoch": 0.30847169095402666,
      "grad_norm": 0.134765625,
      "learning_rate": 0.000174722265413866,
      "loss": 1.3175,
      "step": 470
    },
    {
      "epoch": 0.3117533046875801,
      "grad_norm": 0.1484375,
      "learning_rate": 0.0001739553924746025,
      "loss": 1.3086,
      "step": 475
    },
    {
      "epoch": 0.31503491842113357,
      "grad_norm": 0.1669921875,
      "learning_rate": 0.0001731787973542049,
      "loss": 1.3364,
      "step": 480
    },
    {
      "epoch": 0.3183165321546871,
      "grad_norm": 0.150390625,
      "learning_rate": 0.0001723925821439227,
      "loss": 1.3103,
      "step": 485
    },
    {
      "epoch": 0.32159814588824054,
      "grad_norm": 0.13671875,
      "learning_rate": 0.00017159685019966316,
      "loss": 1.3312,
      "step": 490
    },
    {
      "epoch": 0.324879759621794,
      "grad_norm": 0.142578125,
      "learning_rate": 0.00017079170612840404,
      "loss": 1.3064,
      "step": 495
    },
    {
      "epoch": 0.3281613733553475,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00016997725577444205,
      "loss": 1.3109,
      "step": 500
    },
    {
      "epoch": 0.33144298708890096,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.0001691536062054783,
      "loss": 1.3083,
      "step": 505
    },
    {
      "epoch": 0.3347246008224544,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.0001683208656985436,
      "loss": 1.2997,
      "step": 510
    },
    {
      "epoch": 0.33800621455600793,
      "grad_norm": 0.140625,
      "learning_rate": 0.00016747914372576393,
      "loss": 1.3161,
      "step": 515
    },
    {
      "epoch": 0.3412878282895614,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.00016662855093996945,
      "loss": 1.2811,
      "step": 520
    },
    {
      "epoch": 0.34456944202311485,
      "grad_norm": 0.14453125,
      "learning_rate": 0.00016576919916014808,
      "loss": 1.3146,
      "step": 525
    },
    {
      "epoch": 0.34785105575666836,
      "grad_norm": 0.13671875,
      "learning_rate": 0.00016490120135674566,
      "loss": 1.2922,
      "step": 530
    },
    {
      "epoch": 0.3511326694902218,
      "grad_norm": 0.146484375,
      "learning_rate": 0.00016402467163681493,
      "loss": 1.2962,
      "step": 535
    },
    {
      "epoch": 0.3544142832237753,
      "grad_norm": 0.138671875,
      "learning_rate": 0.00016313972522901491,
      "loss": 1.3008,
      "step": 540
    },
    {
      "epoch": 0.3576958969573288,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00016224647846846315,
      "loss": 1.3042,
      "step": 545
    },
    {
      "epoch": 0.36097751069088224,
      "grad_norm": 0.134765625,
      "learning_rate": 0.00016134504878144204,
      "loss": 1.2876,
      "step": 550
    },
    {
      "epoch": 0.3642591244244357,
      "grad_norm": 0.1474609375,
      "learning_rate": 0.00016043555466996206,
      "loss": 1.3086,
      "step": 555
    },
    {
      "epoch": 0.3675407381579892,
      "grad_norm": 0.1513671875,
      "learning_rate": 0.0001595181156961836,
      "loss": 1.2894,
      "step": 560
    },
    {
      "epoch": 0.37082235189154267,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00015859285246669913,
      "loss": 1.2933,
      "step": 565
    },
    {
      "epoch": 0.3741039656250961,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.00015765988661667834,
      "loss": 1.2885,
      "step": 570
    },
    {
      "epoch": 0.37738557935864964,
      "grad_norm": 0.150390625,
      "learning_rate": 0.00015671934079387797,
      "loss": 1.2719,
      "step": 575
    },
    {
      "epoch": 0.3806671930922031,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.00015577133864251848,
      "loss": 1.3012,
      "step": 580
    },
    {
      "epoch": 0.38394880682575655,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.00015481600478702996,
      "loss": 1.3046,
      "step": 585
    },
    {
      "epoch": 0.38723042055931006,
      "grad_norm": 0.142578125,
      "learning_rate": 0.0001538534648156686,
      "loss": 1.3017,
      "step": 590
    },
    {
      "epoch": 0.3905120342928635,
      "grad_norm": 0.13671875,
      "learning_rate": 0.00015288384526400734,
      "loss": 1.307,
      "step": 595
    },
    {
      "epoch": 0.393793648026417,
      "grad_norm": 0.134765625,
      "learning_rate": 0.00015190727359830109,
      "loss": 1.2755,
      "step": 600
    },
    {
      "epoch": 0.3970752617599705,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00015092387819873014,
      "loss": 1.2961,
      "step": 605
    },
    {
      "epoch": 0.40035687549352394,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.0001499337883425235,
      "loss": 1.2989,
      "step": 610
    },
    {
      "epoch": 0.4036384892270774,
      "grad_norm": 0.138671875,
      "learning_rate": 0.0001489371341869638,
      "loss": 1.2897,
      "step": 615
    },
    {
      "epoch": 0.4069201029606309,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.00014793404675227684,
      "loss": 1.3068,
      "step": 620
    },
    {
      "epoch": 0.41020171669418437,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.00014692465790440792,
      "loss": 1.28,
      "step": 625
    },
    {
      "epoch": 0.4134833304277378,
      "grad_norm": 0.13671875,
      "learning_rate": 0.0001459091003376865,
      "loss": 1.274,
      "step": 630
    },
    {
      "epoch": 0.41676494416129134,
      "grad_norm": 0.1474609375,
      "learning_rate": 0.00014488750755738223,
      "loss": 1.3007,
      "step": 635
    },
    {
      "epoch": 0.4200465578948448,
      "grad_norm": 0.14453125,
      "learning_rate": 0.00014386001386215434,
      "loss": 1.2852,
      "step": 640
    },
    {
      "epoch": 0.42332817162839825,
      "grad_norm": 0.138671875,
      "learning_rate": 0.0001428267543263969,
      "loss": 1.2861,
      "step": 645
    },
    {
      "epoch": 0.42660978536195177,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.00014178786478248162,
      "loss": 1.2873,
      "step": 650
    },
    {
      "epoch": 0.4298913990955052,
      "grad_norm": 0.140625,
      "learning_rate": 0.0001407434818029015,
      "loss": 1.283,
      "step": 655
    },
    {
      "epoch": 0.4331730128290587,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.00013969374268231713,
      "loss": 1.2828,
      "step": 660
    },
    {
      "epoch": 0.4364546265626122,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.0001386387854195076,
      "loss": 1.2577,
      "step": 665
    },
    {
      "epoch": 0.43973624029616565,
      "grad_norm": 0.134765625,
      "learning_rate": 0.0001375787486992294,
      "loss": 1.3003,
      "step": 670
    },
    {
      "epoch": 0.4430178540297191,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.00013651377187398492,
      "loss": 1.2879,
      "step": 675
    },
    {
      "epoch": 0.4462994677632726,
      "grad_norm": 0.140625,
      "learning_rate": 0.00013544399494570307,
      "loss": 1.2947,
      "step": 680
    },
    {
      "epoch": 0.4495810814968261,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.0001343695585473346,
      "loss": 1.263,
      "step": 685
    },
    {
      "epoch": 0.45286269523037953,
      "grad_norm": 0.1328125,
      "learning_rate": 0.00013329060392436456,
      "loss": 1.2842,
      "step": 690
    },
    {
      "epoch": 0.456144308963933,
      "grad_norm": 0.14453125,
      "learning_rate": 0.00013220727291624415,
      "loss": 1.2789,
      "step": 695
    },
    {
      "epoch": 0.4594259226974865,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.00013111970793774439,
      "loss": 1.2638,
      "step": 700
    },
    {
      "epoch": 0.46270753643103996,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.00013002805196023448,
      "loss": 1.2978,
      "step": 705
    },
    {
      "epoch": 0.4659891501645934,
      "grad_norm": 0.13671875,
      "learning_rate": 0.0001289324484928865,
      "loss": 1.2863,
      "step": 710
    },
    {
      "epoch": 0.4692707638981469,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.0001278330415638099,
      "loss": 1.2774,
      "step": 715
    },
    {
      "epoch": 0.4725523776317004,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.0001267299757011175,
      "loss": 1.2741,
      "step": 720
    },
    {
      "epoch": 0.47583399136525384,
      "grad_norm": 0.1328125,
      "learning_rate": 0.00012562339591392572,
      "loss": 1.2904,
      "step": 725
    },
    {
      "epoch": 0.47911560509880735,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.00012451344767329178,
      "loss": 1.2737,
      "step": 730
    },
    {
      "epoch": 0.4823972188323608,
      "grad_norm": 0.140625,
      "learning_rate": 0.00012340027689309,
      "loss": 1.2958,
      "step": 735
    },
    {
      "epoch": 0.48567883256591426,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.0001222840299108301,
      "loss": 1.2914,
      "step": 740
    },
    {
      "epoch": 0.4889604462994678,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.0001211648534684194,
      "loss": 1.3006,
      "step": 745
    },
    {
      "epoch": 0.49224206003302123,
      "grad_norm": 0.134765625,
      "learning_rate": 0.00012004289469287229,
      "loss": 1.2698,
      "step": 750
    },
    {
      "epoch": 0.4955236737665747,
      "grad_norm": 0.146484375,
      "learning_rate": 0.00011891830107696891,
      "loss": 1.2954,
      "step": 755
    },
    {
      "epoch": 0.4988052875001282,
      "grad_norm": 0.138671875,
      "learning_rate": 0.00011779122045986567,
      "loss": 1.2682,
      "step": 760
    },
    {
      "epoch": 0.5020869012336817,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.00011666180100766036,
      "loss": 1.2779,
      "step": 765
    },
    {
      "epoch": 0.5053685149672351,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.00011553019119391412,
      "loss": 1.2848,
      "step": 770
    },
    {
      "epoch": 0.5086501287007886,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.00011439653978013334,
      "loss": 1.2788,
      "step": 775
    },
    {
      "epoch": 0.5119317424343421,
      "grad_norm": 0.146484375,
      "learning_rate": 0.0001132609957962131,
      "loss": 1.2696,
      "step": 780
    },
    {
      "epoch": 0.5152133561678955,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.00011212370852084603,
      "loss": 1.272,
      "step": 785
    },
    {
      "epoch": 0.518494969901449,
      "grad_norm": 0.13671875,
      "learning_rate": 0.00011098482746189786,
      "loss": 1.2893,
      "step": 790
    },
    {
      "epoch": 0.5217765836350026,
      "grad_norm": 0.138671875,
      "learning_rate": 0.00010984450233675334,
      "loss": 1.2761,
      "step": 795
    },
    {
      "epoch": 0.525058197368556,
      "grad_norm": 0.142578125,
      "learning_rate": 0.0001087028830526342,
      "loss": 1.2472,
      "step": 800
    },
    {
      "epoch": 0.5283398111021095,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.00010756011968689242,
      "loss": 1.2683,
      "step": 805
    },
    {
      "epoch": 0.531621424835663,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.00010641636246728095,
      "loss": 1.2535,
      "step": 810
    },
    {
      "epoch": 0.5349030385692164,
      "grad_norm": 0.134765625,
      "learning_rate": 0.00010527176175220499,
      "loss": 1.2508,
      "step": 815
    },
    {
      "epoch": 0.5381846523027699,
      "grad_norm": 0.138671875,
      "learning_rate": 0.0001041264680109556,
      "loss": 1.2801,
      "step": 820
    },
    {
      "epoch": 0.5414662660363233,
      "grad_norm": 0.142578125,
      "learning_rate": 0.00010298063180392917,
      "loss": 1.2661,
      "step": 825
    },
    {
      "epoch": 0.5447478797698768,
      "grad_norm": 0.134765625,
      "learning_rate": 0.0001018344037628346,
      "loss": 1.2817,
      "step": 830
    },
    {
      "epoch": 0.5480294935034303,
      "grad_norm": 0.134765625,
      "learning_rate": 0.00010068793457089141,
      "loss": 1.2881,
      "step": 835
    },
    {
      "epoch": 0.5513111072369837,
      "grad_norm": 0.134765625,
      "learning_rate": 9.954137494302079e-05,
      "loss": 1.2755,
      "step": 840
    },
    {
      "epoch": 0.5545927209705372,
      "grad_norm": 0.1396484375,
      "learning_rate": 9.839487560603266e-05,
      "loss": 1.2741,
      "step": 845
    },
    {
      "epoch": 0.5578743347040908,
      "grad_norm": 0.1357421875,
      "learning_rate": 9.724858727881107e-05,
      "loss": 1.304,
      "step": 850
    },
    {
      "epoch": 0.5611559484376442,
      "grad_norm": 0.13671875,
      "learning_rate": 9.610266065250077e-05,
      "loss": 1.2742,
      "step": 855
    },
    {
      "epoch": 0.5644375621711977,
      "grad_norm": 0.13671875,
      "learning_rate": 9.495724637069718e-05,
      "loss": 1.2988,
      "step": 860
    },
    {
      "epoch": 0.5677191759047512,
      "grad_norm": 0.1337890625,
      "learning_rate": 9.381249500964294e-05,
      "loss": 1.2753,
      "step": 865
    },
    {
      "epoch": 0.5710007896383046,
      "grad_norm": 0.1376953125,
      "learning_rate": 9.266855705843309e-05,
      "loss": 1.273,
      "step": 870
    },
    {
      "epoch": 0.5742824033718581,
      "grad_norm": 0.13671875,
      "learning_rate": 9.152558289923177e-05,
      "loss": 1.2702,
      "step": 875
    },
    {
      "epoch": 0.5775640171054116,
      "grad_norm": 0.1396484375,
      "learning_rate": 9.038372278750287e-05,
      "loss": 1.2854,
      "step": 880
    },
    {
      "epoch": 0.580845630838965,
      "grad_norm": 0.140625,
      "learning_rate": 8.92431268322576e-05,
      "loss": 1.285,
      "step": 885
    },
    {
      "epoch": 0.5841272445725185,
      "grad_norm": 0.1318359375,
      "learning_rate": 8.810394497632102e-05,
      "loss": 1.2743,
      "step": 890
    },
    {
      "epoch": 0.587408858306072,
      "grad_norm": 0.142578125,
      "learning_rate": 8.696632697662063e-05,
      "loss": 1.2741,
      "step": 895
    },
    {
      "epoch": 0.5906904720396254,
      "grad_norm": 0.13671875,
      "learning_rate": 8.58304223844993e-05,
      "loss": 1.2847,
      "step": 900
    },
    {
      "epoch": 0.593972085773179,
      "grad_norm": 0.1376953125,
      "learning_rate": 8.469638052605513e-05,
      "loss": 1.2753,
      "step": 905
    },
    {
      "epoch": 0.5972536995067325,
      "grad_norm": 0.138671875,
      "learning_rate": 8.356435048251126e-05,
      "loss": 1.2679,
      "step": 910
    },
    {
      "epoch": 0.6005353132402859,
      "grad_norm": 0.1650390625,
      "learning_rate": 8.243448107061729e-05,
      "loss": 1.2631,
      "step": 915
    },
    {
      "epoch": 0.6038169269738394,
      "grad_norm": 0.134765625,
      "learning_rate": 8.130692082308624e-05,
      "loss": 1.2655,
      "step": 920
    },
    {
      "epoch": 0.6070985407073929,
      "grad_norm": 0.130859375,
      "learning_rate": 8.01818179690681e-05,
      "loss": 1.3186,
      "step": 925
    },
    {
      "epoch": 0.6103801544409463,
      "grad_norm": 0.1337890625,
      "learning_rate": 7.90593204146638e-05,
      "loss": 1.2895,
      "step": 930
    },
    {
      "epoch": 0.6136617681744998,
      "grad_norm": 0.1337890625,
      "learning_rate": 7.793957572348131e-05,
      "loss": 1.2751,
      "step": 935
    },
    {
      "epoch": 0.6169433819080533,
      "grad_norm": 0.13671875,
      "learning_rate": 7.682273109723712e-05,
      "loss": 1.2663,
      "step": 940
    },
    {
      "epoch": 0.6202249956416067,
      "grad_norm": 0.1357421875,
      "learning_rate": 7.570893335640487e-05,
      "loss": 1.2706,
      "step": 945
    },
    {
      "epoch": 0.6235066093751602,
      "grad_norm": 0.13671875,
      "learning_rate": 7.459832892091455e-05,
      "loss": 1.2638,
      "step": 950
    },
    {
      "epoch": 0.6267882231087137,
      "grad_norm": 0.134765625,
      "learning_rate": 7.349106379090381e-05,
      "loss": 1.275,
      "step": 955
    },
    {
      "epoch": 0.6300698368422671,
      "grad_norm": 0.1328125,
      "learning_rate": 7.23872835275252e-05,
      "loss": 1.272,
      "step": 960
    },
    {
      "epoch": 0.6333514505758207,
      "grad_norm": 0.1337890625,
      "learning_rate": 7.128713323381032e-05,
      "loss": 1.2768,
      "step": 965
    },
    {
      "epoch": 0.6366330643093742,
      "grad_norm": 0.1318359375,
      "learning_rate": 7.019075753559468e-05,
      "loss": 1.2743,
      "step": 970
    },
    {
      "epoch": 0.6399146780429276,
      "grad_norm": 0.13671875,
      "learning_rate": 6.909830056250527e-05,
      "loss": 1.2707,
      "step": 975
    },
    {
      "epoch": 0.6431962917764811,
      "grad_norm": 0.1337890625,
      "learning_rate": 6.800990592901315e-05,
      "loss": 1.2844,
      "step": 980
    },
    {
      "epoch": 0.6464779055100346,
      "grad_norm": 0.1376953125,
      "learning_rate": 6.692571671555398e-05,
      "loss": 1.264,
      "step": 985
    },
    {
      "epoch": 0.649759519243588,
      "grad_norm": 0.1435546875,
      "learning_rate": 6.584587544971854e-05,
      "loss": 1.2481,
      "step": 990
    },
    {
      "epoch": 0.6530411329771415,
      "grad_norm": 0.130859375,
      "learning_rate": 6.477052408751616e-05,
      "loss": 1.2738,
      "step": 995
    },
    {
      "epoch": 0.656322746710695,
      "grad_norm": 0.1337890625,
      "learning_rate": 6.369980399471306e-05,
      "loss": 1.2806,
      "step": 1000
    },
    {
      "epoch": 0.6596043604442484,
      "grad_norm": 0.13671875,
      "learning_rate": 6.263385592824857e-05,
      "loss": 1.2911,
      "step": 1005
    },
    {
      "epoch": 0.6628859741778019,
      "grad_norm": 0.134765625,
      "learning_rate": 6.157282001773095e-05,
      "loss": 1.2794,
      "step": 1010
    },
    {
      "epoch": 0.6661675879113554,
      "grad_norm": 0.134765625,
      "learning_rate": 6.051683574701616e-05,
      "loss": 1.2664,
      "step": 1015
    },
    {
      "epoch": 0.6694492016449088,
      "grad_norm": 0.142578125,
      "learning_rate": 5.946604193587134e-05,
      "loss": 1.2674,
      "step": 1020
    },
    {
      "epoch": 0.6727308153784624,
      "grad_norm": 0.1357421875,
      "learning_rate": 5.842057672172525e-05,
      "loss": 1.2696,
      "step": 1025
    },
    {
      "epoch": 0.6760124291120159,
      "grad_norm": 0.138671875,
      "learning_rate": 5.738057754150905e-05,
      "loss": 1.2657,
      "step": 1030
    },
    {
      "epoch": 0.6792940428455693,
      "grad_norm": 0.1318359375,
      "learning_rate": 5.634618111358865e-05,
      "loss": 1.2726,
      "step": 1035
    },
    {
      "epoch": 0.6825756565791228,
      "grad_norm": 0.134765625,
      "learning_rate": 5.531752341979173e-05,
      "loss": 1.2842,
      "step": 1040
    },
    {
      "epoch": 0.6858572703126763,
      "grad_norm": 0.134765625,
      "learning_rate": 5.429473968753157e-05,
      "loss": 1.265,
      "step": 1045
    },
    {
      "epoch": 0.6891388840462297,
      "grad_norm": 0.13671875,
      "learning_rate": 5.327796437203019e-05,
      "loss": 1.2795,
      "step": 1050
    },
    {
      "epoch": 0.6924204977797832,
      "grad_norm": 0.1328125,
      "learning_rate": 5.226733113864242e-05,
      "loss": 1.2817,
      "step": 1055
    },
    {
      "epoch": 0.6957021115133367,
      "grad_norm": 0.13671875,
      "learning_rate": 5.126297284528485e-05,
      "loss": 1.2538,
      "step": 1060
    },
    {
      "epoch": 0.6989837252468901,
      "grad_norm": 0.1416015625,
      "learning_rate": 5.0265021524969857e-05,
      "loss": 1.2608,
      "step": 1065
    },
    {
      "epoch": 0.7022653389804436,
      "grad_norm": 0.1318359375,
      "learning_rate": 4.927360836844868e-05,
      "loss": 1.2743,
      "step": 1070
    },
    {
      "epoch": 0.7055469527139971,
      "grad_norm": 0.1357421875,
      "learning_rate": 4.82888637069651e-05,
      "loss": 1.2725,
      "step": 1075
    },
    {
      "epoch": 0.7088285664475505,
      "grad_norm": 0.1328125,
      "learning_rate": 4.731091699512215e-05,
      "loss": 1.2578,
      "step": 1080
    },
    {
      "epoch": 0.7121101801811041,
      "grad_norm": 0.1357421875,
      "learning_rate": 4.6339896793863804e-05,
      "loss": 1.2784,
      "step": 1085
    },
    {
      "epoch": 0.7153917939146576,
      "grad_norm": 0.13671875,
      "learning_rate": 4.537593075357451e-05,
      "loss": 1.2708,
      "step": 1090
    },
    {
      "epoch": 0.718673407648211,
      "grad_norm": 0.13671875,
      "learning_rate": 4.441914559729825e-05,
      "loss": 1.2797,
      "step": 1095
    },
    {
      "epoch": 0.7219550213817645,
      "grad_norm": 0.13671875,
      "learning_rate": 4.346966710407937e-05,
      "loss": 1.3013,
      "step": 1100
    },
    {
      "epoch": 0.725236635115318,
      "grad_norm": 0.134765625,
      "learning_rate": 4.2527620092428e-05,
      "loss": 1.2535,
      "step": 1105
    },
    {
      "epoch": 0.7285182488488714,
      "grad_norm": 0.1337890625,
      "learning_rate": 4.159312840391086e-05,
      "loss": 1.2779,
      "step": 1110
    },
    {
      "epoch": 0.7317998625824249,
      "grad_norm": 0.134765625,
      "learning_rate": 4.066631488687166e-05,
      "loss": 1.2659,
      "step": 1115
    },
    {
      "epoch": 0.7350814763159784,
      "grad_norm": 0.1337890625,
      "learning_rate": 3.974730138028095e-05,
      "loss": 1.2653,
      "step": 1120
    },
    {
      "epoch": 0.7383630900495318,
      "grad_norm": 0.130859375,
      "learning_rate": 3.883620869771943e-05,
      "loss": 1.2735,
      "step": 1125
    },
    {
      "epoch": 0.7416447037830853,
      "grad_norm": 0.134765625,
      "learning_rate": 3.79331566114957e-05,
      "loss": 1.2653,
      "step": 1130
    },
    {
      "epoch": 0.7449263175166388,
      "grad_norm": 0.134765625,
      "learning_rate": 3.703826383690099e-05,
      "loss": 1.262,
      "step": 1135
    },
    {
      "epoch": 0.7482079312501922,
      "grad_norm": 0.1328125,
      "learning_rate": 3.6151648016602794e-05,
      "loss": 1.2491,
      "step": 1140
    },
    {
      "epoch": 0.7514895449837458,
      "grad_norm": 0.1357421875,
      "learning_rate": 3.527342570517975e-05,
      "loss": 1.2551,
      "step": 1145
    },
    {
      "epoch": 0.7547711587172993,
      "grad_norm": 0.1328125,
      "learning_rate": 3.44037123537991e-05,
      "loss": 1.2605,
      "step": 1150
    },
    {
      "epoch": 0.7580527724508527,
      "grad_norm": 0.134765625,
      "learning_rate": 3.3542622295039593e-05,
      "loss": 1.2621,
      "step": 1155
    },
    {
      "epoch": 0.7613343861844062,
      "grad_norm": 0.12890625,
      "learning_rate": 3.269026872786145e-05,
      "loss": 1.2798,
      "step": 1160
    },
    {
      "epoch": 0.7646159999179597,
      "grad_norm": 0.1318359375,
      "learning_rate": 3.184676370272488e-05,
      "loss": 1.2823,
      "step": 1165
    },
    {
      "epoch": 0.7678976136515131,
      "grad_norm": 0.1318359375,
      "learning_rate": 3.1012218106860345e-05,
      "loss": 1.284,
      "step": 1170
    },
    {
      "epoch": 0.7711792273850666,
      "grad_norm": 0.1357421875,
      "learning_rate": 3.0186741649690963e-05,
      "loss": 1.2825,
      "step": 1175
    },
    {
      "epoch": 0.7744608411186201,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.937044284841026e-05,
      "loss": 1.2561,
      "step": 1180
    },
    {
      "epoch": 0.7777424548521735,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.8563429013716514e-05,
      "loss": 1.2587,
      "step": 1185
    },
    {
      "epoch": 0.781024068585727,
      "grad_norm": 0.1328125,
      "learning_rate": 2.7765806235705594e-05,
      "loss": 1.2545,
      "step": 1190
    },
    {
      "epoch": 0.7843056823192806,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.6977679369924357e-05,
      "loss": 1.2553,
      "step": 1195
    },
    {
      "epoch": 0.787587296052834,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.6199152023586503e-05,
      "loss": 1.2713,
      "step": 1200
    },
    {
      "epoch": 0.7908689097863875,
      "grad_norm": 0.1318359375,
      "learning_rate": 2.5430326541952087e-05,
      "loss": 1.2593,
      "step": 1205
    },
    {
      "epoch": 0.794150523519941,
      "grad_norm": 0.13671875,
      "learning_rate": 2.4671303994873373e-05,
      "loss": 1.2509,
      "step": 1210
    },
    {
      "epoch": 0.7974321372534944,
      "grad_norm": 0.1328125,
      "learning_rate": 2.3922184163508254e-05,
      "loss": 1.2682,
      "step": 1215
    },
    {
      "epoch": 0.8007137509870479,
      "grad_norm": 0.134765625,
      "learning_rate": 2.3183065527202718e-05,
      "loss": 1.2596,
      "step": 1220
    },
    {
      "epoch": 0.8039953647206014,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.245404525054515e-05,
      "loss": 1.2634,
      "step": 1225
    },
    {
      "epoch": 0.8072769784541548,
      "grad_norm": 0.130859375,
      "learning_rate": 2.1735219170592734e-05,
      "loss": 1.2717,
      "step": 1230
    },
    {
      "epoch": 0.8105585921877083,
      "grad_norm": 0.1357421875,
      "learning_rate": 2.1026681784272872e-05,
      "loss": 1.2607,
      "step": 1235
    },
    {
      "epoch": 0.8138402059212618,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.0328526235960565e-05,
      "loss": 1.2733,
      "step": 1240
    },
    {
      "epoch": 0.8171218196548152,
      "grad_norm": 0.1318359375,
      "learning_rate": 1.9640844305233642e-05,
      "loss": 1.2696,
      "step": 1245
    },
    {
      "epoch": 0.8204034333883687,
      "grad_norm": 0.1318359375,
      "learning_rate": 1.8963726394807424e-05,
      "loss": 1.2779,
      "step": 1250
    },
    {
      "epoch": 0.8236850471219223,
      "grad_norm": 0.1328125,
      "learning_rate": 1.8297261518650456e-05,
      "loss": 1.2668,
      "step": 1255
    },
    {
      "epoch": 0.8269666608554757,
      "grad_norm": 0.130859375,
      "learning_rate": 1.7641537290282472e-05,
      "loss": 1.2646,
      "step": 1260
    },
    {
      "epoch": 0.8302482745890292,
      "grad_norm": 0.130859375,
      "learning_rate": 1.699663991125705e-05,
      "loss": 1.2696,
      "step": 1265
    },
    {
      "epoch": 0.8335298883225827,
      "grad_norm": 0.12890625,
      "learning_rate": 1.636265415982936e-05,
      "loss": 1.2604,
      "step": 1270
    },
    {
      "epoch": 0.8368115020561361,
      "grad_norm": 0.1328125,
      "learning_rate": 1.5739663379811122e-05,
      "loss": 1.2664,
      "step": 1275
    },
    {
      "epoch": 0.8400931157896896,
      "grad_norm": 0.130859375,
      "learning_rate": 1.512774946961445e-05,
      "loss": 1.2804,
      "step": 1280
    },
    {
      "epoch": 0.8433747295232431,
      "grad_norm": 0.1337890625,
      "learning_rate": 1.4526992871485345e-05,
      "loss": 1.2641,
      "step": 1285
    },
    {
      "epoch": 0.8466563432567965,
      "grad_norm": 0.1298828125,
      "learning_rate": 1.3937472560928733e-05,
      "loss": 1.2795,
      "step": 1290
    },
    {
      "epoch": 0.84993795699035,
      "grad_norm": 0.1318359375,
      "learning_rate": 1.3359266036326412e-05,
      "loss": 1.2659,
      "step": 1295
    },
    {
      "epoch": 0.8532195707239035,
      "grad_norm": 0.1337890625,
      "learning_rate": 1.2792449308749076e-05,
      "loss": 1.2643,
      "step": 1300
    },
    {
      "epoch": 0.8565011844574569,
      "grad_norm": 0.1298828125,
      "learning_rate": 1.2237096891963862e-05,
      "loss": 1.2812,
      "step": 1305
    },
    {
      "epoch": 0.8597827981910104,
      "grad_norm": 0.1298828125,
      "learning_rate": 1.1693281792638877e-05,
      "loss": 1.2669,
      "step": 1310
    },
    {
      "epoch": 0.863064411924564,
      "grad_norm": 0.1298828125,
      "learning_rate": 1.1161075500745543e-05,
      "loss": 1.2734,
      "step": 1315
    },
    {
      "epoch": 0.8663460256581174,
      "grad_norm": 0.1318359375,
      "learning_rate": 1.0640547980160742e-05,
      "loss": 1.2607,
      "step": 1320
    },
    {
      "epoch": 0.8696276393916709,
      "grad_norm": 0.1298828125,
      "learning_rate": 1.0131767659469205e-05,
      "loss": 1.2717,
      "step": 1325
    },
    {
      "epoch": 0.8729092531252244,
      "grad_norm": 0.1376953125,
      "learning_rate": 9.634801422967887e-06,
      "loss": 1.2767,
      "step": 1330
    },
    {
      "epoch": 0.8761908668587778,
      "grad_norm": 0.1337890625,
      "learning_rate": 9.149714601873516e-06,
      "loss": 1.274,
      "step": 1335
    },
    {
      "epoch": 0.8794724805923313,
      "grad_norm": 0.1328125,
      "learning_rate": 8.67657096573391e-06,
      "loss": 1.2553,
      "step": 1340
    },
    {
      "epoch": 0.8827540943258848,
      "grad_norm": 0.1279296875,
      "learning_rate": 8.215432714045024e-06,
      "loss": 1.2758,
      "step": 1345
    },
    {
      "epoch": 0.8860357080594382,
      "grad_norm": 0.1279296875,
      "learning_rate": 7.766360468074074e-06,
      "loss": 1.288,
      "step": 1350
    },
    {
      "epoch": 0.8893173217929917,
      "grad_norm": 0.1318359375,
      "learning_rate": 7.32941326289035e-06,
      "loss": 1.2421,
      "step": 1355
    },
    {
      "epoch": 0.8925989355265452,
      "grad_norm": 0.130859375,
      "learning_rate": 6.904648539604364e-06,
      "loss": 1.2517,
      "step": 1360
    },
    {
      "epoch": 0.8958805492600986,
      "grad_norm": 0.1328125,
      "learning_rate": 6.4921221378167915e-06,
      "loss": 1.2712,
      "step": 1365
    },
    {
      "epoch": 0.8991621629936521,
      "grad_norm": 0.12890625,
      "learning_rate": 6.091888288277569e-06,
      "loss": 1.264,
      "step": 1370
    },
    {
      "epoch": 0.9024437767272055,
      "grad_norm": 0.1328125,
      "learning_rate": 5.70399960575696e-06,
      "loss": 1.2713,
      "step": 1375
    },
    {
      "epoch": 0.9057253904607591,
      "grad_norm": 0.1337890625,
      "learning_rate": 5.328507082128642e-06,
      "loss": 1.272,
      "step": 1380
    },
    {
      "epoch": 0.9090070041943126,
      "grad_norm": 0.12890625,
      "learning_rate": 4.965460079666362e-06,
      "loss": 1.2672,
      "step": 1385
    },
    {
      "epoch": 0.912288617927866,
      "grad_norm": 0.1298828125,
      "learning_rate": 4.61490632455478e-06,
      "loss": 1.2732,
      "step": 1390
    },
    {
      "epoch": 0.9155702316614195,
      "grad_norm": 0.130859375,
      "learning_rate": 4.2768919006153876e-06,
      "loss": 1.2467,
      "step": 1395
    },
    {
      "epoch": 0.918851845394973,
      "grad_norm": 0.1328125,
      "learning_rate": 3.951461243248311e-06,
      "loss": 1.2634,
      "step": 1400
    },
    {
      "epoch": 0.9221334591285264,
      "grad_norm": 0.130859375,
      "learning_rate": 3.638657133590817e-06,
      "loss": 1.2571,
      "step": 1405
    },
    {
      "epoch": 0.9254150728620799,
      "grad_norm": 0.134765625,
      "learning_rate": 3.3385206928933097e-06,
      "loss": 1.2528,
      "step": 1410
    },
    {
      "epoch": 0.9286966865956334,
      "grad_norm": 0.130859375,
      "learning_rate": 3.0510913771135463e-06,
      "loss": 1.2647,
      "step": 1415
    },
    {
      "epoch": 0.9319783003291868,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.7764069717297724e-06,
      "loss": 1.2769,
      "step": 1420
    },
    {
      "epoch": 0.9352599140627403,
      "grad_norm": 0.1318359375,
      "learning_rate": 2.5145035867733312e-06,
      "loss": 1.2616,
      "step": 1425
    },
    {
      "epoch": 0.9385415277962939,
      "grad_norm": 0.1357421875,
      "learning_rate": 2.265415652081804e-06,
      "loss": 1.2698,
      "step": 1430
    },
    {
      "epoch": 0.9418231415298473,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.0291759127727294e-06,
      "loss": 1.2415,
      "step": 1435
    },
    {
      "epoch": 0.9451047552634008,
      "grad_norm": 0.1279296875,
      "learning_rate": 1.8058154249389502e-06,
      "loss": 1.2907,
      "step": 1440
    },
    {
      "epoch": 0.9483863689969543,
      "grad_norm": 0.130859375,
      "learning_rate": 1.5953635515660425e-06,
      "loss": 1.2786,
      "step": 1445
    },
    {
      "epoch": 0.9516679827305077,
      "grad_norm": 0.1337890625,
      "learning_rate": 1.3978479586721716e-06,
      "loss": 1.2634,
      "step": 1450
    },
    {
      "epoch": 0.9549495964640612,
      "grad_norm": 0.1337890625,
      "learning_rate": 1.2132946116711897e-06,
      "loss": 1.2866,
      "step": 1455
    },
    {
      "epoch": 0.9582312101976147,
      "grad_norm": 0.12890625,
      "learning_rate": 1.0417277719591667e-06,
      "loss": 1.2671,
      "step": 1460
    },
    {
      "epoch": 0.9615128239311681,
      "grad_norm": 0.1328125,
      "learning_rate": 8.831699937249859e-07,
      "loss": 1.251,
      "step": 1465
    },
    {
      "epoch": 0.9647944376647216,
      "grad_norm": 0.1318359375,
      "learning_rate": 7.376421209854267e-07,
      "loss": 1.2793,
      "step": 1470
    },
    {
      "epoch": 0.9680760513982751,
      "grad_norm": 0.126953125,
      "learning_rate": 6.051632848449562e-07,
      "loss": 1.2684,
      "step": 1475
    },
    {
      "epoch": 0.9713576651318285,
      "grad_norm": 0.1318359375,
      "learning_rate": 4.857509009807304e-07,
      "loss": 1.2605,
      "step": 1480
    },
    {
      "epoch": 0.974639278865382,
      "grad_norm": 0.130859375,
      "learning_rate": 3.7942066735321414e-07,
      "loss": 1.2608,
      "step": 1485
    },
    {
      "epoch": 0.9779208925989356,
      "grad_norm": 0.1318359375,
      "learning_rate": 2.861865621424431e-07,
      "loss": 1.2735,
      "step": 1490
    },
    {
      "epoch": 0.981202506332489,
      "grad_norm": 0.130859375,
      "learning_rate": 2.060608419105048e-07,
      "loss": 1.2788,
      "step": 1495
    },
    {
      "epoch": 0.9844841200660425,
      "grad_norm": 0.2451171875,
      "learning_rate": 1.3905403999024957e-07,
      "loss": 1.264,
      "step": 1500
    },
    {
      "epoch": 0.987765733799596,
      "grad_norm": 0.130859375,
      "learning_rate": 8.517496510059841e-08,
      "loss": 1.2673,
      "step": 1505
    },
    {
      "epoch": 0.9910473475331494,
      "grad_norm": 0.12890625,
      "learning_rate": 4.4430700188569095e-08,
      "loss": 1.2753,
      "step": 1510
    },
    {
      "epoch": 0.9943289612667029,
      "grad_norm": 0.130859375,
      "learning_rate": 1.6826601498098894e-08,
      "loss": 1.2567,
      "step": 1515
    },
    {
      "epoch": 0.9976105750002564,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.3662978659633183e-09,
      "loss": 1.2568,
      "step": 1520
    },
    {
      "epoch": 0.9995795432403884,
      "eval_loss": 1.4362765550613403,
      "eval_runtime": 1174.833,
      "eval_samples_per_second": 12.068,
      "eval_steps_per_second": 12.068,
      "step": 1523
    },
    {
      "epoch": 0.9995795432403884,
      "step": 1523,
      "total_flos": 2.6010044317889987e+18,
      "train_loss": 1.1185581020360233,
      "train_runtime": 52635.226,
      "train_samples_per_second": 3.705,
      "train_steps_per_second": 0.029
    }
  ],
  "logging_steps": 5,
  "max_steps": 1523,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 2.6010044317889987e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}