openhermes-1b-olmo-sft-qlora / trainer_state.json
Ritvik19's picture
Upload 12 files
a840386 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995795432403884,
"eval_steps": 500,
"global_step": 1523,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000656322746710695,
"grad_norm": 1.0,
"learning_rate": 1.3071895424836602e-06,
"loss": 2.0939,
"step": 1
},
{
"epoch": 0.003281613733553475,
"grad_norm": 0.9765625,
"learning_rate": 6.535947712418301e-06,
"loss": 2.0618,
"step": 5
},
{
"epoch": 0.00656322746710695,
"grad_norm": 0.91796875,
"learning_rate": 1.3071895424836602e-05,
"loss": 2.0432,
"step": 10
},
{
"epoch": 0.009844841200660424,
"grad_norm": 0.71484375,
"learning_rate": 1.9607843137254903e-05,
"loss": 2.001,
"step": 15
},
{
"epoch": 0.0131264549342139,
"grad_norm": 0.578125,
"learning_rate": 2.6143790849673204e-05,
"loss": 1.9571,
"step": 20
},
{
"epoch": 0.016408068667767374,
"grad_norm": 0.5625,
"learning_rate": 3.2679738562091506e-05,
"loss": 1.8829,
"step": 25
},
{
"epoch": 0.019689682401320848,
"grad_norm": 0.90234375,
"learning_rate": 3.9215686274509805e-05,
"loss": 1.8114,
"step": 30
},
{
"epoch": 0.022971296134874326,
"grad_norm": 0.490234375,
"learning_rate": 4.5751633986928104e-05,
"loss": 1.6748,
"step": 35
},
{
"epoch": 0.0262529098684278,
"grad_norm": 0.3828125,
"learning_rate": 5.228758169934641e-05,
"loss": 1.665,
"step": 40
},
{
"epoch": 0.029534523601981274,
"grad_norm": 0.2099609375,
"learning_rate": 5.882352941176471e-05,
"loss": 1.6295,
"step": 45
},
{
"epoch": 0.03281613733553475,
"grad_norm": 0.2314453125,
"learning_rate": 6.535947712418301e-05,
"loss": 1.6194,
"step": 50
},
{
"epoch": 0.036097751069088226,
"grad_norm": 0.18359375,
"learning_rate": 7.189542483660131e-05,
"loss": 1.5773,
"step": 55
},
{
"epoch": 0.039379364802641696,
"grad_norm": 0.140625,
"learning_rate": 7.843137254901961e-05,
"loss": 1.542,
"step": 60
},
{
"epoch": 0.042660978536195174,
"grad_norm": 0.126953125,
"learning_rate": 8.496732026143791e-05,
"loss": 1.5303,
"step": 65
},
{
"epoch": 0.04594259226974865,
"grad_norm": 0.1123046875,
"learning_rate": 9.150326797385621e-05,
"loss": 1.5031,
"step": 70
},
{
"epoch": 0.04922420600330212,
"grad_norm": 0.10595703125,
"learning_rate": 9.80392156862745e-05,
"loss": 1.5177,
"step": 75
},
{
"epoch": 0.0525058197368556,
"grad_norm": 0.09814453125,
"learning_rate": 0.00010457516339869282,
"loss": 1.5038,
"step": 80
},
{
"epoch": 0.05578743347040908,
"grad_norm": 0.09130859375,
"learning_rate": 0.00011111111111111112,
"loss": 1.4847,
"step": 85
},
{
"epoch": 0.05906904720396255,
"grad_norm": 0.0986328125,
"learning_rate": 0.00011764705882352942,
"loss": 1.5054,
"step": 90
},
{
"epoch": 0.062350660937516025,
"grad_norm": 0.09912109375,
"learning_rate": 0.00012418300653594771,
"loss": 1.4672,
"step": 95
},
{
"epoch": 0.0656322746710695,
"grad_norm": 0.10205078125,
"learning_rate": 0.00013071895424836603,
"loss": 1.4626,
"step": 100
},
{
"epoch": 0.06891388840462297,
"grad_norm": 0.1416015625,
"learning_rate": 0.0001372549019607843,
"loss": 1.451,
"step": 105
},
{
"epoch": 0.07219550213817645,
"grad_norm": 0.111328125,
"learning_rate": 0.00014379084967320262,
"loss": 1.4631,
"step": 110
},
{
"epoch": 0.07547711587172992,
"grad_norm": 0.10302734375,
"learning_rate": 0.0001503267973856209,
"loss": 1.4423,
"step": 115
},
{
"epoch": 0.07875872960528339,
"grad_norm": 0.1064453125,
"learning_rate": 0.00015686274509803922,
"loss": 1.4301,
"step": 120
},
{
"epoch": 0.08204034333883688,
"grad_norm": 0.1123046875,
"learning_rate": 0.00016339869281045753,
"loss": 1.4287,
"step": 125
},
{
"epoch": 0.08532195707239035,
"grad_norm": 0.162109375,
"learning_rate": 0.00016993464052287582,
"loss": 1.4294,
"step": 130
},
{
"epoch": 0.08860357080594382,
"grad_norm": 0.1240234375,
"learning_rate": 0.00017647058823529413,
"loss": 1.4201,
"step": 135
},
{
"epoch": 0.0918851845394973,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018300653594771241,
"loss": 1.4044,
"step": 140
},
{
"epoch": 0.09516679827305077,
"grad_norm": 0.1357421875,
"learning_rate": 0.00018954248366013073,
"loss": 1.4114,
"step": 145
},
{
"epoch": 0.09844841200660424,
"grad_norm": 0.1552734375,
"learning_rate": 0.000196078431372549,
"loss": 1.3928,
"step": 150
},
{
"epoch": 0.10173002574015773,
"grad_norm": 0.162109375,
"learning_rate": 0.0001999989483097553,
"loss": 1.4054,
"step": 155
},
{
"epoch": 0.1050116394737112,
"grad_norm": 0.1337890625,
"learning_rate": 0.00019998711704854725,
"loss": 1.3996,
"step": 160
},
{
"epoch": 0.10829325320726467,
"grad_norm": 0.15625,
"learning_rate": 0.0001999621414738484,
"loss": 1.4015,
"step": 165
},
{
"epoch": 0.11157486694081815,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001999240248689495,
"loss": 1.387,
"step": 170
},
{
"epoch": 0.11485648067437162,
"grad_norm": 0.140625,
"learning_rate": 0.00019987277224466215,
"loss": 1.4057,
"step": 175
},
{
"epoch": 0.1181380944079251,
"grad_norm": 0.142578125,
"learning_rate": 0.00019980839033865994,
"loss": 1.3844,
"step": 180
},
{
"epoch": 0.12141970814147857,
"grad_norm": 0.146484375,
"learning_rate": 0.00019973088761459287,
"loss": 1.3654,
"step": 185
},
{
"epoch": 0.12470132187503205,
"grad_norm": 0.1591796875,
"learning_rate": 0.00019964027426097448,
"loss": 1.3632,
"step": 190
},
{
"epoch": 0.12798293560858554,
"grad_norm": 0.154296875,
"learning_rate": 0.00019953656218984263,
"loss": 1.3982,
"step": 195
},
{
"epoch": 0.131264549342139,
"grad_norm": 0.146484375,
"learning_rate": 0.0001994197650351936,
"loss": 1.3842,
"step": 200
},
{
"epoch": 0.13454616307569248,
"grad_norm": 0.1640625,
"learning_rate": 0.0001992898981511896,
"loss": 1.3528,
"step": 205
},
{
"epoch": 0.13782777680924593,
"grad_norm": 0.173828125,
"learning_rate": 0.0001991469786101404,
"loss": 1.3811,
"step": 210
},
{
"epoch": 0.14110939054279942,
"grad_norm": 0.162109375,
"learning_rate": 0.00019899102520025896,
"loss": 1.3594,
"step": 215
},
{
"epoch": 0.1443910042763529,
"grad_norm": 0.1865234375,
"learning_rate": 0.0001988220584231916,
"loss": 1.3675,
"step": 220
},
{
"epoch": 0.14767261800990636,
"grad_norm": 0.15234375,
"learning_rate": 0.00019864010049132287,
"loss": 1.3532,
"step": 225
},
{
"epoch": 0.15095423174345984,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001984451753248553,
"loss": 1.3672,
"step": 230
},
{
"epoch": 0.15423584547701333,
"grad_norm": 0.1689453125,
"learning_rate": 0.0001982373085486651,
"loss": 1.3553,
"step": 235
},
{
"epoch": 0.15751745921056678,
"grad_norm": 0.1396484375,
"learning_rate": 0.00019801652748893347,
"loss": 1.3303,
"step": 240
},
{
"epoch": 0.16079907294412027,
"grad_norm": 0.1396484375,
"learning_rate": 0.00019778286116955407,
"loss": 1.35,
"step": 245
},
{
"epoch": 0.16408068667767375,
"grad_norm": 0.1484375,
"learning_rate": 0.00019753634030831782,
"loss": 1.3451,
"step": 250
},
{
"epoch": 0.1673623004112272,
"grad_norm": 0.1357421875,
"learning_rate": 0.00019727699731287465,
"loss": 1.3604,
"step": 255
},
{
"epoch": 0.1706439141447807,
"grad_norm": 0.15625,
"learning_rate": 0.00019700486627647305,
"loss": 1.3418,
"step": 260
},
{
"epoch": 0.17392552787833418,
"grad_norm": 0.171875,
"learning_rate": 0.0001967199829734784,
"loss": 1.3463,
"step": 265
},
{
"epoch": 0.17720714161188764,
"grad_norm": 0.1533203125,
"learning_rate": 0.00019642238485466989,
"loss": 1.357,
"step": 270
},
{
"epoch": 0.18048875534544112,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019611211104231724,
"loss": 1.3678,
"step": 275
},
{
"epoch": 0.1837703690789946,
"grad_norm": 0.134765625,
"learning_rate": 0.0001957892023250379,
"loss": 1.3296,
"step": 280
},
{
"epoch": 0.18705198281254806,
"grad_norm": 0.154296875,
"learning_rate": 0.00019545370115243462,
"loss": 1.3445,
"step": 285
},
{
"epoch": 0.19033359654610155,
"grad_norm": 0.146484375,
"learning_rate": 0.00019510565162951537,
"loss": 1.3407,
"step": 290
},
{
"epoch": 0.19361521027965503,
"grad_norm": 0.1484375,
"learning_rate": 0.00019474509951089507,
"loss": 1.3419,
"step": 295
},
{
"epoch": 0.1968968240132085,
"grad_norm": 0.1357421875,
"learning_rate": 0.00019437209219478084,
"loss": 1.349,
"step": 300
},
{
"epoch": 0.20017843774676197,
"grad_norm": 0.1728515625,
"learning_rate": 0.00019398667871674082,
"loss": 1.3282,
"step": 305
},
{
"epoch": 0.20346005148031546,
"grad_norm": 0.1669921875,
"learning_rate": 0.00019358890974325817,
"loss": 1.3115,
"step": 310
},
{
"epoch": 0.2067416652138689,
"grad_norm": 0.1396484375,
"learning_rate": 0.00019317883756507026,
"loss": 1.339,
"step": 315
},
{
"epoch": 0.2100232789474224,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001927565160902948,
"loss": 1.3186,
"step": 320
},
{
"epoch": 0.21330489268097588,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019232200083734265,
"loss": 1.3434,
"step": 325
},
{
"epoch": 0.21658650641452934,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019187534892761986,
"loss": 1.3186,
"step": 330
},
{
"epoch": 0.21986812014808282,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001914166190780181,
"loss": 1.3394,
"step": 335
},
{
"epoch": 0.2231497338816363,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019094587159319585,
"loss": 1.317,
"step": 340
},
{
"epoch": 0.22643134761518977,
"grad_norm": 0.142578125,
"learning_rate": 0.00019046316835765083,
"loss": 1.3344,
"step": 345
},
{
"epoch": 0.22971296134874325,
"grad_norm": 0.1650390625,
"learning_rate": 0.00018996857282758462,
"loss": 1.3402,
"step": 350
},
{
"epoch": 0.2329945750822967,
"grad_norm": 0.1455078125,
"learning_rate": 0.00018946215002256061,
"loss": 1.3396,
"step": 355
},
{
"epoch": 0.2362761888158502,
"grad_norm": 0.1591796875,
"learning_rate": 0.00018894396651695662,
"loss": 1.3289,
"step": 360
},
{
"epoch": 0.23955780254940368,
"grad_norm": 0.1396484375,
"learning_rate": 0.00018841409043121306,
"loss": 1.3274,
"step": 365
},
{
"epoch": 0.24283941628295713,
"grad_norm": 0.1416015625,
"learning_rate": 0.0001878725914228776,
"loss": 1.3312,
"step": 370
},
{
"epoch": 0.24612103001651062,
"grad_norm": 0.14453125,
"learning_rate": 0.00018731954067744834,
"loss": 1.3258,
"step": 375
},
{
"epoch": 0.2494026437500641,
"grad_norm": 0.150390625,
"learning_rate": 0.00018675501089901542,
"loss": 1.3224,
"step": 380
},
{
"epoch": 0.25268425748361756,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018617907630070352,
"loss": 1.3219,
"step": 385
},
{
"epoch": 0.25596587121717107,
"grad_norm": 0.1337890625,
"learning_rate": 0.0001855918125949157,
"loss": 1.3191,
"step": 390
},
{
"epoch": 0.2592474849507245,
"grad_norm": 0.1435546875,
"learning_rate": 0.00018499329698338035,
"loss": 1.3183,
"step": 395
},
{
"epoch": 0.262529098684278,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001843836081470022,
"loss": 1.3154,
"step": 400
},
{
"epoch": 0.2658107124178315,
"grad_norm": 0.142578125,
"learning_rate": 0.0001837628262355188,
"loss": 1.3131,
"step": 405
},
{
"epoch": 0.26909232615138495,
"grad_norm": 0.14453125,
"learning_rate": 0.00018313103285696425,
"loss": 1.3211,
"step": 410
},
{
"epoch": 0.2723739398849384,
"grad_norm": 0.150390625,
"learning_rate": 0.00018248831106694086,
"loss": 1.3082,
"step": 415
},
{
"epoch": 0.27565555361849187,
"grad_norm": 0.15234375,
"learning_rate": 0.00018183474535770068,
"loss": 1.3046,
"step": 420
},
{
"epoch": 0.2789371673520454,
"grad_norm": 0.1748046875,
"learning_rate": 0.00018117042164703814,
"loss": 1.3026,
"step": 425
},
{
"epoch": 0.28221878108559884,
"grad_norm": 0.1484375,
"learning_rate": 0.00018049542726699533,
"loss": 1.3265,
"step": 430
},
{
"epoch": 0.2855003948191523,
"grad_norm": 0.142578125,
"learning_rate": 0.00017980985095238124,
"loss": 1.2958,
"step": 435
},
{
"epoch": 0.2887820085527058,
"grad_norm": 0.1767578125,
"learning_rate": 0.00017911378282910675,
"loss": 1.311,
"step": 440
},
{
"epoch": 0.29206362228625926,
"grad_norm": 0.138671875,
"learning_rate": 0.00017840731440233674,
"loss": 1.3126,
"step": 445
},
{
"epoch": 0.2953452360198127,
"grad_norm": 0.1435546875,
"learning_rate": 0.00017769053854446053,
"loss": 1.303,
"step": 450
},
{
"epoch": 0.29862684975336623,
"grad_norm": 0.1484375,
"learning_rate": 0.00017696354948288327,
"loss": 1.3106,
"step": 455
},
{
"epoch": 0.3019084634869197,
"grad_norm": 0.14453125,
"learning_rate": 0.00017622644278763843,
"loss": 1.2993,
"step": 460
},
{
"epoch": 0.30519007722047314,
"grad_norm": 0.1455078125,
"learning_rate": 0.00017547931535882445,
"loss": 1.3031,
"step": 465
},
{
"epoch": 0.30847169095402666,
"grad_norm": 0.134765625,
"learning_rate": 0.000174722265413866,
"loss": 1.3175,
"step": 470
},
{
"epoch": 0.3117533046875801,
"grad_norm": 0.1484375,
"learning_rate": 0.0001739553924746025,
"loss": 1.3086,
"step": 475
},
{
"epoch": 0.31503491842113357,
"grad_norm": 0.1669921875,
"learning_rate": 0.0001731787973542049,
"loss": 1.3364,
"step": 480
},
{
"epoch": 0.3183165321546871,
"grad_norm": 0.150390625,
"learning_rate": 0.0001723925821439227,
"loss": 1.3103,
"step": 485
},
{
"epoch": 0.32159814588824054,
"grad_norm": 0.13671875,
"learning_rate": 0.00017159685019966316,
"loss": 1.3312,
"step": 490
},
{
"epoch": 0.324879759621794,
"grad_norm": 0.142578125,
"learning_rate": 0.00017079170612840404,
"loss": 1.3064,
"step": 495
},
{
"epoch": 0.3281613733553475,
"grad_norm": 0.1357421875,
"learning_rate": 0.00016997725577444205,
"loss": 1.3109,
"step": 500
},
{
"epoch": 0.33144298708890096,
"grad_norm": 0.1318359375,
"learning_rate": 0.0001691536062054783,
"loss": 1.3083,
"step": 505
},
{
"epoch": 0.3347246008224544,
"grad_norm": 0.1376953125,
"learning_rate": 0.0001683208656985436,
"loss": 1.2997,
"step": 510
},
{
"epoch": 0.33800621455600793,
"grad_norm": 0.140625,
"learning_rate": 0.00016747914372576393,
"loss": 1.3161,
"step": 515
},
{
"epoch": 0.3412878282895614,
"grad_norm": 0.1376953125,
"learning_rate": 0.00016662855093996945,
"loss": 1.2811,
"step": 520
},
{
"epoch": 0.34456944202311485,
"grad_norm": 0.14453125,
"learning_rate": 0.00016576919916014808,
"loss": 1.3146,
"step": 525
},
{
"epoch": 0.34785105575666836,
"grad_norm": 0.13671875,
"learning_rate": 0.00016490120135674566,
"loss": 1.2922,
"step": 530
},
{
"epoch": 0.3511326694902218,
"grad_norm": 0.146484375,
"learning_rate": 0.00016402467163681493,
"loss": 1.2962,
"step": 535
},
{
"epoch": 0.3544142832237753,
"grad_norm": 0.138671875,
"learning_rate": 0.00016313972522901491,
"loss": 1.3008,
"step": 540
},
{
"epoch": 0.3576958969573288,
"grad_norm": 0.1455078125,
"learning_rate": 0.00016224647846846315,
"loss": 1.3042,
"step": 545
},
{
"epoch": 0.36097751069088224,
"grad_norm": 0.134765625,
"learning_rate": 0.00016134504878144204,
"loss": 1.2876,
"step": 550
},
{
"epoch": 0.3642591244244357,
"grad_norm": 0.1474609375,
"learning_rate": 0.00016043555466996206,
"loss": 1.3086,
"step": 555
},
{
"epoch": 0.3675407381579892,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001595181156961836,
"loss": 1.2894,
"step": 560
},
{
"epoch": 0.37082235189154267,
"grad_norm": 0.1357421875,
"learning_rate": 0.00015859285246669913,
"loss": 1.2933,
"step": 565
},
{
"epoch": 0.3741039656250961,
"grad_norm": 0.1435546875,
"learning_rate": 0.00015765988661667834,
"loss": 1.2885,
"step": 570
},
{
"epoch": 0.37738557935864964,
"grad_norm": 0.150390625,
"learning_rate": 0.00015671934079387797,
"loss": 1.2719,
"step": 575
},
{
"epoch": 0.3806671930922031,
"grad_norm": 0.1337890625,
"learning_rate": 0.00015577133864251848,
"loss": 1.3012,
"step": 580
},
{
"epoch": 0.38394880682575655,
"grad_norm": 0.1337890625,
"learning_rate": 0.00015481600478702996,
"loss": 1.3046,
"step": 585
},
{
"epoch": 0.38723042055931006,
"grad_norm": 0.142578125,
"learning_rate": 0.0001538534648156686,
"loss": 1.3017,
"step": 590
},
{
"epoch": 0.3905120342928635,
"grad_norm": 0.13671875,
"learning_rate": 0.00015288384526400734,
"loss": 1.307,
"step": 595
},
{
"epoch": 0.393793648026417,
"grad_norm": 0.134765625,
"learning_rate": 0.00015190727359830109,
"loss": 1.2755,
"step": 600
},
{
"epoch": 0.3970752617599705,
"grad_norm": 0.1357421875,
"learning_rate": 0.00015092387819873014,
"loss": 1.2961,
"step": 605
},
{
"epoch": 0.40035687549352394,
"grad_norm": 0.1416015625,
"learning_rate": 0.0001499337883425235,
"loss": 1.2989,
"step": 610
},
{
"epoch": 0.4036384892270774,
"grad_norm": 0.138671875,
"learning_rate": 0.0001489371341869638,
"loss": 1.2897,
"step": 615
},
{
"epoch": 0.4069201029606309,
"grad_norm": 0.1435546875,
"learning_rate": 0.00014793404675227684,
"loss": 1.3068,
"step": 620
},
{
"epoch": 0.41020171669418437,
"grad_norm": 0.1416015625,
"learning_rate": 0.00014692465790440792,
"loss": 1.28,
"step": 625
},
{
"epoch": 0.4134833304277378,
"grad_norm": 0.13671875,
"learning_rate": 0.0001459091003376865,
"loss": 1.274,
"step": 630
},
{
"epoch": 0.41676494416129134,
"grad_norm": 0.1474609375,
"learning_rate": 0.00014488750755738223,
"loss": 1.3007,
"step": 635
},
{
"epoch": 0.4200465578948448,
"grad_norm": 0.14453125,
"learning_rate": 0.00014386001386215434,
"loss": 1.2852,
"step": 640
},
{
"epoch": 0.42332817162839825,
"grad_norm": 0.138671875,
"learning_rate": 0.0001428267543263969,
"loss": 1.2861,
"step": 645
},
{
"epoch": 0.42660978536195177,
"grad_norm": 0.1376953125,
"learning_rate": 0.00014178786478248162,
"loss": 1.2873,
"step": 650
},
{
"epoch": 0.4298913990955052,
"grad_norm": 0.140625,
"learning_rate": 0.0001407434818029015,
"loss": 1.283,
"step": 655
},
{
"epoch": 0.4331730128290587,
"grad_norm": 0.1376953125,
"learning_rate": 0.00013969374268231713,
"loss": 1.2828,
"step": 660
},
{
"epoch": 0.4364546265626122,
"grad_norm": 0.1376953125,
"learning_rate": 0.0001386387854195076,
"loss": 1.2577,
"step": 665
},
{
"epoch": 0.43973624029616565,
"grad_norm": 0.134765625,
"learning_rate": 0.0001375787486992294,
"loss": 1.3003,
"step": 670
},
{
"epoch": 0.4430178540297191,
"grad_norm": 0.1435546875,
"learning_rate": 0.00013651377187398492,
"loss": 1.2879,
"step": 675
},
{
"epoch": 0.4462994677632726,
"grad_norm": 0.140625,
"learning_rate": 0.00013544399494570307,
"loss": 1.2947,
"step": 680
},
{
"epoch": 0.4495810814968261,
"grad_norm": 0.1376953125,
"learning_rate": 0.0001343695585473346,
"loss": 1.263,
"step": 685
},
{
"epoch": 0.45286269523037953,
"grad_norm": 0.1328125,
"learning_rate": 0.00013329060392436456,
"loss": 1.2842,
"step": 690
},
{
"epoch": 0.456144308963933,
"grad_norm": 0.14453125,
"learning_rate": 0.00013220727291624415,
"loss": 1.2789,
"step": 695
},
{
"epoch": 0.4594259226974865,
"grad_norm": 0.1396484375,
"learning_rate": 0.00013111970793774439,
"loss": 1.2638,
"step": 700
},
{
"epoch": 0.46270753643103996,
"grad_norm": 0.1318359375,
"learning_rate": 0.00013002805196023448,
"loss": 1.2978,
"step": 705
},
{
"epoch": 0.4659891501645934,
"grad_norm": 0.13671875,
"learning_rate": 0.0001289324484928865,
"loss": 1.2863,
"step": 710
},
{
"epoch": 0.4692707638981469,
"grad_norm": 0.1318359375,
"learning_rate": 0.0001278330415638099,
"loss": 1.2774,
"step": 715
},
{
"epoch": 0.4725523776317004,
"grad_norm": 0.1357421875,
"learning_rate": 0.0001267299757011175,
"loss": 1.2741,
"step": 720
},
{
"epoch": 0.47583399136525384,
"grad_norm": 0.1328125,
"learning_rate": 0.00012562339591392572,
"loss": 1.2904,
"step": 725
},
{
"epoch": 0.47911560509880735,
"grad_norm": 0.1318359375,
"learning_rate": 0.00012451344767329178,
"loss": 1.2737,
"step": 730
},
{
"epoch": 0.4823972188323608,
"grad_norm": 0.140625,
"learning_rate": 0.00012340027689309,
"loss": 1.2958,
"step": 735
},
{
"epoch": 0.48567883256591426,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001222840299108301,
"loss": 1.2914,
"step": 740
},
{
"epoch": 0.4889604462994678,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001211648534684194,
"loss": 1.3006,
"step": 745
},
{
"epoch": 0.49224206003302123,
"grad_norm": 0.134765625,
"learning_rate": 0.00012004289469287229,
"loss": 1.2698,
"step": 750
},
{
"epoch": 0.4955236737665747,
"grad_norm": 0.146484375,
"learning_rate": 0.00011891830107696891,
"loss": 1.2954,
"step": 755
},
{
"epoch": 0.4988052875001282,
"grad_norm": 0.138671875,
"learning_rate": 0.00011779122045986567,
"loss": 1.2682,
"step": 760
},
{
"epoch": 0.5020869012336817,
"grad_norm": 0.1337890625,
"learning_rate": 0.00011666180100766036,
"loss": 1.2779,
"step": 765
},
{
"epoch": 0.5053685149672351,
"grad_norm": 0.1318359375,
"learning_rate": 0.00011553019119391412,
"loss": 1.2848,
"step": 770
},
{
"epoch": 0.5086501287007886,
"grad_norm": 0.1376953125,
"learning_rate": 0.00011439653978013334,
"loss": 1.2788,
"step": 775
},
{
"epoch": 0.5119317424343421,
"grad_norm": 0.146484375,
"learning_rate": 0.0001132609957962131,
"loss": 1.2696,
"step": 780
},
{
"epoch": 0.5152133561678955,
"grad_norm": 0.1337890625,
"learning_rate": 0.00011212370852084603,
"loss": 1.272,
"step": 785
},
{
"epoch": 0.518494969901449,
"grad_norm": 0.13671875,
"learning_rate": 0.00011098482746189786,
"loss": 1.2893,
"step": 790
},
{
"epoch": 0.5217765836350026,
"grad_norm": 0.138671875,
"learning_rate": 0.00010984450233675334,
"loss": 1.2761,
"step": 795
},
{
"epoch": 0.525058197368556,
"grad_norm": 0.142578125,
"learning_rate": 0.0001087028830526342,
"loss": 1.2472,
"step": 800
},
{
"epoch": 0.5283398111021095,
"grad_norm": 0.1416015625,
"learning_rate": 0.00010756011968689242,
"loss": 1.2683,
"step": 805
},
{
"epoch": 0.531621424835663,
"grad_norm": 0.1396484375,
"learning_rate": 0.00010641636246728095,
"loss": 1.2535,
"step": 810
},
{
"epoch": 0.5349030385692164,
"grad_norm": 0.134765625,
"learning_rate": 0.00010527176175220499,
"loss": 1.2508,
"step": 815
},
{
"epoch": 0.5381846523027699,
"grad_norm": 0.138671875,
"learning_rate": 0.0001041264680109556,
"loss": 1.2801,
"step": 820
},
{
"epoch": 0.5414662660363233,
"grad_norm": 0.142578125,
"learning_rate": 0.00010298063180392917,
"loss": 1.2661,
"step": 825
},
{
"epoch": 0.5447478797698768,
"grad_norm": 0.134765625,
"learning_rate": 0.0001018344037628346,
"loss": 1.2817,
"step": 830
},
{
"epoch": 0.5480294935034303,
"grad_norm": 0.134765625,
"learning_rate": 0.00010068793457089141,
"loss": 1.2881,
"step": 835
},
{
"epoch": 0.5513111072369837,
"grad_norm": 0.134765625,
"learning_rate": 9.954137494302079e-05,
"loss": 1.2755,
"step": 840
},
{
"epoch": 0.5545927209705372,
"grad_norm": 0.1396484375,
"learning_rate": 9.839487560603266e-05,
"loss": 1.2741,
"step": 845
},
{
"epoch": 0.5578743347040908,
"grad_norm": 0.1357421875,
"learning_rate": 9.724858727881107e-05,
"loss": 1.304,
"step": 850
},
{
"epoch": 0.5611559484376442,
"grad_norm": 0.13671875,
"learning_rate": 9.610266065250077e-05,
"loss": 1.2742,
"step": 855
},
{
"epoch": 0.5644375621711977,
"grad_norm": 0.13671875,
"learning_rate": 9.495724637069718e-05,
"loss": 1.2988,
"step": 860
},
{
"epoch": 0.5677191759047512,
"grad_norm": 0.1337890625,
"learning_rate": 9.381249500964294e-05,
"loss": 1.2753,
"step": 865
},
{
"epoch": 0.5710007896383046,
"grad_norm": 0.1376953125,
"learning_rate": 9.266855705843309e-05,
"loss": 1.273,
"step": 870
},
{
"epoch": 0.5742824033718581,
"grad_norm": 0.13671875,
"learning_rate": 9.152558289923177e-05,
"loss": 1.2702,
"step": 875
},
{
"epoch": 0.5775640171054116,
"grad_norm": 0.1396484375,
"learning_rate": 9.038372278750287e-05,
"loss": 1.2854,
"step": 880
},
{
"epoch": 0.580845630838965,
"grad_norm": 0.140625,
"learning_rate": 8.92431268322576e-05,
"loss": 1.285,
"step": 885
},
{
"epoch": 0.5841272445725185,
"grad_norm": 0.1318359375,
"learning_rate": 8.810394497632102e-05,
"loss": 1.2743,
"step": 890
},
{
"epoch": 0.587408858306072,
"grad_norm": 0.142578125,
"learning_rate": 8.696632697662063e-05,
"loss": 1.2741,
"step": 895
},
{
"epoch": 0.5906904720396254,
"grad_norm": 0.13671875,
"learning_rate": 8.58304223844993e-05,
"loss": 1.2847,
"step": 900
},
{
"epoch": 0.593972085773179,
"grad_norm": 0.1376953125,
"learning_rate": 8.469638052605513e-05,
"loss": 1.2753,
"step": 905
},
{
"epoch": 0.5972536995067325,
"grad_norm": 0.138671875,
"learning_rate": 8.356435048251126e-05,
"loss": 1.2679,
"step": 910
},
{
"epoch": 0.6005353132402859,
"grad_norm": 0.1650390625,
"learning_rate": 8.243448107061729e-05,
"loss": 1.2631,
"step": 915
},
{
"epoch": 0.6038169269738394,
"grad_norm": 0.134765625,
"learning_rate": 8.130692082308624e-05,
"loss": 1.2655,
"step": 920
},
{
"epoch": 0.6070985407073929,
"grad_norm": 0.130859375,
"learning_rate": 8.01818179690681e-05,
"loss": 1.3186,
"step": 925
},
{
"epoch": 0.6103801544409463,
"grad_norm": 0.1337890625,
"learning_rate": 7.90593204146638e-05,
"loss": 1.2895,
"step": 930
},
{
"epoch": 0.6136617681744998,
"grad_norm": 0.1337890625,
"learning_rate": 7.793957572348131e-05,
"loss": 1.2751,
"step": 935
},
{
"epoch": 0.6169433819080533,
"grad_norm": 0.13671875,
"learning_rate": 7.682273109723712e-05,
"loss": 1.2663,
"step": 940
},
{
"epoch": 0.6202249956416067,
"grad_norm": 0.1357421875,
"learning_rate": 7.570893335640487e-05,
"loss": 1.2706,
"step": 945
},
{
"epoch": 0.6235066093751602,
"grad_norm": 0.13671875,
"learning_rate": 7.459832892091455e-05,
"loss": 1.2638,
"step": 950
},
{
"epoch": 0.6267882231087137,
"grad_norm": 0.134765625,
"learning_rate": 7.349106379090381e-05,
"loss": 1.275,
"step": 955
},
{
"epoch": 0.6300698368422671,
"grad_norm": 0.1328125,
"learning_rate": 7.23872835275252e-05,
"loss": 1.272,
"step": 960
},
{
"epoch": 0.6333514505758207,
"grad_norm": 0.1337890625,
"learning_rate": 7.128713323381032e-05,
"loss": 1.2768,
"step": 965
},
{
"epoch": 0.6366330643093742,
"grad_norm": 0.1318359375,
"learning_rate": 7.019075753559468e-05,
"loss": 1.2743,
"step": 970
},
{
"epoch": 0.6399146780429276,
"grad_norm": 0.13671875,
"learning_rate": 6.909830056250527e-05,
"loss": 1.2707,
"step": 975
},
{
"epoch": 0.6431962917764811,
"grad_norm": 0.1337890625,
"learning_rate": 6.800990592901315e-05,
"loss": 1.2844,
"step": 980
},
{
"epoch": 0.6464779055100346,
"grad_norm": 0.1376953125,
"learning_rate": 6.692571671555398e-05,
"loss": 1.264,
"step": 985
},
{
"epoch": 0.649759519243588,
"grad_norm": 0.1435546875,
"learning_rate": 6.584587544971854e-05,
"loss": 1.2481,
"step": 990
},
{
"epoch": 0.6530411329771415,
"grad_norm": 0.130859375,
"learning_rate": 6.477052408751616e-05,
"loss": 1.2738,
"step": 995
},
{
"epoch": 0.656322746710695,
"grad_norm": 0.1337890625,
"learning_rate": 6.369980399471306e-05,
"loss": 1.2806,
"step": 1000
},
{
"epoch": 0.6596043604442484,
"grad_norm": 0.13671875,
"learning_rate": 6.263385592824857e-05,
"loss": 1.2911,
"step": 1005
},
{
"epoch": 0.6628859741778019,
"grad_norm": 0.134765625,
"learning_rate": 6.157282001773095e-05,
"loss": 1.2794,
"step": 1010
},
{
"epoch": 0.6661675879113554,
"grad_norm": 0.134765625,
"learning_rate": 6.051683574701616e-05,
"loss": 1.2664,
"step": 1015
},
{
"epoch": 0.6694492016449088,
"grad_norm": 0.142578125,
"learning_rate": 5.946604193587134e-05,
"loss": 1.2674,
"step": 1020
},
{
"epoch": 0.6727308153784624,
"grad_norm": 0.1357421875,
"learning_rate": 5.842057672172525e-05,
"loss": 1.2696,
"step": 1025
},
{
"epoch": 0.6760124291120159,
"grad_norm": 0.138671875,
"learning_rate": 5.738057754150905e-05,
"loss": 1.2657,
"step": 1030
},
{
"epoch": 0.6792940428455693,
"grad_norm": 0.1318359375,
"learning_rate": 5.634618111358865e-05,
"loss": 1.2726,
"step": 1035
},
{
"epoch": 0.6825756565791228,
"grad_norm": 0.134765625,
"learning_rate": 5.531752341979173e-05,
"loss": 1.2842,
"step": 1040
},
{
"epoch": 0.6858572703126763,
"grad_norm": 0.134765625,
"learning_rate": 5.429473968753157e-05,
"loss": 1.265,
"step": 1045
},
{
"epoch": 0.6891388840462297,
"grad_norm": 0.13671875,
"learning_rate": 5.327796437203019e-05,
"loss": 1.2795,
"step": 1050
},
{
"epoch": 0.6924204977797832,
"grad_norm": 0.1328125,
"learning_rate": 5.226733113864242e-05,
"loss": 1.2817,
"step": 1055
},
{
"epoch": 0.6957021115133367,
"grad_norm": 0.13671875,
"learning_rate": 5.126297284528485e-05,
"loss": 1.2538,
"step": 1060
},
{
"epoch": 0.6989837252468901,
"grad_norm": 0.1416015625,
"learning_rate": 5.0265021524969857e-05,
"loss": 1.2608,
"step": 1065
},
{
"epoch": 0.7022653389804436,
"grad_norm": 0.1318359375,
"learning_rate": 4.927360836844868e-05,
"loss": 1.2743,
"step": 1070
},
{
"epoch": 0.7055469527139971,
"grad_norm": 0.1357421875,
"learning_rate": 4.82888637069651e-05,
"loss": 1.2725,
"step": 1075
},
{
"epoch": 0.7088285664475505,
"grad_norm": 0.1328125,
"learning_rate": 4.731091699512215e-05,
"loss": 1.2578,
"step": 1080
},
{
"epoch": 0.7121101801811041,
"grad_norm": 0.1357421875,
"learning_rate": 4.6339896793863804e-05,
"loss": 1.2784,
"step": 1085
},
{
"epoch": 0.7153917939146576,
"grad_norm": 0.13671875,
"learning_rate": 4.537593075357451e-05,
"loss": 1.2708,
"step": 1090
},
{
"epoch": 0.718673407648211,
"grad_norm": 0.13671875,
"learning_rate": 4.441914559729825e-05,
"loss": 1.2797,
"step": 1095
},
{
"epoch": 0.7219550213817645,
"grad_norm": 0.13671875,
"learning_rate": 4.346966710407937e-05,
"loss": 1.3013,
"step": 1100
},
{
"epoch": 0.725236635115318,
"grad_norm": 0.134765625,
"learning_rate": 4.2527620092428e-05,
"loss": 1.2535,
"step": 1105
},
{
"epoch": 0.7285182488488714,
"grad_norm": 0.1337890625,
"learning_rate": 4.159312840391086e-05,
"loss": 1.2779,
"step": 1110
},
{
"epoch": 0.7317998625824249,
"grad_norm": 0.134765625,
"learning_rate": 4.066631488687166e-05,
"loss": 1.2659,
"step": 1115
},
{
"epoch": 0.7350814763159784,
"grad_norm": 0.1337890625,
"learning_rate": 3.974730138028095e-05,
"loss": 1.2653,
"step": 1120
},
{
"epoch": 0.7383630900495318,
"grad_norm": 0.130859375,
"learning_rate": 3.883620869771943e-05,
"loss": 1.2735,
"step": 1125
},
{
"epoch": 0.7416447037830853,
"grad_norm": 0.134765625,
"learning_rate": 3.79331566114957e-05,
"loss": 1.2653,
"step": 1130
},
{
"epoch": 0.7449263175166388,
"grad_norm": 0.134765625,
"learning_rate": 3.703826383690099e-05,
"loss": 1.262,
"step": 1135
},
{
"epoch": 0.7482079312501922,
"grad_norm": 0.1328125,
"learning_rate": 3.6151648016602794e-05,
"loss": 1.2491,
"step": 1140
},
{
"epoch": 0.7514895449837458,
"grad_norm": 0.1357421875,
"learning_rate": 3.527342570517975e-05,
"loss": 1.2551,
"step": 1145
},
{
"epoch": 0.7547711587172993,
"grad_norm": 0.1328125,
"learning_rate": 3.44037123537991e-05,
"loss": 1.2605,
"step": 1150
},
{
"epoch": 0.7580527724508527,
"grad_norm": 0.134765625,
"learning_rate": 3.3542622295039593e-05,
"loss": 1.2621,
"step": 1155
},
{
"epoch": 0.7613343861844062,
"grad_norm": 0.12890625,
"learning_rate": 3.269026872786145e-05,
"loss": 1.2798,
"step": 1160
},
{
"epoch": 0.7646159999179597,
"grad_norm": 0.1318359375,
"learning_rate": 3.184676370272488e-05,
"loss": 1.2823,
"step": 1165
},
{
"epoch": 0.7678976136515131,
"grad_norm": 0.1318359375,
"learning_rate": 3.1012218106860345e-05,
"loss": 1.284,
"step": 1170
},
{
"epoch": 0.7711792273850666,
"grad_norm": 0.1357421875,
"learning_rate": 3.0186741649690963e-05,
"loss": 1.2825,
"step": 1175
},
{
"epoch": 0.7744608411186201,
"grad_norm": 0.1337890625,
"learning_rate": 2.937044284841026e-05,
"loss": 1.2561,
"step": 1180
},
{
"epoch": 0.7777424548521735,
"grad_norm": 0.1337890625,
"learning_rate": 2.8563429013716514e-05,
"loss": 1.2587,
"step": 1185
},
{
"epoch": 0.781024068585727,
"grad_norm": 0.1328125,
"learning_rate": 2.7765806235705594e-05,
"loss": 1.2545,
"step": 1190
},
{
"epoch": 0.7843056823192806,
"grad_norm": 0.1337890625,
"learning_rate": 2.6977679369924357e-05,
"loss": 1.2553,
"step": 1195
},
{
"epoch": 0.787587296052834,
"grad_norm": 0.1337890625,
"learning_rate": 2.6199152023586503e-05,
"loss": 1.2713,
"step": 1200
},
{
"epoch": 0.7908689097863875,
"grad_norm": 0.1318359375,
"learning_rate": 2.5430326541952087e-05,
"loss": 1.2593,
"step": 1205
},
{
"epoch": 0.794150523519941,
"grad_norm": 0.13671875,
"learning_rate": 2.4671303994873373e-05,
"loss": 1.2509,
"step": 1210
},
{
"epoch": 0.7974321372534944,
"grad_norm": 0.1328125,
"learning_rate": 2.3922184163508254e-05,
"loss": 1.2682,
"step": 1215
},
{
"epoch": 0.8007137509870479,
"grad_norm": 0.134765625,
"learning_rate": 2.3183065527202718e-05,
"loss": 1.2596,
"step": 1220
},
{
"epoch": 0.8039953647206014,
"grad_norm": 0.1337890625,
"learning_rate": 2.245404525054515e-05,
"loss": 1.2634,
"step": 1225
},
{
"epoch": 0.8072769784541548,
"grad_norm": 0.130859375,
"learning_rate": 2.1735219170592734e-05,
"loss": 1.2717,
"step": 1230
},
{
"epoch": 0.8105585921877083,
"grad_norm": 0.1357421875,
"learning_rate": 2.1026681784272872e-05,
"loss": 1.2607,
"step": 1235
},
{
"epoch": 0.8138402059212618,
"grad_norm": 0.1337890625,
"learning_rate": 2.0328526235960565e-05,
"loss": 1.2733,
"step": 1240
},
{
"epoch": 0.8171218196548152,
"grad_norm": 0.1318359375,
"learning_rate": 1.9640844305233642e-05,
"loss": 1.2696,
"step": 1245
},
{
"epoch": 0.8204034333883687,
"grad_norm": 0.1318359375,
"learning_rate": 1.8963726394807424e-05,
"loss": 1.2779,
"step": 1250
},
{
"epoch": 0.8236850471219223,
"grad_norm": 0.1328125,
"learning_rate": 1.8297261518650456e-05,
"loss": 1.2668,
"step": 1255
},
{
"epoch": 0.8269666608554757,
"grad_norm": 0.130859375,
"learning_rate": 1.7641537290282472e-05,
"loss": 1.2646,
"step": 1260
},
{
"epoch": 0.8302482745890292,
"grad_norm": 0.130859375,
"learning_rate": 1.699663991125705e-05,
"loss": 1.2696,
"step": 1265
},
{
"epoch": 0.8335298883225827,
"grad_norm": 0.12890625,
"learning_rate": 1.636265415982936e-05,
"loss": 1.2604,
"step": 1270
},
{
"epoch": 0.8368115020561361,
"grad_norm": 0.1328125,
"learning_rate": 1.5739663379811122e-05,
"loss": 1.2664,
"step": 1275
},
{
"epoch": 0.8400931157896896,
"grad_norm": 0.130859375,
"learning_rate": 1.512774946961445e-05,
"loss": 1.2804,
"step": 1280
},
{
"epoch": 0.8433747295232431,
"grad_norm": 0.1337890625,
"learning_rate": 1.4526992871485345e-05,
"loss": 1.2641,
"step": 1285
},
{
"epoch": 0.8466563432567965,
"grad_norm": 0.1298828125,
"learning_rate": 1.3937472560928733e-05,
"loss": 1.2795,
"step": 1290
},
{
"epoch": 0.84993795699035,
"grad_norm": 0.1318359375,
"learning_rate": 1.3359266036326412e-05,
"loss": 1.2659,
"step": 1295
},
{
"epoch": 0.8532195707239035,
"grad_norm": 0.1337890625,
"learning_rate": 1.2792449308749076e-05,
"loss": 1.2643,
"step": 1300
},
{
"epoch": 0.8565011844574569,
"grad_norm": 0.1298828125,
"learning_rate": 1.2237096891963862e-05,
"loss": 1.2812,
"step": 1305
},
{
"epoch": 0.8597827981910104,
"grad_norm": 0.1298828125,
"learning_rate": 1.1693281792638877e-05,
"loss": 1.2669,
"step": 1310
},
{
"epoch": 0.863064411924564,
"grad_norm": 0.1298828125,
"learning_rate": 1.1161075500745543e-05,
"loss": 1.2734,
"step": 1315
},
{
"epoch": 0.8663460256581174,
"grad_norm": 0.1318359375,
"learning_rate": 1.0640547980160742e-05,
"loss": 1.2607,
"step": 1320
},
{
"epoch": 0.8696276393916709,
"grad_norm": 0.1298828125,
"learning_rate": 1.0131767659469205e-05,
"loss": 1.2717,
"step": 1325
},
{
"epoch": 0.8729092531252244,
"grad_norm": 0.1376953125,
"learning_rate": 9.634801422967887e-06,
"loss": 1.2767,
"step": 1330
},
{
"epoch": 0.8761908668587778,
"grad_norm": 0.1337890625,
"learning_rate": 9.149714601873516e-06,
"loss": 1.274,
"step": 1335
},
{
"epoch": 0.8794724805923313,
"grad_norm": 0.1328125,
"learning_rate": 8.67657096573391e-06,
"loss": 1.2553,
"step": 1340
},
{
"epoch": 0.8827540943258848,
"grad_norm": 0.1279296875,
"learning_rate": 8.215432714045024e-06,
"loss": 1.2758,
"step": 1345
},
{
"epoch": 0.8860357080594382,
"grad_norm": 0.1279296875,
"learning_rate": 7.766360468074074e-06,
"loss": 1.288,
"step": 1350
},
{
"epoch": 0.8893173217929917,
"grad_norm": 0.1318359375,
"learning_rate": 7.32941326289035e-06,
"loss": 1.2421,
"step": 1355
},
{
"epoch": 0.8925989355265452,
"grad_norm": 0.130859375,
"learning_rate": 6.904648539604364e-06,
"loss": 1.2517,
"step": 1360
},
{
"epoch": 0.8958805492600986,
"grad_norm": 0.1328125,
"learning_rate": 6.4921221378167915e-06,
"loss": 1.2712,
"step": 1365
},
{
"epoch": 0.8991621629936521,
"grad_norm": 0.12890625,
"learning_rate": 6.091888288277569e-06,
"loss": 1.264,
"step": 1370
},
{
"epoch": 0.9024437767272055,
"grad_norm": 0.1328125,
"learning_rate": 5.70399960575696e-06,
"loss": 1.2713,
"step": 1375
},
{
"epoch": 0.9057253904607591,
"grad_norm": 0.1337890625,
"learning_rate": 5.328507082128642e-06,
"loss": 1.272,
"step": 1380
},
{
"epoch": 0.9090070041943126,
"grad_norm": 0.12890625,
"learning_rate": 4.965460079666362e-06,
"loss": 1.2672,
"step": 1385
},
{
"epoch": 0.912288617927866,
"grad_norm": 0.1298828125,
"learning_rate": 4.61490632455478e-06,
"loss": 1.2732,
"step": 1390
},
{
"epoch": 0.9155702316614195,
"grad_norm": 0.130859375,
"learning_rate": 4.2768919006153876e-06,
"loss": 1.2467,
"step": 1395
},
{
"epoch": 0.918851845394973,
"grad_norm": 0.1328125,
"learning_rate": 3.951461243248311e-06,
"loss": 1.2634,
"step": 1400
},
{
"epoch": 0.9221334591285264,
"grad_norm": 0.130859375,
"learning_rate": 3.638657133590817e-06,
"loss": 1.2571,
"step": 1405
},
{
"epoch": 0.9254150728620799,
"grad_norm": 0.134765625,
"learning_rate": 3.3385206928933097e-06,
"loss": 1.2528,
"step": 1410
},
{
"epoch": 0.9286966865956334,
"grad_norm": 0.130859375,
"learning_rate": 3.0510913771135463e-06,
"loss": 1.2647,
"step": 1415
},
{
"epoch": 0.9319783003291868,
"grad_norm": 0.1337890625,
"learning_rate": 2.7764069717297724e-06,
"loss": 1.2769,
"step": 1420
},
{
"epoch": 0.9352599140627403,
"grad_norm": 0.1318359375,
"learning_rate": 2.5145035867733312e-06,
"loss": 1.2616,
"step": 1425
},
{
"epoch": 0.9385415277962939,
"grad_norm": 0.1357421875,
"learning_rate": 2.265415652081804e-06,
"loss": 1.2698,
"step": 1430
},
{
"epoch": 0.9418231415298473,
"grad_norm": 0.1337890625,
"learning_rate": 2.0291759127727294e-06,
"loss": 1.2415,
"step": 1435
},
{
"epoch": 0.9451047552634008,
"grad_norm": 0.1279296875,
"learning_rate": 1.8058154249389502e-06,
"loss": 1.2907,
"step": 1440
},
{
"epoch": 0.9483863689969543,
"grad_norm": 0.130859375,
"learning_rate": 1.5953635515660425e-06,
"loss": 1.2786,
"step": 1445
},
{
"epoch": 0.9516679827305077,
"grad_norm": 0.1337890625,
"learning_rate": 1.3978479586721716e-06,
"loss": 1.2634,
"step": 1450
},
{
"epoch": 0.9549495964640612,
"grad_norm": 0.1337890625,
"learning_rate": 1.2132946116711897e-06,
"loss": 1.2866,
"step": 1455
},
{
"epoch": 0.9582312101976147,
"grad_norm": 0.12890625,
"learning_rate": 1.0417277719591667e-06,
"loss": 1.2671,
"step": 1460
},
{
"epoch": 0.9615128239311681,
"grad_norm": 0.1328125,
"learning_rate": 8.831699937249859e-07,
"loss": 1.251,
"step": 1465
},
{
"epoch": 0.9647944376647216,
"grad_norm": 0.1318359375,
"learning_rate": 7.376421209854267e-07,
"loss": 1.2793,
"step": 1470
},
{
"epoch": 0.9680760513982751,
"grad_norm": 0.126953125,
"learning_rate": 6.051632848449562e-07,
"loss": 1.2684,
"step": 1475
},
{
"epoch": 0.9713576651318285,
"grad_norm": 0.1318359375,
"learning_rate": 4.857509009807304e-07,
"loss": 1.2605,
"step": 1480
},
{
"epoch": 0.974639278865382,
"grad_norm": 0.130859375,
"learning_rate": 3.7942066735321414e-07,
"loss": 1.2608,
"step": 1485
},
{
"epoch": 0.9779208925989356,
"grad_norm": 0.1318359375,
"learning_rate": 2.861865621424431e-07,
"loss": 1.2735,
"step": 1490
},
{
"epoch": 0.981202506332489,
"grad_norm": 0.130859375,
"learning_rate": 2.060608419105048e-07,
"loss": 1.2788,
"step": 1495
},
{
"epoch": 0.9844841200660425,
"grad_norm": 0.2451171875,
"learning_rate": 1.3905403999024957e-07,
"loss": 1.264,
"step": 1500
},
{
"epoch": 0.987765733799596,
"grad_norm": 0.130859375,
"learning_rate": 8.517496510059841e-08,
"loss": 1.2673,
"step": 1505
},
{
"epoch": 0.9910473475331494,
"grad_norm": 0.12890625,
"learning_rate": 4.4430700188569095e-08,
"loss": 1.2753,
"step": 1510
},
{
"epoch": 0.9943289612667029,
"grad_norm": 0.130859375,
"learning_rate": 1.6826601498098894e-08,
"loss": 1.2567,
"step": 1515
},
{
"epoch": 0.9976105750002564,
"grad_norm": 0.1337890625,
"learning_rate": 2.3662978659633183e-09,
"loss": 1.2568,
"step": 1520
},
{
"epoch": 0.9995795432403884,
"eval_loss": 1.4362765550613403,
"eval_runtime": 1174.833,
"eval_samples_per_second": 12.068,
"eval_steps_per_second": 12.068,
"step": 1523
},
{
"epoch": 0.9995795432403884,
"step": 1523,
"total_flos": 2.6010044317889987e+18,
"train_loss": 1.1185581020360233,
"train_runtime": 52635.226,
"train_samples_per_second": 3.705,
"train_steps_per_second": 0.029
}
],
"logging_steps": 5,
"max_steps": 1523,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 2.6010044317889987e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}