| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 500, |
| "global_step": 420, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.047619047619047616, |
| "grad_norm": 1.4257669918070304, |
| "learning_rate": 5.9523809523809525e-06, |
| "loss": 0.5346, |
| "mean_token_accuracy": 0.8659344732761383, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.09523809523809523, |
| "grad_norm": 0.7751540276308181, |
| "learning_rate": 1.1904761904761905e-05, |
| "loss": 0.4535, |
| "mean_token_accuracy": 0.8758046269416809, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.3688156015884892, |
| "learning_rate": 1.785714285714286e-05, |
| "loss": 0.4071, |
| "mean_token_accuracy": 0.8853001713752746, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.19047619047619047, |
| "grad_norm": 0.2571090114484444, |
| "learning_rate": 2.380952380952381e-05, |
| "loss": 0.3811, |
| "mean_token_accuracy": 0.8898558735847473, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.23809523809523808, |
| "grad_norm": 0.2445335136114927, |
| "learning_rate": 2.9761904761904762e-05, |
| "loss": 0.3714, |
| "mean_token_accuracy": 0.8922576308250427, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.25497240173448055, |
| "learning_rate": 3.571428571428572e-05, |
| "loss": 0.3621, |
| "mean_token_accuracy": 0.8938062608242034, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.22758565353000912, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 0.3572, |
| "mean_token_accuracy": 0.8950600028038025, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.38095238095238093, |
| "grad_norm": 0.21108211135784025, |
| "learning_rate": 4.761904761904762e-05, |
| "loss": 0.3463, |
| "mean_token_accuracy": 0.8972749292850495, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.42857142857142855, |
| "grad_norm": 0.2207672207804811, |
| "learning_rate": 4.999300659501837e-05, |
| "loss": 0.3443, |
| "mean_token_accuracy": 0.8975539267063141, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 0.21926886992248296, |
| "learning_rate": 4.995028486261366e-05, |
| "loss": 0.3378, |
| "mean_token_accuracy": 0.899033111333847, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.5238095238095238, |
| "grad_norm": 0.224235174049935, |
| "learning_rate": 4.986880029700503e-05, |
| "loss": 0.3403, |
| "mean_token_accuracy": 0.8979385972023011, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.21434998232271477, |
| "learning_rate": 4.974869359006539e-05, |
| "loss": 0.3297, |
| "mean_token_accuracy": 0.9011363804340362, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.6190476190476191, |
| "grad_norm": 0.22126533026421416, |
| "learning_rate": 4.959017211895173e-05, |
| "loss": 0.3329, |
| "mean_token_accuracy": 0.8999340653419494, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.2122647717862151, |
| "learning_rate": 4.9393509588046036e-05, |
| "loss": 0.3306, |
| "mean_token_accuracy": 0.9009318828582764, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 0.20764297035844217, |
| "learning_rate": 4.915904555637527e-05, |
| "loss": 0.33, |
| "mean_token_accuracy": 0.9004346609115601, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.7619047619047619, |
| "grad_norm": 0.20816077188373128, |
| "learning_rate": 4.8887184851326155e-05, |
| "loss": 0.3202, |
| "mean_token_accuracy": 0.9034851253032684, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.8095238095238095, |
| "grad_norm": 0.22148078953488418, |
| "learning_rate": 4.8578396869667095e-05, |
| "loss": 0.3252, |
| "mean_token_accuracy": 0.9017885386943817, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.8571428571428571, |
| "grad_norm": 0.21406123617550696, |
| "learning_rate": 4.823321476708417e-05, |
| "loss": 0.3207, |
| "mean_token_accuracy": 0.9030977070331574, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.9047619047619048, |
| "grad_norm": 0.20063414494959075, |
| "learning_rate": 4.7852234537630435e-05, |
| "loss": 0.3151, |
| "mean_token_accuracy": 0.9040939033031463, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.20157178980104304, |
| "learning_rate": 4.7436113984678024e-05, |
| "loss": 0.3133, |
| "mean_token_accuracy": 0.9044515311717987, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.21946571056086206, |
| "learning_rate": 4.6985571585149876e-05, |
| "loss": 0.3088, |
| "mean_token_accuracy": 0.9057298839092255, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.0476190476190477, |
| "grad_norm": 0.22243550466680892, |
| "learning_rate": 4.650138524899193e-05, |
| "loss": 0.2589, |
| "mean_token_accuracy": 0.9184723615646362, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.0952380952380953, |
| "grad_norm": 0.21782596690450573, |
| "learning_rate": 4.5984390976027946e-05, |
| "loss": 0.2467, |
| "mean_token_accuracy": 0.9217934429645538, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.1428571428571428, |
| "grad_norm": 0.20047942498513227, |
| "learning_rate": 4.5435481412515755e-05, |
| "loss": 0.2458, |
| "mean_token_accuracy": 0.9213689804077149, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.1904761904761905, |
| "grad_norm": 0.20323781517968, |
| "learning_rate": 4.4855604309897496e-05, |
| "loss": 0.2533, |
| "mean_token_accuracy": 0.9193067252635956, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.2380952380952381, |
| "grad_norm": 0.2089213813676996, |
| "learning_rate": 4.4245760888404665e-05, |
| "loss": 0.2451, |
| "mean_token_accuracy": 0.921397477388382, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.2857142857142856, |
| "grad_norm": 0.20871576941556397, |
| "learning_rate": 4.360700410834367e-05, |
| "loss": 0.2487, |
| "mean_token_accuracy": 0.9207526803016662, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.20056126493311283, |
| "learning_rate": 4.294043685204651e-05, |
| "loss": 0.2361, |
| "mean_token_accuracy": 0.924414473772049, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.380952380952381, |
| "grad_norm": 0.2115275835238231, |
| "learning_rate": 4.224721001962573e-05, |
| "loss": 0.2409, |
| "mean_token_accuracy": 0.9230122208595276, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.20541500240177765, |
| "learning_rate": 4.1528520541821506e-05, |
| "loss": 0.2398, |
| "mean_token_accuracy": 0.9232278406620026, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.4761904761904763, |
| "grad_norm": 0.20798560704125774, |
| "learning_rate": 4.078560931337187e-05, |
| "loss": 0.2471, |
| "mean_token_accuracy": 0.9215823113918304, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.5238095238095237, |
| "grad_norm": 0.1939929705798728, |
| "learning_rate": 4.001975905047442e-05, |
| "loss": 0.2401, |
| "mean_token_accuracy": 0.923470401763916, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.5714285714285714, |
| "grad_norm": 0.21344717934526472, |
| "learning_rate": 3.923229207603871e-05, |
| "loss": 0.2396, |
| "mean_token_accuracy": 0.9238272488117218, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.619047619047619, |
| "grad_norm": 0.2191275580698787, |
| "learning_rate": 3.842456803655342e-05, |
| "loss": 0.2406, |
| "mean_token_accuracy": 0.923265916109085, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.19421469398044988, |
| "learning_rate": 3.75979815545104e-05, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.924458909034729, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.7142857142857144, |
| "grad_norm": 0.19470949781451338, |
| "learning_rate": 3.6753959820438764e-05, |
| "loss": 0.2321, |
| "mean_token_accuracy": 0.9256431758403778, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.7619047619047619, |
| "grad_norm": 0.1876708972166408, |
| "learning_rate": 3.589396012870687e-05, |
| "loss": 0.2291, |
| "mean_token_accuracy": 0.9266167640686035, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.8095238095238095, |
| "grad_norm": 0.2147218447176946, |
| "learning_rate": 3.5019467361346724e-05, |
| "loss": 0.2329, |
| "mean_token_accuracy": 0.925212299823761, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.8571428571428572, |
| "grad_norm": 0.19962238819866412, |
| "learning_rate": 3.413199142424535e-05, |
| "loss": 0.2296, |
| "mean_token_accuracy": 0.9259015142917633, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 0.19475591470139603, |
| "learning_rate": 3.3233064640129734e-05, |
| "loss": 0.2277, |
| "mean_token_accuracy": 0.92720667719841, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.9523809523809523, |
| "grad_norm": 0.20861501443310124, |
| "learning_rate": 3.232423910284672e-05, |
| "loss": 0.2284, |
| "mean_token_accuracy": 0.9266561150550843, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.19761114102987562, |
| "learning_rate": 3.140708399750594e-05, |
| "loss": 0.2291, |
| "mean_token_accuracy": 0.9258988976478577, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.0476190476190474, |
| "grad_norm": 0.24523566793415144, |
| "learning_rate": 3.048318289111279e-05, |
| "loss": 0.1563, |
| "mean_token_accuracy": 0.9481804072856903, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.0952380952380953, |
| "grad_norm": 0.204067402376235, |
| "learning_rate": 2.955413099836959e-05, |
| "loss": 0.1542, |
| "mean_token_accuracy": 0.9486050844192505, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 0.19593572116438304, |
| "learning_rate": 2.8621532427365687e-05, |
| "loss": 0.1553, |
| "mean_token_accuracy": 0.9480521976947784, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.1904761904761907, |
| "grad_norm": 0.19800587405132705, |
| "learning_rate": 2.7686997409912192e-05, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.9476682603359222, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.238095238095238, |
| "grad_norm": 0.19599017566764934, |
| "learning_rate": 2.6752139521303403e-05, |
| "loss": 0.1525, |
| "mean_token_accuracy": 0.9489890992641449, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.2857142857142856, |
| "grad_norm": 0.1957327582988075, |
| "learning_rate": 2.5818572894305453e-05, |
| "loss": 0.1515, |
| "mean_token_accuracy": 0.9491177141666413, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 0.19017281050653162, |
| "learning_rate": 2.4887909432182316e-05, |
| "loss": 0.1512, |
| "mean_token_accuracy": 0.9494254469871521, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 0.19356138909333898, |
| "learning_rate": 2.3961756025571336e-05, |
| "loss": 0.1535, |
| "mean_token_accuracy": 0.9488085567951202, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.4285714285714284, |
| "grad_norm": 0.21519917588614698, |
| "learning_rate": 2.304171177801356e-05, |
| "loss": 0.1521, |
| "mean_token_accuracy": 0.9488667666912078, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.4761904761904763, |
| "grad_norm": 0.18356462457323924, |
| "learning_rate": 2.2129365244929402e-05, |
| "loss": 0.1503, |
| "mean_token_accuracy": 0.9497200846672058, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.5238095238095237, |
| "grad_norm": 0.19727737940368512, |
| "learning_rate": 2.1226291690806715e-05, |
| "loss": 0.1502, |
| "mean_token_accuracy": 0.9498027801513672, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.571428571428571, |
| "grad_norm": 0.18491161543383636, |
| "learning_rate": 2.0334050369337104e-05, |
| "loss": 0.1493, |
| "mean_token_accuracy": 0.9500907003879547, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.619047619047619, |
| "grad_norm": 0.180689936763116, |
| "learning_rate": 1.945418183119656e-05, |
| "loss": 0.1461, |
| "mean_token_accuracy": 0.9510073781013488, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 0.18357579016167094, |
| "learning_rate": 1.8588205264118974e-05, |
| "loss": 0.1522, |
| "mean_token_accuracy": 0.9489753544330597, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.7142857142857144, |
| "grad_norm": 0.17455870908089519, |
| "learning_rate": 1.7737615869854944e-05, |
| "loss": 0.1476, |
| "mean_token_accuracy": 0.9504713833332061, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.761904761904762, |
| "grad_norm": 0.1895734172134673, |
| "learning_rate": 1.6903882282545055e-05, |
| "loss": 0.1471, |
| "mean_token_accuracy": 0.9508952736854553, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.8095238095238093, |
| "grad_norm": 0.1814138039625087, |
| "learning_rate": 1.6088444032964923e-05, |
| "loss": 0.1471, |
| "mean_token_accuracy": 0.9506540060043335, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.18505343239566618, |
| "learning_rate": 1.5292709063020415e-05, |
| "loss": 0.1453, |
| "mean_token_accuracy": 0.9513262331485748, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.9047619047619047, |
| "grad_norm": 0.1818729025239862, |
| "learning_rate": 1.4518051294784384e-05, |
| "loss": 0.1427, |
| "mean_token_accuracy": 0.9522208392620086, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.9523809523809526, |
| "grad_norm": 0.18282477277115136, |
| "learning_rate": 1.3765808258272334e-05, |
| "loss": 0.1457, |
| "mean_token_accuracy": 0.9513007164001465, |
| "step": 310 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.17250893276801252, |
| "learning_rate": 1.3037278782052863e-05, |
| "loss": 0.1448, |
| "mean_token_accuracy": 0.9508862257003784, |
| "step": 315 |
| }, |
| { |
| "epoch": 3.0476190476190474, |
| "grad_norm": 0.21843020721408246, |
| "learning_rate": 1.2333720750680403e-05, |
| "loss": 0.0972, |
| "mean_token_accuracy": 0.9678010582923889, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.0952380952380953, |
| "grad_norm": 0.18713956774840773, |
| "learning_rate": 1.1656348932822086e-05, |
| "loss": 0.0951, |
| "mean_token_accuracy": 0.9680294811725616, |
| "step": 325 |
| }, |
| { |
| "epoch": 3.142857142857143, |
| "grad_norm": 0.17099224072656777, |
| "learning_rate": 1.1006332883828913e-05, |
| "loss": 0.0928, |
| "mean_token_accuracy": 0.9688285231590271, |
| "step": 330 |
| }, |
| { |
| "epoch": 3.1904761904761907, |
| "grad_norm": 0.1889152084893897, |
| "learning_rate": 1.0384794926372563e-05, |
| "loss": 0.0945, |
| "mean_token_accuracy": 0.9682107090950012, |
| "step": 335 |
| }, |
| { |
| "epoch": 3.238095238095238, |
| "grad_norm": 0.16909157911032757, |
| "learning_rate": 9.792808212634502e-06, |
| "loss": 0.0911, |
| "mean_token_accuracy": 0.9692863464355469, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.2857142857142856, |
| "grad_norm": 0.16462195486055214, |
| "learning_rate": 9.231394871393228e-06, |
| "loss": 0.0926, |
| "mean_token_accuracy": 0.9688502609729767, |
| "step": 345 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.16694354195202218, |
| "learning_rate": 8.701524243208935e-06, |
| "loss": 0.0941, |
| "mean_token_accuracy": 0.968300586938858, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.380952380952381, |
| "grad_norm": 0.1692222626696623, |
| "learning_rate": 8.204111206752663e-06, |
| "loss": 0.0918, |
| "mean_token_accuracy": 0.9689784705638885, |
| "step": 355 |
| }, |
| { |
| "epoch": 3.4285714285714284, |
| "grad_norm": 0.17221192973282243, |
| "learning_rate": 7.740014599169857e-06, |
| "loss": 0.0904, |
| "mean_token_accuracy": 0.9694355607032776, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.4761904761904763, |
| "grad_norm": 0.1741810830817914, |
| "learning_rate": 7.3100357332055624e-06, |
| "loss": 0.0896, |
| "mean_token_accuracy": 0.9696675717830658, |
| "step": 365 |
| }, |
| { |
| "epoch": 3.5238095238095237, |
| "grad_norm": 0.17501221405018258, |
| "learning_rate": 6.914917013651723e-06, |
| "loss": 0.092, |
| "mean_token_accuracy": 0.968971711397171, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 0.16570291902290668, |
| "learning_rate": 6.555340655505407e-06, |
| "loss": 0.0917, |
| "mean_token_accuracy": 0.9693442165851593, |
| "step": 375 |
| }, |
| { |
| "epoch": 3.619047619047619, |
| "grad_norm": 0.15923829992676775, |
| "learning_rate": 6.231927506051192e-06, |
| "loss": 0.0915, |
| "mean_token_accuracy": 0.969234949350357, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.6666666666666665, |
| "grad_norm": 0.17338149522120277, |
| "learning_rate": 5.9452359729015004e-06, |
| "loss": 0.0907, |
| "mean_token_accuracy": 0.9692964613437652, |
| "step": 385 |
| }, |
| { |
| "epoch": 3.7142857142857144, |
| "grad_norm": 0.17158993366626976, |
| "learning_rate": 5.695761059845749e-06, |
| "loss": 0.0902, |
| "mean_token_accuracy": 0.9696565389633178, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.761904761904762, |
| "grad_norm": 0.17688545350709856, |
| "learning_rate": 5.483933512173022e-06, |
| "loss": 0.0918, |
| "mean_token_accuracy": 0.9692005813121796, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.8095238095238093, |
| "grad_norm": 0.1621556558547238, |
| "learning_rate": 5.310119072943991e-06, |
| "loss": 0.0892, |
| "mean_token_accuracy": 0.9700611054897308, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.857142857142857, |
| "grad_norm": 0.16839822104636895, |
| "learning_rate": 5.174617851496128e-06, |
| "loss": 0.0904, |
| "mean_token_accuracy": 0.9694457828998566, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.9047619047619047, |
| "grad_norm": 0.16305557552247782, |
| "learning_rate": 5.077663805272652e-06, |
| "loss": 0.0902, |
| "mean_token_accuracy": 0.9695923388004303, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.9523809523809526, |
| "grad_norm": 0.16067720695857812, |
| "learning_rate": 5.019424335869808e-06, |
| "loss": 0.0903, |
| "mean_token_accuracy": 0.9695243299007416, |
| "step": 415 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.1603132269708069, |
| "learning_rate": 5e-06, |
| "loss": 0.0903, |
| "mean_token_accuracy": 0.9692174196243286, |
| "step": 420 |
| }, |
| { |
| "epoch": 4.0, |
| "step": 420, |
| "total_flos": 216761967837184.0, |
| "train_loss": 0.20915592369579133, |
| "train_runtime": 2440.0132, |
| "train_samples_per_second": 2.749, |
| "train_steps_per_second": 0.172 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 420, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 216761967837184.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|